2 * Swap block device support for MTDs
3 * Turns an MTD device into a swap device with block wear leveling
5 * Copyright © 2007,2011 Nokia Corporation. All rights reserved.
7 * Authors: Jarkko Lavinen <jarkko.lavinen@nokia.com>
9 * Based on Richard Purdie's earlier implementation in 2007. Background
10 * support and lock-less operation written by Adrian Hunter.
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * version 2 as published by the Free Software Foundation.
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 #include <linux/kernel.h>
28 #include <linux/module.h>
29 #include <linux/mtd/mtd.h>
30 #include <linux/mtd/blktrans.h>
31 #include <linux/rbtree.h>
32 #include <linux/sched.h>
33 #include <linux/slab.h>
34 #include <linux/vmalloc.h>
35 #include <linux/genhd.h>
36 #include <linux/swap.h>
37 #include <linux/debugfs.h>
38 #include <linux/seq_file.h>
39 #include <linux/device.h>
40 #include <linux/math64.h>
42 #define MTDSWAP_PREFIX "mtdswap"
45 * The number of free eraseblocks when GC should stop
47 #define CLEAN_BLOCK_THRESHOLD 20
50 * Number of free eraseblocks below which GC can also collect low frag
53 #define LOW_FRAG_GC_THRESHOLD 5
56 * Wear level cost amortization. We want to do wear leveling in the background
57 * without disturbing GC too much. This is done by defining a max GC frequency.
58 * Frequency value 6 means 1/6 of the GC passes will pick an erase block based
59 * on the biggest wear difference rather than the biggest dirtiness.
61 * The lower freq2 should be chosen so that it makes sure the maximum erase
62 * difference will decrease even if a malicious application is deliberately
63 * trying to make erase differences large.
65 #define MAX_ERASE_DIFF 4000
66 #define COLLECT_NONDIRTY_BASE MAX_ERASE_DIFF
67 #define COLLECT_NONDIRTY_FREQ1 6
68 #define COLLECT_NONDIRTY_FREQ2 4
70 #define PAGE_UNDEF UINT_MAX
71 #define BLOCK_UNDEF UINT_MAX
72 #define BLOCK_ERROR (UINT_MAX - 1)
73 #define BLOCK_MAX (UINT_MAX - 2)
75 #define EBLOCK_BAD (1 << 0)
76 #define EBLOCK_NOMAGIC (1 << 1)
77 #define EBLOCK_BITFLIP (1 << 2)
78 #define EBLOCK_FAILED (1 << 3)
79 #define EBLOCK_READERR (1 << 4)
80 #define EBLOCK_IDX_SHIFT 5
87 unsigned int active_count
;
88 unsigned int erase_count
;
89 unsigned int pad
; /* speeds up pointer decrement */
92 #define MTDSWAP_ECNT_MIN(rbroot) (rb_entry(rb_first(rbroot), struct swap_eb, \
94 #define MTDSWAP_ECNT_MAX(rbroot) (rb_entry(rb_last(rbroot), struct swap_eb, \
114 struct mtd_blktrans_dev
*mbd_dev
;
115 struct mtd_info
*mtd
;
118 unsigned int *page_data
;
119 unsigned int *revmap
;
122 unsigned int spare_eblks
;
123 unsigned int pages_per_eblk
;
124 unsigned int max_erase_count
;
125 struct swap_eb
*eb_data
;
127 struct mtdswap_tree trees
[MTDSWAP_TREE_CNT
];
129 unsigned long long sect_read_count
;
130 unsigned long long sect_write_count
;
131 unsigned long long mtd_write_count
;
132 unsigned long long mtd_read_count
;
133 unsigned long long discard_count
;
134 unsigned long long discard_page_count
;
136 unsigned int curr_write_pos
;
137 struct swap_eb
*curr_write
;
143 struct mtdswap_oobdata
{
148 #define MTDSWAP_MAGIC_CLEAN 0x2095
149 #define MTDSWAP_MAGIC_DIRTY (MTDSWAP_MAGIC_CLEAN + 1)
150 #define MTDSWAP_TYPE_CLEAN 0
151 #define MTDSWAP_TYPE_DIRTY 1
152 #define MTDSWAP_OOBSIZE sizeof(struct mtdswap_oobdata)
154 #define MTDSWAP_ERASE_RETRIES 3 /* Before marking erase block bad */
155 #define MTDSWAP_IO_RETRIES 3
158 MTDSWAP_SCANNED_CLEAN
,
159 MTDSWAP_SCANNED_DIRTY
,
160 MTDSWAP_SCANNED_BITFLIP
,
165 * In the worst case mtdswap_writesect() has allocated the last clean
166 * page from the current block and is then pre-empted by the GC
167 * thread. The thread can consume a full erase block when moving a
170 #define MIN_SPARE_EBLOCKS 2
171 #define MIN_ERASE_BLOCKS (MIN_SPARE_EBLOCKS + 1)
173 #define TREE_ROOT(d, name) (&d->trees[MTDSWAP_ ## name].root)
174 #define TREE_EMPTY(d, name) (TREE_ROOT(d, name)->rb_node == NULL)
175 #define TREE_NONEMPTY(d, name) (!TREE_EMPTY(d, name))
176 #define TREE_COUNT(d, name) (d->trees[MTDSWAP_ ## name].count)
178 #define MTDSWAP_MBD_TO_MTDSWAP(dev) ((struct mtdswap_dev *)dev->priv)
180 static char partitions
[128] = "";
181 module_param_string(partitions
, partitions
, sizeof(partitions
), 0444);
182 MODULE_PARM_DESC(partitions
, "MTD partition numbers to use as swap "
183 "partitions=\"1,3,5\"");
185 static unsigned int spare_eblocks
= 10;
186 module_param(spare_eblocks
, uint
, 0444);
187 MODULE_PARM_DESC(spare_eblocks
, "Percentage of spare erase blocks for "
188 "garbage collection (default 10%)");
190 static bool header
; /* false */
191 module_param(header
, bool, 0444);
192 MODULE_PARM_DESC(header
,
193 "Include builtin swap header (default 0, without header)");
195 static int mtdswap_gc(struct mtdswap_dev
*d
, unsigned int background
);
197 static loff_t
mtdswap_eb_offset(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
199 return (loff_t
)(eb
- d
->eb_data
) * d
->mtd
->erasesize
;
202 static void mtdswap_eb_detach(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
205 struct mtdswap_tree
*tp
;
208 tp
= container_of(eb
->root
, struct mtdswap_tree
, root
);
209 oldidx
= tp
- &d
->trees
[0];
211 d
->trees
[oldidx
].count
--;
212 rb_erase(&eb
->rb
, eb
->root
);
216 static void __mtdswap_rb_add(struct rb_root
*root
, struct swap_eb
*eb
)
218 struct rb_node
**p
, *parent
= NULL
;
224 cur
= rb_entry(parent
, struct swap_eb
, rb
);
225 if (eb
->erase_count
> cur
->erase_count
)
231 rb_link_node(&eb
->rb
, parent
, p
);
232 rb_insert_color(&eb
->rb
, root
);
235 static void mtdswap_rb_add(struct mtdswap_dev
*d
, struct swap_eb
*eb
, int idx
)
237 struct rb_root
*root
;
239 if (eb
->root
== &d
->trees
[idx
].root
)
242 mtdswap_eb_detach(d
, eb
);
243 root
= &d
->trees
[idx
].root
;
244 __mtdswap_rb_add(root
, eb
);
246 d
->trees
[idx
].count
++;
249 static struct rb_node
*mtdswap_rb_index(struct rb_root
*root
, unsigned int idx
)
256 while (i
< idx
&& p
) {
264 static int mtdswap_handle_badblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
270 eb
->flags
|= EBLOCK_BAD
;
271 mtdswap_eb_detach(d
, eb
);
274 /* badblocks not supported */
275 if (!mtd_can_have_bb(d
->mtd
))
278 offset
= mtdswap_eb_offset(d
, eb
);
279 dev_warn(d
->dev
, "Marking bad block at %08llx\n", offset
);
280 ret
= mtd_block_markbad(d
->mtd
, offset
);
283 dev_warn(d
->dev
, "Mark block bad failed for block at %08llx "
284 "error %d\n", offset
, ret
);
292 static int mtdswap_handle_write_error(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
294 unsigned int marked
= eb
->flags
& EBLOCK_FAILED
;
295 struct swap_eb
*curr_write
= d
->curr_write
;
297 eb
->flags
|= EBLOCK_FAILED
;
298 if (curr_write
== eb
) {
299 d
->curr_write
= NULL
;
301 if (!marked
&& d
->curr_write_pos
!= 0) {
302 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
307 return mtdswap_handle_badblock(d
, eb
);
310 static int mtdswap_read_oob(struct mtdswap_dev
*d
, loff_t from
,
311 struct mtd_oob_ops
*ops
)
313 int ret
= mtd_read_oob(d
->mtd
, from
, ops
);
315 if (mtd_is_bitflip(ret
))
319 dev_warn(d
->dev
, "Read OOB failed %d for block at %08llx\n",
324 if (ops
->oobretlen
< ops
->ooblen
) {
325 dev_warn(d
->dev
, "Read OOB return short read (%zd bytes not "
326 "%zd) for block at %08llx\n",
327 ops
->oobretlen
, ops
->ooblen
, from
);
334 static int mtdswap_read_markers(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
336 struct mtdswap_oobdata
*data
, *data2
;
339 struct mtd_oob_ops ops
;
341 offset
= mtdswap_eb_offset(d
, eb
);
343 /* Check first if the block is bad. */
344 if (mtd_can_have_bb(d
->mtd
) && mtd_block_isbad(d
->mtd
, offset
))
345 return MTDSWAP_SCANNED_BAD
;
347 ops
.ooblen
= 2 * d
->mtd
->oobavail
;
348 ops
.oobbuf
= d
->oob_buf
;
351 ops
.mode
= MTD_OPS_AUTO_OOB
;
353 ret
= mtdswap_read_oob(d
, offset
, &ops
);
355 if (ret
&& !mtd_is_bitflip(ret
))
358 data
= (struct mtdswap_oobdata
*)d
->oob_buf
;
359 data2
= (struct mtdswap_oobdata
*)
360 (d
->oob_buf
+ d
->mtd
->oobavail
);
362 if (le16_to_cpu(data
->magic
) == MTDSWAP_MAGIC_CLEAN
) {
363 eb
->erase_count
= le32_to_cpu(data
->count
);
364 if (mtd_is_bitflip(ret
))
365 ret
= MTDSWAP_SCANNED_BITFLIP
;
367 if (le16_to_cpu(data2
->magic
) == MTDSWAP_MAGIC_DIRTY
)
368 ret
= MTDSWAP_SCANNED_DIRTY
;
370 ret
= MTDSWAP_SCANNED_CLEAN
;
373 eb
->flags
|= EBLOCK_NOMAGIC
;
374 ret
= MTDSWAP_SCANNED_DIRTY
;
380 static int mtdswap_write_marker(struct mtdswap_dev
*d
, struct swap_eb
*eb
,
383 struct mtdswap_oobdata n
;
386 struct mtd_oob_ops ops
;
389 ops
.oobbuf
= (uint8_t *)&n
;
390 ops
.mode
= MTD_OPS_AUTO_OOB
;
393 if (marker
== MTDSWAP_TYPE_CLEAN
) {
394 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_CLEAN
);
395 n
.count
= cpu_to_le32(eb
->erase_count
);
396 ops
.ooblen
= MTDSWAP_OOBSIZE
;
397 offset
= mtdswap_eb_offset(d
, eb
);
399 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_DIRTY
);
400 ops
.ooblen
= sizeof(n
.magic
);
401 offset
= mtdswap_eb_offset(d
, eb
) + d
->mtd
->writesize
;
404 ret
= mtd_write_oob(d
->mtd
, offset
, &ops
);
407 dev_warn(d
->dev
, "Write OOB failed for block at %08llx "
408 "error %d\n", offset
, ret
);
409 if (ret
== -EIO
|| mtd_is_eccerr(ret
))
410 mtdswap_handle_write_error(d
, eb
);
414 if (ops
.oobretlen
!= ops
.ooblen
) {
415 dev_warn(d
->dev
, "Short OOB write for block at %08llx: "
417 offset
, ops
.oobretlen
, ops
.ooblen
);
425 * Are there any erase blocks without MAGIC_CLEAN header, presumably
426 * because power was cut off after erase but before header write? We
427 * need to guesstimate the erase count.
429 static void mtdswap_check_counts(struct mtdswap_dev
*d
)
431 struct rb_root hist_root
= RB_ROOT
;
432 struct rb_node
*medrb
;
434 unsigned int i
, cnt
, median
;
437 for (i
= 0; i
< d
->eblks
; i
++) {
440 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
443 __mtdswap_rb_add(&hist_root
, eb
);
450 medrb
= mtdswap_rb_index(&hist_root
, cnt
/ 2);
451 median
= rb_entry(medrb
, struct swap_eb
, rb
)->erase_count
;
453 d
->max_erase_count
= MTDSWAP_ECNT_MAX(&hist_root
);
455 for (i
= 0; i
< d
->eblks
; i
++) {
458 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_READERR
))
459 eb
->erase_count
= median
;
461 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
464 rb_erase(&eb
->rb
, &hist_root
);
468 static void mtdswap_scan_eblks(struct mtdswap_dev
*d
)
474 for (i
= 0; i
< d
->eblks
; i
++) {
477 status
= mtdswap_read_markers(d
, eb
);
479 eb
->flags
|= EBLOCK_READERR
;
480 else if (status
== MTDSWAP_SCANNED_BAD
) {
481 eb
->flags
|= EBLOCK_BAD
;
486 case MTDSWAP_SCANNED_CLEAN
:
489 case MTDSWAP_SCANNED_DIRTY
:
490 case MTDSWAP_SCANNED_BITFLIP
:
494 idx
= MTDSWAP_FAILING
;
497 eb
->flags
|= (idx
<< EBLOCK_IDX_SHIFT
);
500 mtdswap_check_counts(d
);
502 for (i
= 0; i
< d
->eblks
; i
++) {
505 if (eb
->flags
& EBLOCK_BAD
)
508 idx
= eb
->flags
>> EBLOCK_IDX_SHIFT
;
509 mtdswap_rb_add(d
, eb
, idx
);
514 * Place eblk into a tree corresponding to its number of active blocks
517 static void mtdswap_store_eb(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
519 unsigned int weight
= eb
->active_count
;
520 unsigned int maxweight
= d
->pages_per_eblk
;
522 if (eb
== d
->curr_write
)
525 if (eb
->flags
& EBLOCK_BITFLIP
)
526 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
527 else if (eb
->flags
& (EBLOCK_READERR
| EBLOCK_FAILED
))
528 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
529 if (weight
== maxweight
)
530 mtdswap_rb_add(d
, eb
, MTDSWAP_USED
);
531 else if (weight
== 0)
532 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
533 else if (weight
> (maxweight
/2))
534 mtdswap_rb_add(d
, eb
, MTDSWAP_LOWFRAG
);
536 mtdswap_rb_add(d
, eb
, MTDSWAP_HIFRAG
);
540 static void mtdswap_erase_callback(struct erase_info
*done
)
542 wait_queue_head_t
*wait_q
= (wait_queue_head_t
*)done
->priv
;
546 static int mtdswap_erase_block(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
548 struct mtd_info
*mtd
= d
->mtd
;
549 struct erase_info erase
;
550 wait_queue_head_t wq
;
551 unsigned int retries
= 0;
555 if (eb
->erase_count
> d
->max_erase_count
)
556 d
->max_erase_count
= eb
->erase_count
;
559 init_waitqueue_head(&wq
);
560 memset(&erase
, 0, sizeof(struct erase_info
));
563 erase
.callback
= mtdswap_erase_callback
;
564 erase
.addr
= mtdswap_eb_offset(d
, eb
);
565 erase
.len
= mtd
->erasesize
;
566 erase
.priv
= (u_long
)&wq
;
568 ret
= mtd_erase(mtd
, &erase
);
570 if (retries
++ < MTDSWAP_ERASE_RETRIES
) {
572 "erase of erase block %#llx on %s failed",
573 erase
.addr
, mtd
->name
);
578 dev_err(d
->dev
, "Cannot erase erase block %#llx on %s\n",
579 erase
.addr
, mtd
->name
);
581 mtdswap_handle_badblock(d
, eb
);
585 ret
= wait_event_interruptible(wq
, erase
.state
== MTD_ERASE_DONE
||
586 erase
.state
== MTD_ERASE_FAILED
);
588 dev_err(d
->dev
, "Interrupted erase block %#llx erasure on %s\n",
589 erase
.addr
, mtd
->name
);
593 if (erase
.state
== MTD_ERASE_FAILED
) {
594 if (retries
++ < MTDSWAP_ERASE_RETRIES
) {
596 "erase of erase block %#llx on %s failed",
597 erase
.addr
, mtd
->name
);
602 mtdswap_handle_badblock(d
, eb
);
609 static int mtdswap_map_free_block(struct mtdswap_dev
*d
, unsigned int page
,
613 struct swap_eb
*old_eb
= d
->curr_write
;
614 struct rb_root
*clean_root
;
617 if (old_eb
== NULL
|| d
->curr_write_pos
>= d
->pages_per_eblk
) {
619 if (TREE_EMPTY(d
, CLEAN
))
622 clean_root
= TREE_ROOT(d
, CLEAN
);
623 eb
= rb_entry(rb_first(clean_root
), struct swap_eb
, rb
);
624 rb_erase(&eb
->rb
, clean_root
);
626 TREE_COUNT(d
, CLEAN
)--;
628 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_DIRTY
);
629 } while (ret
== -EIO
|| mtd_is_eccerr(ret
));
634 d
->curr_write_pos
= 0;
637 mtdswap_store_eb(d
, old_eb
);
640 *block
= (d
->curr_write
- d
->eb_data
) * d
->pages_per_eblk
+
643 d
->curr_write
->active_count
++;
644 d
->revmap
[*block
] = page
;
650 static unsigned int mtdswap_free_page_cnt(struct mtdswap_dev
*d
)
652 return TREE_COUNT(d
, CLEAN
) * d
->pages_per_eblk
+
653 d
->pages_per_eblk
- d
->curr_write_pos
;
656 static unsigned int mtdswap_enough_free_pages(struct mtdswap_dev
*d
)
658 return mtdswap_free_page_cnt(d
) > d
->pages_per_eblk
;
661 static int mtdswap_write_block(struct mtdswap_dev
*d
, char *buf
,
662 unsigned int page
, unsigned int *bp
, int gc_context
)
664 struct mtd_info
*mtd
= d
->mtd
;
672 while (!mtdswap_enough_free_pages(d
))
673 if (mtdswap_gc(d
, 0) > 0)
676 ret
= mtdswap_map_free_block(d
, page
, bp
);
677 eb
= d
->eb_data
+ (*bp
/ d
->pages_per_eblk
);
679 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
680 d
->curr_write
= NULL
;
682 d
->revmap
[*bp
] = PAGE_UNDEF
;
689 writepos
= (loff_t
)*bp
<< PAGE_SHIFT
;
690 ret
= mtd_write(mtd
, writepos
, PAGE_SIZE
, &retlen
, buf
);
691 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
694 d
->revmap
[*bp
] = PAGE_UNDEF
;
695 mtdswap_handle_write_error(d
, eb
);
700 dev_err(d
->dev
, "Write to MTD device failed: %d (%zd written)",
705 if (retlen
!= PAGE_SIZE
) {
706 dev_err(d
->dev
, "Short write to MTD device: %zd written",
717 d
->revmap
[*bp
] = PAGE_UNDEF
;
722 static int mtdswap_move_block(struct mtdswap_dev
*d
, unsigned int oldblock
,
723 unsigned int *newblock
)
725 struct mtd_info
*mtd
= d
->mtd
;
726 struct swap_eb
*eb
, *oldeb
;
729 unsigned int page
, retries
;
732 page
= d
->revmap
[oldblock
];
733 readpos
= (loff_t
) oldblock
<< PAGE_SHIFT
;
737 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, d
->page_buf
);
739 if (ret
< 0 && !mtd_is_bitflip(ret
)) {
740 oldeb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
741 oldeb
->flags
|= EBLOCK_READERR
;
743 dev_err(d
->dev
, "Read Error: %d (block %u)\n", ret
,
746 if (retries
< MTDSWAP_IO_RETRIES
)
752 if (retlen
!= PAGE_SIZE
) {
753 dev_err(d
->dev
, "Short read: %zd (block %u)\n", retlen
,
759 ret
= mtdswap_write_block(d
, d
->page_buf
, page
, newblock
, 1);
761 d
->page_data
[page
] = BLOCK_ERROR
;
762 dev_err(d
->dev
, "Write error: %d\n", ret
);
766 eb
= d
->eb_data
+ *newblock
/ d
->pages_per_eblk
;
767 d
->page_data
[page
] = *newblock
;
768 d
->revmap
[oldblock
] = PAGE_UNDEF
;
769 eb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
775 d
->page_data
[page
] = BLOCK_ERROR
;
776 d
->revmap
[oldblock
] = PAGE_UNDEF
;
780 static int mtdswap_gc_eblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
782 unsigned int i
, block
, eblk_base
, newblock
;
786 eblk_base
= (eb
- d
->eb_data
) * d
->pages_per_eblk
;
788 for (i
= 0; i
< d
->pages_per_eblk
; i
++) {
789 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
792 block
= eblk_base
+ i
;
793 if (d
->revmap
[block
] == PAGE_UNDEF
)
796 ret
= mtdswap_move_block(d
, block
, &newblock
);
797 if (ret
< 0 && !errcode
)
804 static int __mtdswap_choose_gc_tree(struct mtdswap_dev
*d
)
808 if (TREE_COUNT(d
, CLEAN
) < LOW_FRAG_GC_THRESHOLD
)
809 stopat
= MTDSWAP_LOWFRAG
;
811 stopat
= MTDSWAP_HIFRAG
;
813 for (idx
= MTDSWAP_BITFLIP
; idx
>= stopat
; idx
--)
814 if (d
->trees
[idx
].root
.rb_node
!= NULL
)
820 static int mtdswap_wlfreq(unsigned int maxdiff
)
822 unsigned int h
, x
, y
, dist
, base
;
825 * Calculate linear ramp down from f1 to f2 when maxdiff goes from
826 * MAX_ERASE_DIFF to MAX_ERASE_DIFF + COLLECT_NONDIRTY_BASE. Similar
827 * to a triangle with height f1 - f2 and width COLLECT_NONDIRTY_BASE.
830 dist
= maxdiff
- MAX_ERASE_DIFF
;
831 if (dist
> COLLECT_NONDIRTY_BASE
)
832 dist
= COLLECT_NONDIRTY_BASE
;
835 * Modelling the slope as a right-angled triangle with base
836 * COLLECT_NONDIRTY_BASE and height freq1 - freq2. The ratio y/x is
837 * equal to the ratio h/base.
839 h
= COLLECT_NONDIRTY_FREQ1
- COLLECT_NONDIRTY_FREQ2
;
840 base
= COLLECT_NONDIRTY_BASE
;
843 y
= (x
* h
+ base
/ 2) / base
;
845 return COLLECT_NONDIRTY_FREQ2
+ y
;
848 static int mtdswap_choose_wl_tree(struct mtdswap_dev
*d
)
850 static unsigned int pick_cnt
;
851 unsigned int i
, idx
= -1, wear
, max
;
852 struct rb_root
*root
;
855 for (i
= 0; i
<= MTDSWAP_DIRTY
; i
++) {
856 root
= &d
->trees
[i
].root
;
857 if (root
->rb_node
== NULL
)
860 wear
= d
->max_erase_count
- MTDSWAP_ECNT_MIN(root
);
867 if (max
> MAX_ERASE_DIFF
&& pick_cnt
>= mtdswap_wlfreq(max
) - 1) {
876 static int mtdswap_choose_gc_tree(struct mtdswap_dev
*d
,
877 unsigned int background
)
881 if (TREE_NONEMPTY(d
, FAILING
) &&
882 (background
|| (TREE_EMPTY(d
, CLEAN
) && TREE_EMPTY(d
, DIRTY
))))
883 return MTDSWAP_FAILING
;
885 idx
= mtdswap_choose_wl_tree(d
);
886 if (idx
>= MTDSWAP_CLEAN
)
889 return __mtdswap_choose_gc_tree(d
);
892 static struct swap_eb
*mtdswap_pick_gc_eblk(struct mtdswap_dev
*d
,
893 unsigned int background
)
895 struct rb_root
*rp
= NULL
;
896 struct swap_eb
*eb
= NULL
;
899 if (background
&& TREE_COUNT(d
, CLEAN
) > CLEAN_BLOCK_THRESHOLD
&&
900 TREE_EMPTY(d
, DIRTY
) && TREE_EMPTY(d
, FAILING
))
903 idx
= mtdswap_choose_gc_tree(d
, background
);
907 rp
= &d
->trees
[idx
].root
;
908 eb
= rb_entry(rb_first(rp
), struct swap_eb
, rb
);
910 rb_erase(&eb
->rb
, rp
);
912 d
->trees
[idx
].count
--;
916 static unsigned int mtdswap_test_patt(unsigned int i
)
918 return i
% 2 ? 0x55555555 : 0xAAAAAAAA;
921 static unsigned int mtdswap_eblk_passes(struct mtdswap_dev
*d
,
924 struct mtd_info
*mtd
= d
->mtd
;
925 unsigned int test
, i
, j
, patt
, mtd_pages
;
927 unsigned int *p1
= (unsigned int *)d
->page_buf
;
928 unsigned char *p2
= (unsigned char *)d
->oob_buf
;
929 struct mtd_oob_ops ops
;
932 ops
.mode
= MTD_OPS_AUTO_OOB
;
933 ops
.len
= mtd
->writesize
;
934 ops
.ooblen
= mtd
->oobavail
;
936 ops
.datbuf
= d
->page_buf
;
937 ops
.oobbuf
= d
->oob_buf
;
938 base
= mtdswap_eb_offset(d
, eb
);
939 mtd_pages
= d
->pages_per_eblk
* PAGE_SIZE
/ mtd
->writesize
;
941 for (test
= 0; test
< 2; test
++) {
943 for (i
= 0; i
< mtd_pages
; i
++) {
944 patt
= mtdswap_test_patt(test
+ i
);
945 memset(d
->page_buf
, patt
, mtd
->writesize
);
946 memset(d
->oob_buf
, patt
, mtd
->oobavail
);
947 ret
= mtd_write_oob(mtd
, pos
, &ops
);
951 pos
+= mtd
->writesize
;
955 for (i
= 0; i
< mtd_pages
; i
++) {
956 ret
= mtd_read_oob(mtd
, pos
, &ops
);
960 patt
= mtdswap_test_patt(test
+ i
);
961 for (j
= 0; j
< mtd
->writesize
/sizeof(int); j
++)
965 for (j
= 0; j
< mtd
->oobavail
; j
++)
966 if (p2
[j
] != (unsigned char)patt
)
969 pos
+= mtd
->writesize
;
972 ret
= mtdswap_erase_block(d
, eb
);
977 eb
->flags
&= ~EBLOCK_READERR
;
981 mtdswap_handle_badblock(d
, eb
);
985 static int mtdswap_gc(struct mtdswap_dev
*d
, unsigned int background
)
990 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
993 eb
= mtdswap_pick_gc_eblk(d
, background
);
997 ret
= mtdswap_gc_eblock(d
, eb
);
1001 if (eb
->flags
& EBLOCK_FAILED
) {
1002 mtdswap_handle_badblock(d
, eb
);
1006 eb
->flags
&= ~EBLOCK_BITFLIP
;
1007 ret
= mtdswap_erase_block(d
, eb
);
1008 if ((eb
->flags
& EBLOCK_READERR
) &&
1009 (ret
|| !mtdswap_eblk_passes(d
, eb
)))
1013 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_CLEAN
);
1016 mtdswap_rb_add(d
, eb
, MTDSWAP_CLEAN
);
1017 else if (ret
!= -EIO
&& !mtd_is_eccerr(ret
))
1018 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
1023 static void mtdswap_background(struct mtd_blktrans_dev
*dev
)
1025 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1029 ret
= mtdswap_gc(d
, 1);
1030 if (ret
|| mtd_blktrans_cease_background(dev
))
1035 static void mtdswap_cleanup(struct mtdswap_dev
*d
)
1039 vfree(d
->page_data
);
1044 static int mtdswap_flush(struct mtd_blktrans_dev
*dev
)
1046 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1052 static unsigned int mtdswap_badblocks(struct mtd_info
*mtd
, uint64_t size
)
1055 unsigned int badcnt
;
1059 if (mtd_can_have_bb(mtd
))
1060 for (offset
= 0; offset
< size
; offset
+= mtd
->erasesize
)
1061 if (mtd_block_isbad(mtd
, offset
))
1067 static int mtdswap_writesect(struct mtd_blktrans_dev
*dev
,
1068 unsigned long page
, char *buf
)
1070 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1071 unsigned int newblock
, mapped
;
1075 d
->sect_write_count
++;
1077 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
1081 /* Ignore writes to the header page */
1082 if (unlikely(page
== 0))
1088 mapped
= d
->page_data
[page
];
1089 if (mapped
<= BLOCK_MAX
) {
1090 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1092 mtdswap_store_eb(d
, eb
);
1093 d
->page_data
[page
] = BLOCK_UNDEF
;
1094 d
->revmap
[mapped
] = PAGE_UNDEF
;
1097 ret
= mtdswap_write_block(d
, buf
, page
, &newblock
, 0);
1098 d
->mtd_write_count
++;
1103 eb
= d
->eb_data
+ (newblock
/ d
->pages_per_eblk
);
1104 d
->page_data
[page
] = newblock
;
1109 /* Provide a dummy swap header for the kernel */
1110 static int mtdswap_auto_header(struct mtdswap_dev
*d
, char *buf
)
1112 union swap_header
*hd
= (union swap_header
*)(buf
);
1114 memset(buf
, 0, PAGE_SIZE
- 10);
1116 hd
->info
.version
= 1;
1117 hd
->info
.last_page
= d
->mbd_dev
->size
- 1;
1118 hd
->info
.nr_badpages
= 0;
1120 memcpy(buf
+ PAGE_SIZE
- 10, "SWAPSPACE2", 10);
1125 static int mtdswap_readsect(struct mtd_blktrans_dev
*dev
,
1126 unsigned long page
, char *buf
)
1128 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1129 struct mtd_info
*mtd
= d
->mtd
;
1130 unsigned int realblock
, retries
;
1136 d
->sect_read_count
++;
1139 if (unlikely(page
== 0))
1140 return mtdswap_auto_header(d
, buf
);
1145 realblock
= d
->page_data
[page
];
1146 if (realblock
> BLOCK_MAX
) {
1147 memset(buf
, 0x0, PAGE_SIZE
);
1148 if (realblock
== BLOCK_UNDEF
)
1154 eb
= d
->eb_data
+ (realblock
/ d
->pages_per_eblk
);
1155 BUG_ON(d
->revmap
[realblock
] == PAGE_UNDEF
);
1157 readpos
= (loff_t
)realblock
<< PAGE_SHIFT
;
1161 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, buf
);
1163 d
->mtd_read_count
++;
1164 if (mtd_is_bitflip(ret
)) {
1165 eb
->flags
|= EBLOCK_BITFLIP
;
1166 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
1171 dev_err(d
->dev
, "Read error %d\n", ret
);
1172 eb
->flags
|= EBLOCK_READERR
;
1173 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
1175 if (retries
< MTDSWAP_IO_RETRIES
)
1181 if (retlen
!= PAGE_SIZE
) {
1182 dev_err(d
->dev
, "Short read %zd\n", retlen
);
1189 static int mtdswap_discard(struct mtd_blktrans_dev
*dev
, unsigned long first
,
1192 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1195 unsigned int mapped
;
1199 for (page
= first
; page
< first
+ nr_pages
; page
++) {
1200 mapped
= d
->page_data
[page
];
1201 if (mapped
<= BLOCK_MAX
) {
1202 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1204 mtdswap_store_eb(d
, eb
);
1205 d
->page_data
[page
] = BLOCK_UNDEF
;
1206 d
->revmap
[mapped
] = PAGE_UNDEF
;
1207 d
->discard_page_count
++;
1208 } else if (mapped
== BLOCK_ERROR
) {
1209 d
->page_data
[page
] = BLOCK_UNDEF
;
1210 d
->discard_page_count
++;
1217 static int mtdswap_show(struct seq_file
*s
, void *data
)
1219 struct mtdswap_dev
*d
= (struct mtdswap_dev
*) s
->private;
1221 unsigned int count
[MTDSWAP_TREE_CNT
];
1222 unsigned int min
[MTDSWAP_TREE_CNT
];
1223 unsigned int max
[MTDSWAP_TREE_CNT
];
1224 unsigned int i
, cw
= 0, cwp
= 0, cwecount
= 0, bb_cnt
, mapped
, pages
;
1226 static const char * const name
[] = {
1227 "clean", "used", "low", "high", "dirty", "bitflip", "failing"
1230 mutex_lock(&d
->mbd_dev
->lock
);
1232 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1233 struct rb_root
*root
= &d
->trees
[i
].root
;
1235 if (root
->rb_node
) {
1236 count
[i
] = d
->trees
[i
].count
;
1237 min
[i
] = MTDSWAP_ECNT_MIN(root
);
1238 max
[i
] = MTDSWAP_ECNT_MAX(root
);
1243 if (d
->curr_write
) {
1245 cwp
= d
->curr_write_pos
;
1246 cwecount
= d
->curr_write
->erase_count
;
1250 for (i
= 0; i
< d
->eblks
; i
++)
1251 sum
+= d
->eb_data
[i
].erase_count
;
1253 use_size
= (uint64_t)d
->eblks
* d
->mtd
->erasesize
;
1254 bb_cnt
= mtdswap_badblocks(d
->mtd
, use_size
);
1257 pages
= d
->mbd_dev
->size
;
1258 for (i
= 0; i
< pages
; i
++)
1259 if (d
->page_data
[i
] != BLOCK_UNDEF
)
1262 mutex_unlock(&d
->mbd_dev
->lock
);
1264 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1268 if (min
[i
] != max
[i
])
1269 seq_printf(s
, "%s:\t%5d erase blocks, erased min %d, "
1271 name
[i
], count
[i
], min
[i
], max
[i
]);
1273 seq_printf(s
, "%s:\t%5d erase blocks, all erased %d "
1274 "times\n", name
[i
], count
[i
], min
[i
]);
1278 seq_printf(s
, "bad:\t%5u erase blocks\n", bb_cnt
);
1281 seq_printf(s
, "current erase block: %u pages used, %u free, "
1282 "erased %u times\n",
1283 cwp
, d
->pages_per_eblk
- cwp
, cwecount
);
1285 seq_printf(s
, "total erasures: %lu\n", sum
);
1289 seq_printf(s
, "mtdswap_readsect count: %llu\n", d
->sect_read_count
);
1290 seq_printf(s
, "mtdswap_writesect count: %llu\n", d
->sect_write_count
);
1291 seq_printf(s
, "mtdswap_discard count: %llu\n", d
->discard_count
);
1292 seq_printf(s
, "mtd read count: %llu\n", d
->mtd_read_count
);
1293 seq_printf(s
, "mtd write count: %llu\n", d
->mtd_write_count
);
1294 seq_printf(s
, "discarded pages count: %llu\n", d
->discard_page_count
);
1297 seq_printf(s
, "total pages: %u\n", pages
);
1298 seq_printf(s
, "pages mapped: %u\n", mapped
);
1303 static int mtdswap_open(struct inode
*inode
, struct file
*file
)
1305 return single_open(file
, mtdswap_show
, inode
->i_private
);
1308 static const struct file_operations mtdswap_fops
= {
1309 .open
= mtdswap_open
,
1311 .llseek
= seq_lseek
,
1312 .release
= single_release
,
1315 static int mtdswap_add_debugfs(struct mtdswap_dev
*d
)
1317 struct dentry
*root
= d
->mtd
->dbg
.dfs_dir
;
1318 struct dentry
*dent
;
1320 if (!IS_ENABLED(CONFIG_DEBUG_FS
))
1323 if (IS_ERR_OR_NULL(root
))
1326 dent
= debugfs_create_file("mtdswap_stats", S_IRUSR
, root
, d
,
1329 dev_err(d
->dev
, "debugfs_create_file failed\n");
1336 static int mtdswap_init(struct mtdswap_dev
*d
, unsigned int eblocks
,
1337 unsigned int spare_cnt
)
1339 struct mtd_info
*mtd
= d
->mbd_dev
->mtd
;
1340 unsigned int i
, eblk_bytes
, pages
, blocks
;
1345 d
->spare_eblks
= spare_cnt
;
1346 d
->pages_per_eblk
= mtd
->erasesize
>> PAGE_SHIFT
;
1348 pages
= d
->mbd_dev
->size
;
1349 blocks
= eblocks
* d
->pages_per_eblk
;
1351 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++)
1352 d
->trees
[i
].root
= RB_ROOT
;
1354 d
->page_data
= vmalloc(sizeof(int)*pages
);
1356 goto page_data_fail
;
1358 d
->revmap
= vmalloc(sizeof(int)*blocks
);
1362 eblk_bytes
= sizeof(struct swap_eb
)*d
->eblks
;
1363 d
->eb_data
= vzalloc(eblk_bytes
);
1367 for (i
= 0; i
< pages
; i
++)
1368 d
->page_data
[i
] = BLOCK_UNDEF
;
1370 for (i
= 0; i
< blocks
; i
++)
1371 d
->revmap
[i
] = PAGE_UNDEF
;
1373 d
->page_buf
= kmalloc(PAGE_SIZE
, GFP_KERNEL
);
1377 d
->oob_buf
= kmalloc(2 * mtd
->oobavail
, GFP_KERNEL
);
1381 mtdswap_scan_eblks(d
);
1392 vfree(d
->page_data
);
1394 printk(KERN_ERR
"%s: init failed (%d)\n", MTDSWAP_PREFIX
, ret
);
1398 static void mtdswap_add_mtd(struct mtd_blktrans_ops
*tr
, struct mtd_info
*mtd
)
1400 struct mtdswap_dev
*d
;
1401 struct mtd_blktrans_dev
*mbd_dev
;
1405 unsigned int eblocks
, eavailable
, bad_blocks
, spare_cnt
;
1406 uint64_t swap_size
, use_size
, size_limit
;
1409 parts
= &partitions
[0];
1413 while ((this_opt
= strsep(&parts
, ",")) != NULL
) {
1414 if (kstrtoul(this_opt
, 0, &part
) < 0)
1417 if (mtd
->index
== part
)
1421 if (mtd
->index
!= part
)
1424 if (mtd
->erasesize
< PAGE_SIZE
|| mtd
->erasesize
% PAGE_SIZE
) {
1425 printk(KERN_ERR
"%s: Erase size %u not multiple of PAGE_SIZE "
1426 "%lu\n", MTDSWAP_PREFIX
, mtd
->erasesize
, PAGE_SIZE
);
1430 if (PAGE_SIZE
% mtd
->writesize
|| mtd
->writesize
> PAGE_SIZE
) {
1431 printk(KERN_ERR
"%s: PAGE_SIZE %lu not multiple of write size"
1432 " %u\n", MTDSWAP_PREFIX
, PAGE_SIZE
, mtd
->writesize
);
1436 if (!mtd
->oobsize
|| mtd
->oobavail
< MTDSWAP_OOBSIZE
) {
1437 printk(KERN_ERR
"%s: Not enough free bytes in OOB, "
1438 "%d available, %zu needed.\n",
1439 MTDSWAP_PREFIX
, mtd
->oobavail
, MTDSWAP_OOBSIZE
);
1443 if (spare_eblocks
> 100)
1444 spare_eblocks
= 100;
1446 use_size
= mtd
->size
;
1447 size_limit
= (uint64_t) BLOCK_MAX
* PAGE_SIZE
;
1449 if (mtd
->size
> size_limit
) {
1450 printk(KERN_WARNING
"%s: Device too large. Limiting size to "
1451 "%llu bytes\n", MTDSWAP_PREFIX
, size_limit
);
1452 use_size
= size_limit
;
1455 eblocks
= mtd_div_by_eb(use_size
, mtd
);
1456 use_size
= (uint64_t)eblocks
* mtd
->erasesize
;
1457 bad_blocks
= mtdswap_badblocks(mtd
, use_size
);
1458 eavailable
= eblocks
- bad_blocks
;
1460 if (eavailable
< MIN_ERASE_BLOCKS
) {
1461 printk(KERN_ERR
"%s: Not enough erase blocks. %u available, "
1462 "%d needed\n", MTDSWAP_PREFIX
, eavailable
,
1467 spare_cnt
= div_u64((uint64_t)eavailable
* spare_eblocks
, 100);
1469 if (spare_cnt
< MIN_SPARE_EBLOCKS
)
1470 spare_cnt
= MIN_SPARE_EBLOCKS
;
1472 if (spare_cnt
> eavailable
- 1)
1473 spare_cnt
= eavailable
- 1;
1475 swap_size
= (uint64_t)(eavailable
- spare_cnt
) * mtd
->erasesize
+
1476 (header
? PAGE_SIZE
: 0);
1478 printk(KERN_INFO
"%s: Enabling MTD swap on device %lu, size %llu KB, "
1479 "%u spare, %u bad blocks\n",
1480 MTDSWAP_PREFIX
, part
, swap_size
/ 1024, spare_cnt
, bad_blocks
);
1482 d
= kzalloc(sizeof(struct mtdswap_dev
), GFP_KERNEL
);
1486 mbd_dev
= kzalloc(sizeof(struct mtd_blktrans_dev
), GFP_KERNEL
);
1492 d
->mbd_dev
= mbd_dev
;
1496 mbd_dev
->devnum
= mtd
->index
;
1497 mbd_dev
->size
= swap_size
>> PAGE_SHIFT
;
1500 if (!(mtd
->flags
& MTD_WRITEABLE
))
1501 mbd_dev
->readonly
= 1;
1503 if (mtdswap_init(d
, eblocks
, spare_cnt
) < 0)
1506 if (add_mtd_blktrans_dev(mbd_dev
) < 0)
1509 d
->dev
= disk_to_dev(mbd_dev
->disk
);
1511 ret
= mtdswap_add_debugfs(d
);
1513 goto debugfs_failed
;
1518 del_mtd_blktrans_dev(mbd_dev
);
1528 static void mtdswap_remove_dev(struct mtd_blktrans_dev
*dev
)
1530 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1532 del_mtd_blktrans_dev(dev
);
1537 static struct mtd_blktrans_ops mtdswap_ops
= {
1541 .blksize
= PAGE_SIZE
,
1542 .flush
= mtdswap_flush
,
1543 .readsect
= mtdswap_readsect
,
1544 .writesect
= mtdswap_writesect
,
1545 .discard
= mtdswap_discard
,
1546 .background
= mtdswap_background
,
1547 .add_mtd
= mtdswap_add_mtd
,
1548 .remove_dev
= mtdswap_remove_dev
,
1549 .owner
= THIS_MODULE
,
1552 static int __init
mtdswap_modinit(void)
1554 return register_mtd_blktrans(&mtdswap_ops
);
1557 static void __exit
mtdswap_modexit(void)
1559 deregister_mtd_blktrans(&mtdswap_ops
);
1562 module_init(mtdswap_modinit
);
1563 module_exit(mtdswap_modexit
);
1566 MODULE_LICENSE("GPL");
1567 MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>");
1568 MODULE_DESCRIPTION("Block device access to an MTD suitable for using as "