2 * Swap block device support for MTDs
3 * Turns an MTD device into a swap device with block wear leveling
5 * Copyright © 2007,2011 Nokia Corporation. All rights reserved.
7 * Authors: Jarkko Lavinen <jarkko.lavinen@nokia.com>
9 * Based on Richard Purdie's earlier implementation in 2007. Background
10 * support and lock-less operation written by Adrian Hunter.
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * version 2 as published by the Free Software Foundation.
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 #include <linux/kernel.h>
28 #include <linux/module.h>
29 #include <linux/mtd/mtd.h>
30 #include <linux/mtd/blktrans.h>
31 #include <linux/rbtree.h>
32 #include <linux/sched.h>
33 #include <linux/slab.h>
34 #include <linux/vmalloc.h>
35 #include <linux/genhd.h>
36 #include <linux/swap.h>
37 #include <linux/debugfs.h>
38 #include <linux/seq_file.h>
39 #include <linux/device.h>
40 #include <linux/math64.h>
42 #define MTDSWAP_PREFIX "mtdswap"
45 * The number of free eraseblocks when GC should stop
47 #define CLEAN_BLOCK_THRESHOLD 20
50 * Number of free eraseblocks below which GC can also collect low frag
53 #define LOW_FRAG_GC_TRESHOLD 5
56 * Wear level cost amortization. We want to do wear leveling on the background
57 * without disturbing gc too much. This is made by defining max GC frequency.
58 * Frequency value 6 means 1/6 of the GC passes will pick an erase block based
59 * on the biggest wear difference rather than the biggest dirtiness.
61 * The lower freq2 should be chosen so that it makes sure the maximum erase
62 * difference will decrease even if a malicious application is deliberately
63 * trying to make erase differences large.
65 #define MAX_ERASE_DIFF 4000
66 #define COLLECT_NONDIRTY_BASE MAX_ERASE_DIFF
67 #define COLLECT_NONDIRTY_FREQ1 6
68 #define COLLECT_NONDIRTY_FREQ2 4
70 #define PAGE_UNDEF UINT_MAX
71 #define BLOCK_UNDEF UINT_MAX
72 #define BLOCK_ERROR (UINT_MAX - 1)
73 #define BLOCK_MAX (UINT_MAX - 2)
75 #define EBLOCK_BAD (1 << 0)
76 #define EBLOCK_NOMAGIC (1 << 1)
77 #define EBLOCK_BITFLIP (1 << 2)
78 #define EBLOCK_FAILED (1 << 3)
79 #define EBLOCK_READERR (1 << 4)
80 #define EBLOCK_IDX_SHIFT 5
87 unsigned int active_count
;
88 unsigned int erase_count
;
89 unsigned int pad
; /* speeds up pointer decrement */
92 #define MTDSWAP_ECNT_MIN(rbroot) (rb_entry(rb_first(rbroot), struct swap_eb, \
94 #define MTDSWAP_ECNT_MAX(rbroot) (rb_entry(rb_last(rbroot), struct swap_eb, \
114 struct mtd_blktrans_dev
*mbd_dev
;
115 struct mtd_info
*mtd
;
118 unsigned int *page_data
;
119 unsigned int *revmap
;
122 unsigned int spare_eblks
;
123 unsigned int pages_per_eblk
;
124 unsigned int max_erase_count
;
125 struct swap_eb
*eb_data
;
127 struct mtdswap_tree trees
[MTDSWAP_TREE_CNT
];
129 unsigned long long sect_read_count
;
130 unsigned long long sect_write_count
;
131 unsigned long long mtd_write_count
;
132 unsigned long long mtd_read_count
;
133 unsigned long long discard_count
;
134 unsigned long long discard_page_count
;
136 unsigned int curr_write_pos
;
137 struct swap_eb
*curr_write
;
142 struct dentry
*debugfs_root
;
145 struct mtdswap_oobdata
{
150 #define MTDSWAP_MAGIC_CLEAN 0x2095
151 #define MTDSWAP_MAGIC_DIRTY (MTDSWAP_MAGIC_CLEAN + 1)
152 #define MTDSWAP_TYPE_CLEAN 0
153 #define MTDSWAP_TYPE_DIRTY 1
154 #define MTDSWAP_OOBSIZE sizeof(struct mtdswap_oobdata)
156 #define MTDSWAP_ERASE_RETRIES 3 /* Before marking erase block bad */
157 #define MTDSWAP_IO_RETRIES 3
160 MTDSWAP_SCANNED_CLEAN
,
161 MTDSWAP_SCANNED_DIRTY
,
162 MTDSWAP_SCANNED_BITFLIP
,
167 * In the worst case mtdswap_writesect() has allocated the last clean
168 * page from the current block and is then pre-empted by the GC
169 * thread. The thread can consume a full erase block when moving a
172 #define MIN_SPARE_EBLOCKS 2
173 #define MIN_ERASE_BLOCKS (MIN_SPARE_EBLOCKS + 1)
/*
 * Accessors for the per-state erase block trees kept in d->trees[].
 * "name" is the tree suffix (CLEAN, DIRTY, ...) and is pasted onto MTDSWAP_
 * to form the array index. The device argument is parenthesised so that any
 * pointer expression (e.g. "p + 1") expands correctly.
 */
#define TREE_ROOT(d, name) (&(d)->trees[MTDSWAP_ ## name].root)
#define TREE_EMPTY(d, name) (TREE_ROOT(d, name)->rb_node == NULL)
#define TREE_NONEMPTY(d, name) (!TREE_EMPTY(d, name))
#define TREE_COUNT(d, name) ((d)->trees[MTDSWAP_ ## name].count)

/* Fetch the mtdswap private data stored in a blktrans device's ->priv. */
#define MTDSWAP_MBD_TO_MTDSWAP(dev) ((struct mtdswap_dev *)(dev)->priv)
182 static char partitions
[128] = "";
183 module_param_string(partitions
, partitions
, sizeof(partitions
), 0444);
184 MODULE_PARM_DESC(partitions
, "MTD partition numbers to use as swap "
185 "partitions=\"1,3,5\"");
187 static unsigned int spare_eblocks
= 10;
188 module_param(spare_eblocks
, uint
, 0444);
189 MODULE_PARM_DESC(spare_eblocks
, "Percentage of spare erase blocks for "
190 "garbage collection (default 10%)");
192 static bool header
; /* false */
193 module_param(header
, bool, 0444);
194 MODULE_PARM_DESC(header
,
195 "Include builtin swap header (default 0, without header)");
197 static int mtdswap_gc(struct mtdswap_dev
*d
, unsigned int background
);
199 static loff_t
mtdswap_eb_offset(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
201 return (loff_t
)(eb
- d
->eb_data
) * d
->mtd
->erasesize
;
204 static void mtdswap_eb_detach(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
207 struct mtdswap_tree
*tp
;
210 tp
= container_of(eb
->root
, struct mtdswap_tree
, root
);
211 oldidx
= tp
- &d
->trees
[0];
213 d
->trees
[oldidx
].count
--;
214 rb_erase(&eb
->rb
, eb
->root
);
218 static void __mtdswap_rb_add(struct rb_root
*root
, struct swap_eb
*eb
)
220 struct rb_node
**p
, *parent
= NULL
;
226 cur
= rb_entry(parent
, struct swap_eb
, rb
);
227 if (eb
->erase_count
> cur
->erase_count
)
233 rb_link_node(&eb
->rb
, parent
, p
);
234 rb_insert_color(&eb
->rb
, root
);
237 static void mtdswap_rb_add(struct mtdswap_dev
*d
, struct swap_eb
*eb
, int idx
)
239 struct rb_root
*root
;
241 if (eb
->root
== &d
->trees
[idx
].root
)
244 mtdswap_eb_detach(d
, eb
);
245 root
= &d
->trees
[idx
].root
;
246 __mtdswap_rb_add(root
, eb
);
248 d
->trees
[idx
].count
++;
251 static struct rb_node
*mtdswap_rb_index(struct rb_root
*root
, unsigned int idx
)
258 while (i
< idx
&& p
) {
266 static int mtdswap_handle_badblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
272 eb
->flags
|= EBLOCK_BAD
;
273 mtdswap_eb_detach(d
, eb
);
276 /* badblocks not supported */
277 if (!mtd_can_have_bb(d
->mtd
))
280 offset
= mtdswap_eb_offset(d
, eb
);
281 dev_warn(d
->dev
, "Marking bad block at %08llx\n", offset
);
282 ret
= mtd_block_markbad(d
->mtd
, offset
);
285 dev_warn(d
->dev
, "Mark block bad failed for block at %08llx "
286 "error %d\n", offset
, ret
);
294 static int mtdswap_handle_write_error(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
296 unsigned int marked
= eb
->flags
& EBLOCK_FAILED
;
297 struct swap_eb
*curr_write
= d
->curr_write
;
299 eb
->flags
|= EBLOCK_FAILED
;
300 if (curr_write
== eb
) {
301 d
->curr_write
= NULL
;
303 if (!marked
&& d
->curr_write_pos
!= 0) {
304 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
309 return mtdswap_handle_badblock(d
, eb
);
312 static int mtdswap_read_oob(struct mtdswap_dev
*d
, loff_t from
,
313 struct mtd_oob_ops
*ops
)
315 int ret
= mtd_read_oob(d
->mtd
, from
, ops
);
317 if (mtd_is_bitflip(ret
))
321 dev_warn(d
->dev
, "Read OOB failed %d for block at %08llx\n",
326 if (ops
->oobretlen
< ops
->ooblen
) {
327 dev_warn(d
->dev
, "Read OOB return short read (%zd bytes not "
328 "%zd) for block at %08llx\n",
329 ops
->oobretlen
, ops
->ooblen
, from
);
336 static int mtdswap_read_markers(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
338 struct mtdswap_oobdata
*data
, *data2
;
341 struct mtd_oob_ops ops
;
343 offset
= mtdswap_eb_offset(d
, eb
);
345 /* Check first if the block is bad. */
346 if (mtd_can_have_bb(d
->mtd
) && mtd_block_isbad(d
->mtd
, offset
))
347 return MTDSWAP_SCANNED_BAD
;
349 ops
.ooblen
= 2 * d
->mtd
->oobavail
;
350 ops
.oobbuf
= d
->oob_buf
;
353 ops
.mode
= MTD_OPS_AUTO_OOB
;
355 ret
= mtdswap_read_oob(d
, offset
, &ops
);
357 if (ret
&& !mtd_is_bitflip(ret
))
360 data
= (struct mtdswap_oobdata
*)d
->oob_buf
;
361 data2
= (struct mtdswap_oobdata
*)
362 (d
->oob_buf
+ d
->mtd
->oobavail
);
364 if (le16_to_cpu(data
->magic
) == MTDSWAP_MAGIC_CLEAN
) {
365 eb
->erase_count
= le32_to_cpu(data
->count
);
366 if (mtd_is_bitflip(ret
))
367 ret
= MTDSWAP_SCANNED_BITFLIP
;
369 if (le16_to_cpu(data2
->magic
) == MTDSWAP_MAGIC_DIRTY
)
370 ret
= MTDSWAP_SCANNED_DIRTY
;
372 ret
= MTDSWAP_SCANNED_CLEAN
;
375 eb
->flags
|= EBLOCK_NOMAGIC
;
376 ret
= MTDSWAP_SCANNED_DIRTY
;
382 static int mtdswap_write_marker(struct mtdswap_dev
*d
, struct swap_eb
*eb
,
385 struct mtdswap_oobdata n
;
388 struct mtd_oob_ops ops
;
391 ops
.oobbuf
= (uint8_t *)&n
;
392 ops
.mode
= MTD_OPS_AUTO_OOB
;
395 if (marker
== MTDSWAP_TYPE_CLEAN
) {
396 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_CLEAN
);
397 n
.count
= cpu_to_le32(eb
->erase_count
);
398 ops
.ooblen
= MTDSWAP_OOBSIZE
;
399 offset
= mtdswap_eb_offset(d
, eb
);
401 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_DIRTY
);
402 ops
.ooblen
= sizeof(n
.magic
);
403 offset
= mtdswap_eb_offset(d
, eb
) + d
->mtd
->writesize
;
406 ret
= mtd_write_oob(d
->mtd
, offset
, &ops
);
409 dev_warn(d
->dev
, "Write OOB failed for block at %08llx "
410 "error %d\n", offset
, ret
);
411 if (ret
== -EIO
|| mtd_is_eccerr(ret
))
412 mtdswap_handle_write_error(d
, eb
);
416 if (ops
.oobretlen
!= ops
.ooblen
) {
417 dev_warn(d
->dev
, "Short OOB write for block at %08llx: "
419 offset
, ops
.oobretlen
, ops
.ooblen
);
/*
 * Are there any erase blocks without a MAGIC_CLEAN header, presumably
 * because power was cut off after the erase but before the header was
 * written? If so, we need to guesstimate their erase counts.
 */
431 static void mtdswap_check_counts(struct mtdswap_dev
*d
)
433 struct rb_root hist_root
= RB_ROOT
;
434 struct rb_node
*medrb
;
436 unsigned int i
, cnt
, median
;
439 for (i
= 0; i
< d
->eblks
; i
++) {
442 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
445 __mtdswap_rb_add(&hist_root
, eb
);
452 medrb
= mtdswap_rb_index(&hist_root
, cnt
/ 2);
453 median
= rb_entry(medrb
, struct swap_eb
, rb
)->erase_count
;
455 d
->max_erase_count
= MTDSWAP_ECNT_MAX(&hist_root
);
457 for (i
= 0; i
< d
->eblks
; i
++) {
460 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_READERR
))
461 eb
->erase_count
= median
;
463 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
466 rb_erase(&eb
->rb
, &hist_root
);
470 static void mtdswap_scan_eblks(struct mtdswap_dev
*d
)
476 for (i
= 0; i
< d
->eblks
; i
++) {
479 status
= mtdswap_read_markers(d
, eb
);
481 eb
->flags
|= EBLOCK_READERR
;
482 else if (status
== MTDSWAP_SCANNED_BAD
) {
483 eb
->flags
|= EBLOCK_BAD
;
488 case MTDSWAP_SCANNED_CLEAN
:
491 case MTDSWAP_SCANNED_DIRTY
:
492 case MTDSWAP_SCANNED_BITFLIP
:
496 idx
= MTDSWAP_FAILING
;
499 eb
->flags
|= (idx
<< EBLOCK_IDX_SHIFT
);
502 mtdswap_check_counts(d
);
504 for (i
= 0; i
< d
->eblks
; i
++) {
507 if (eb
->flags
& EBLOCK_BAD
)
510 idx
= eb
->flags
>> EBLOCK_IDX_SHIFT
;
511 mtdswap_rb_add(d
, eb
, idx
);
516 * Place eblk into a tree corresponding to its number of active blocks
519 static void mtdswap_store_eb(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
521 unsigned int weight
= eb
->active_count
;
522 unsigned int maxweight
= d
->pages_per_eblk
;
524 if (eb
== d
->curr_write
)
527 if (eb
->flags
& EBLOCK_BITFLIP
)
528 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
529 else if (eb
->flags
& (EBLOCK_READERR
| EBLOCK_FAILED
))
530 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
531 if (weight
== maxweight
)
532 mtdswap_rb_add(d
, eb
, MTDSWAP_USED
);
533 else if (weight
== 0)
534 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
535 else if (weight
> (maxweight
/2))
536 mtdswap_rb_add(d
, eb
, MTDSWAP_LOWFRAG
);
538 mtdswap_rb_add(d
, eb
, MTDSWAP_HIFRAG
);
542 static void mtdswap_erase_callback(struct erase_info
*done
)
544 wait_queue_head_t
*wait_q
= (wait_queue_head_t
*)done
->priv
;
548 static int mtdswap_erase_block(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
550 struct mtd_info
*mtd
= d
->mtd
;
551 struct erase_info erase
;
552 wait_queue_head_t wq
;
553 unsigned int retries
= 0;
557 if (eb
->erase_count
> d
->max_erase_count
)
558 d
->max_erase_count
= eb
->erase_count
;
561 init_waitqueue_head(&wq
);
562 memset(&erase
, 0, sizeof(struct erase_info
));
565 erase
.callback
= mtdswap_erase_callback
;
566 erase
.addr
= mtdswap_eb_offset(d
, eb
);
567 erase
.len
= mtd
->erasesize
;
568 erase
.priv
= (u_long
)&wq
;
570 ret
= mtd_erase(mtd
, &erase
);
572 if (retries
++ < MTDSWAP_ERASE_RETRIES
) {
574 "erase of erase block %#llx on %s failed",
575 erase
.addr
, mtd
->name
);
580 dev_err(d
->dev
, "Cannot erase erase block %#llx on %s\n",
581 erase
.addr
, mtd
->name
);
583 mtdswap_handle_badblock(d
, eb
);
587 ret
= wait_event_interruptible(wq
, erase
.state
== MTD_ERASE_DONE
||
588 erase
.state
== MTD_ERASE_FAILED
);
590 dev_err(d
->dev
, "Interrupted erase block %#llx erassure on %s",
591 erase
.addr
, mtd
->name
);
595 if (erase
.state
== MTD_ERASE_FAILED
) {
596 if (retries
++ < MTDSWAP_ERASE_RETRIES
) {
598 "erase of erase block %#llx on %s failed",
599 erase
.addr
, mtd
->name
);
604 mtdswap_handle_badblock(d
, eb
);
611 static int mtdswap_map_free_block(struct mtdswap_dev
*d
, unsigned int page
,
615 struct swap_eb
*old_eb
= d
->curr_write
;
616 struct rb_root
*clean_root
;
619 if (old_eb
== NULL
|| d
->curr_write_pos
>= d
->pages_per_eblk
) {
621 if (TREE_EMPTY(d
, CLEAN
))
624 clean_root
= TREE_ROOT(d
, CLEAN
);
625 eb
= rb_entry(rb_first(clean_root
), struct swap_eb
, rb
);
626 rb_erase(&eb
->rb
, clean_root
);
628 TREE_COUNT(d
, CLEAN
)--;
630 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_DIRTY
);
631 } while (ret
== -EIO
|| mtd_is_eccerr(ret
));
636 d
->curr_write_pos
= 0;
639 mtdswap_store_eb(d
, old_eb
);
642 *block
= (d
->curr_write
- d
->eb_data
) * d
->pages_per_eblk
+
645 d
->curr_write
->active_count
++;
646 d
->revmap
[*block
] = page
;
652 static unsigned int mtdswap_free_page_cnt(struct mtdswap_dev
*d
)
654 return TREE_COUNT(d
, CLEAN
) * d
->pages_per_eblk
+
655 d
->pages_per_eblk
- d
->curr_write_pos
;
658 static unsigned int mtdswap_enough_free_pages(struct mtdswap_dev
*d
)
660 return mtdswap_free_page_cnt(d
) > d
->pages_per_eblk
;
663 static int mtdswap_write_block(struct mtdswap_dev
*d
, char *buf
,
664 unsigned int page
, unsigned int *bp
, int gc_context
)
666 struct mtd_info
*mtd
= d
->mtd
;
674 while (!mtdswap_enough_free_pages(d
))
675 if (mtdswap_gc(d
, 0) > 0)
678 ret
= mtdswap_map_free_block(d
, page
, bp
);
679 eb
= d
->eb_data
+ (*bp
/ d
->pages_per_eblk
);
681 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
682 d
->curr_write
= NULL
;
684 d
->revmap
[*bp
] = PAGE_UNDEF
;
691 writepos
= (loff_t
)*bp
<< PAGE_SHIFT
;
692 ret
= mtd_write(mtd
, writepos
, PAGE_SIZE
, &retlen
, buf
);
693 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
696 d
->revmap
[*bp
] = PAGE_UNDEF
;
697 mtdswap_handle_write_error(d
, eb
);
702 dev_err(d
->dev
, "Write to MTD device failed: %d (%zd written)",
707 if (retlen
!= PAGE_SIZE
) {
708 dev_err(d
->dev
, "Short write to MTD device: %zd written",
719 d
->revmap
[*bp
] = PAGE_UNDEF
;
724 static int mtdswap_move_block(struct mtdswap_dev
*d
, unsigned int oldblock
,
725 unsigned int *newblock
)
727 struct mtd_info
*mtd
= d
->mtd
;
728 struct swap_eb
*eb
, *oldeb
;
731 unsigned int page
, retries
;
734 page
= d
->revmap
[oldblock
];
735 readpos
= (loff_t
) oldblock
<< PAGE_SHIFT
;
739 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, d
->page_buf
);
741 if (ret
< 0 && !mtd_is_bitflip(ret
)) {
742 oldeb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
743 oldeb
->flags
|= EBLOCK_READERR
;
745 dev_err(d
->dev
, "Read Error: %d (block %u)\n", ret
,
748 if (retries
< MTDSWAP_IO_RETRIES
)
754 if (retlen
!= PAGE_SIZE
) {
755 dev_err(d
->dev
, "Short read: %zd (block %u)\n", retlen
,
761 ret
= mtdswap_write_block(d
, d
->page_buf
, page
, newblock
, 1);
763 d
->page_data
[page
] = BLOCK_ERROR
;
764 dev_err(d
->dev
, "Write error: %d\n", ret
);
768 eb
= d
->eb_data
+ *newblock
/ d
->pages_per_eblk
;
769 d
->page_data
[page
] = *newblock
;
770 d
->revmap
[oldblock
] = PAGE_UNDEF
;
771 eb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
777 d
->page_data
[page
] = BLOCK_ERROR
;
778 d
->revmap
[oldblock
] = PAGE_UNDEF
;
782 static int mtdswap_gc_eblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
784 unsigned int i
, block
, eblk_base
, newblock
;
788 eblk_base
= (eb
- d
->eb_data
) * d
->pages_per_eblk
;
790 for (i
= 0; i
< d
->pages_per_eblk
; i
++) {
791 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
794 block
= eblk_base
+ i
;
795 if (d
->revmap
[block
] == PAGE_UNDEF
)
798 ret
= mtdswap_move_block(d
, block
, &newblock
);
799 if (ret
< 0 && !errcode
)
806 static int __mtdswap_choose_gc_tree(struct mtdswap_dev
*d
)
810 if (TREE_COUNT(d
, CLEAN
) < LOW_FRAG_GC_TRESHOLD
)
811 stopat
= MTDSWAP_LOWFRAG
;
813 stopat
= MTDSWAP_HIFRAG
;
815 for (idx
= MTDSWAP_BITFLIP
; idx
>= stopat
; idx
--)
816 if (d
->trees
[idx
].root
.rb_node
!= NULL
)
822 static int mtdswap_wlfreq(unsigned int maxdiff
)
824 unsigned int h
, x
, y
, dist
, base
;
827 * Calculate linear ramp down from f1 to f2 when maxdiff goes from
828 * MAX_ERASE_DIFF to MAX_ERASE_DIFF + COLLECT_NONDIRTY_BASE. Similar
829 * to triangle with height f1 - f1 and width COLLECT_NONDIRTY_BASE.
832 dist
= maxdiff
- MAX_ERASE_DIFF
;
833 if (dist
> COLLECT_NONDIRTY_BASE
)
834 dist
= COLLECT_NONDIRTY_BASE
;
837 * Modelling the slop as right angular triangle with base
838 * COLLECT_NONDIRTY_BASE and height freq1 - freq2. The ratio y/x is
839 * equal to the ratio h/base.
841 h
= COLLECT_NONDIRTY_FREQ1
- COLLECT_NONDIRTY_FREQ2
;
842 base
= COLLECT_NONDIRTY_BASE
;
845 y
= (x
* h
+ base
/ 2) / base
;
847 return COLLECT_NONDIRTY_FREQ2
+ y
;
850 static int mtdswap_choose_wl_tree(struct mtdswap_dev
*d
)
852 static unsigned int pick_cnt
;
853 unsigned int i
, idx
= -1, wear
, max
;
854 struct rb_root
*root
;
857 for (i
= 0; i
<= MTDSWAP_DIRTY
; i
++) {
858 root
= &d
->trees
[i
].root
;
859 if (root
->rb_node
== NULL
)
862 wear
= d
->max_erase_count
- MTDSWAP_ECNT_MIN(root
);
869 if (max
> MAX_ERASE_DIFF
&& pick_cnt
>= mtdswap_wlfreq(max
) - 1) {
878 static int mtdswap_choose_gc_tree(struct mtdswap_dev
*d
,
879 unsigned int background
)
883 if (TREE_NONEMPTY(d
, FAILING
) &&
884 (background
|| (TREE_EMPTY(d
, CLEAN
) && TREE_EMPTY(d
, DIRTY
))))
885 return MTDSWAP_FAILING
;
887 idx
= mtdswap_choose_wl_tree(d
);
888 if (idx
>= MTDSWAP_CLEAN
)
891 return __mtdswap_choose_gc_tree(d
);
894 static struct swap_eb
*mtdswap_pick_gc_eblk(struct mtdswap_dev
*d
,
895 unsigned int background
)
897 struct rb_root
*rp
= NULL
;
898 struct swap_eb
*eb
= NULL
;
901 if (background
&& TREE_COUNT(d
, CLEAN
) > CLEAN_BLOCK_THRESHOLD
&&
902 TREE_EMPTY(d
, DIRTY
) && TREE_EMPTY(d
, FAILING
))
905 idx
= mtdswap_choose_gc_tree(d
, background
);
909 rp
= &d
->trees
[idx
].root
;
910 eb
= rb_entry(rb_first(rp
), struct swap_eb
, rb
);
912 rb_erase(&eb
->rb
, rp
);
914 d
->trees
[idx
].count
--;
/*
 * Test pattern for step i: 0x55555555 for odd i, 0xAAAAAAAA for even i.
 */
static unsigned int mtdswap_test_patt(unsigned int i)
{
	if (i & 1)
		return 0x55555555;

	return 0xAAAAAAAA;
}
923 static unsigned int mtdswap_eblk_passes(struct mtdswap_dev
*d
,
926 struct mtd_info
*mtd
= d
->mtd
;
927 unsigned int test
, i
, j
, patt
, mtd_pages
;
929 unsigned int *p1
= (unsigned int *)d
->page_buf
;
930 unsigned char *p2
= (unsigned char *)d
->oob_buf
;
931 struct mtd_oob_ops ops
;
934 ops
.mode
= MTD_OPS_AUTO_OOB
;
935 ops
.len
= mtd
->writesize
;
936 ops
.ooblen
= mtd
->oobavail
;
938 ops
.datbuf
= d
->page_buf
;
939 ops
.oobbuf
= d
->oob_buf
;
940 base
= mtdswap_eb_offset(d
, eb
);
941 mtd_pages
= d
->pages_per_eblk
* PAGE_SIZE
/ mtd
->writesize
;
943 for (test
= 0; test
< 2; test
++) {
945 for (i
= 0; i
< mtd_pages
; i
++) {
946 patt
= mtdswap_test_patt(test
+ i
);
947 memset(d
->page_buf
, patt
, mtd
->writesize
);
948 memset(d
->oob_buf
, patt
, mtd
->oobavail
);
949 ret
= mtd_write_oob(mtd
, pos
, &ops
);
953 pos
+= mtd
->writesize
;
957 for (i
= 0; i
< mtd_pages
; i
++) {
958 ret
= mtd_read_oob(mtd
, pos
, &ops
);
962 patt
= mtdswap_test_patt(test
+ i
);
963 for (j
= 0; j
< mtd
->writesize
/sizeof(int); j
++)
967 for (j
= 0; j
< mtd
->oobavail
; j
++)
968 if (p2
[j
] != (unsigned char)patt
)
971 pos
+= mtd
->writesize
;
974 ret
= mtdswap_erase_block(d
, eb
);
979 eb
->flags
&= ~EBLOCK_READERR
;
983 mtdswap_handle_badblock(d
, eb
);
987 static int mtdswap_gc(struct mtdswap_dev
*d
, unsigned int background
)
992 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
995 eb
= mtdswap_pick_gc_eblk(d
, background
);
999 ret
= mtdswap_gc_eblock(d
, eb
);
1003 if (eb
->flags
& EBLOCK_FAILED
) {
1004 mtdswap_handle_badblock(d
, eb
);
1008 eb
->flags
&= ~EBLOCK_BITFLIP
;
1009 ret
= mtdswap_erase_block(d
, eb
);
1010 if ((eb
->flags
& EBLOCK_READERR
) &&
1011 (ret
|| !mtdswap_eblk_passes(d
, eb
)))
1015 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_CLEAN
);
1018 mtdswap_rb_add(d
, eb
, MTDSWAP_CLEAN
);
1019 else if (ret
!= -EIO
&& !mtd_is_eccerr(ret
))
1020 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
/*
 * Background worker hook: keep running background GC passes
 * (mtdswap_gc(d, 1)) until a pass returns non-zero or
 * mtd_blktrans_cease_background() reports that background work should stop.
 */
static void mtdswap_background(struct mtd_blktrans_dev *dev)
{
	struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);

	do {
		if (mtdswap_gc(d, 1))
			break;
	} while (!mtd_blktrans_cease_background(dev));
}
1037 static void mtdswap_cleanup(struct mtdswap_dev
*d
)
1041 vfree(d
->page_data
);
1046 static int mtdswap_flush(struct mtd_blktrans_dev
*dev
)
1048 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1054 static unsigned int mtdswap_badblocks(struct mtd_info
*mtd
, uint64_t size
)
1057 unsigned int badcnt
;
1061 if (mtd_can_have_bb(mtd
))
1062 for (offset
= 0; offset
< size
; offset
+= mtd
->erasesize
)
1063 if (mtd_block_isbad(mtd
, offset
))
1069 static int mtdswap_writesect(struct mtd_blktrans_dev
*dev
,
1070 unsigned long page
, char *buf
)
1072 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1073 unsigned int newblock
, mapped
;
1077 d
->sect_write_count
++;
1079 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
1083 /* Ignore writes to the header page */
1084 if (unlikely(page
== 0))
1090 mapped
= d
->page_data
[page
];
1091 if (mapped
<= BLOCK_MAX
) {
1092 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1094 mtdswap_store_eb(d
, eb
);
1095 d
->page_data
[page
] = BLOCK_UNDEF
;
1096 d
->revmap
[mapped
] = PAGE_UNDEF
;
1099 ret
= mtdswap_write_block(d
, buf
, page
, &newblock
, 0);
1100 d
->mtd_write_count
++;
1105 eb
= d
->eb_data
+ (newblock
/ d
->pages_per_eblk
);
1106 d
->page_data
[page
] = newblock
;
1111 /* Provide a dummy swap header for the kernel */
1112 static int mtdswap_auto_header(struct mtdswap_dev
*d
, char *buf
)
1114 union swap_header
*hd
= (union swap_header
*)(buf
);
1116 memset(buf
, 0, PAGE_SIZE
- 10);
1118 hd
->info
.version
= 1;
1119 hd
->info
.last_page
= d
->mbd_dev
->size
- 1;
1120 hd
->info
.nr_badpages
= 0;
1122 memcpy(buf
+ PAGE_SIZE
- 10, "SWAPSPACE2", 10);
1127 static int mtdswap_readsect(struct mtd_blktrans_dev
*dev
,
1128 unsigned long page
, char *buf
)
1130 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1131 struct mtd_info
*mtd
= d
->mtd
;
1132 unsigned int realblock
, retries
;
1138 d
->sect_read_count
++;
1141 if (unlikely(page
== 0))
1142 return mtdswap_auto_header(d
, buf
);
1147 realblock
= d
->page_data
[page
];
1148 if (realblock
> BLOCK_MAX
) {
1149 memset(buf
, 0x0, PAGE_SIZE
);
1150 if (realblock
== BLOCK_UNDEF
)
1156 eb
= d
->eb_data
+ (realblock
/ d
->pages_per_eblk
);
1157 BUG_ON(d
->revmap
[realblock
] == PAGE_UNDEF
);
1159 readpos
= (loff_t
)realblock
<< PAGE_SHIFT
;
1163 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, buf
);
1165 d
->mtd_read_count
++;
1166 if (mtd_is_bitflip(ret
)) {
1167 eb
->flags
|= EBLOCK_BITFLIP
;
1168 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
1173 dev_err(d
->dev
, "Read error %d\n", ret
);
1174 eb
->flags
|= EBLOCK_READERR
;
1175 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
1177 if (retries
< MTDSWAP_IO_RETRIES
)
1183 if (retlen
!= PAGE_SIZE
) {
1184 dev_err(d
->dev
, "Short read %zd\n", retlen
);
1191 static int mtdswap_discard(struct mtd_blktrans_dev
*dev
, unsigned long first
,
1194 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1197 unsigned int mapped
;
1201 for (page
= first
; page
< first
+ nr_pages
; page
++) {
1202 mapped
= d
->page_data
[page
];
1203 if (mapped
<= BLOCK_MAX
) {
1204 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1206 mtdswap_store_eb(d
, eb
);
1207 d
->page_data
[page
] = BLOCK_UNDEF
;
1208 d
->revmap
[mapped
] = PAGE_UNDEF
;
1209 d
->discard_page_count
++;
1210 } else if (mapped
== BLOCK_ERROR
) {
1211 d
->page_data
[page
] = BLOCK_UNDEF
;
1212 d
->discard_page_count
++;
1219 static int mtdswap_show(struct seq_file
*s
, void *data
)
1221 struct mtdswap_dev
*d
= (struct mtdswap_dev
*) s
->private;
1223 unsigned int count
[MTDSWAP_TREE_CNT
];
1224 unsigned int min
[MTDSWAP_TREE_CNT
];
1225 unsigned int max
[MTDSWAP_TREE_CNT
];
1226 unsigned int i
, cw
= 0, cwp
= 0, cwecount
= 0, bb_cnt
, mapped
, pages
;
1228 char *name
[] = {"clean", "used", "low", "high", "dirty", "bitflip",
1231 mutex_lock(&d
->mbd_dev
->lock
);
1233 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1234 struct rb_root
*root
= &d
->trees
[i
].root
;
1236 if (root
->rb_node
) {
1237 count
[i
] = d
->trees
[i
].count
;
1238 min
[i
] = rb_entry(rb_first(root
), struct swap_eb
,
1240 max
[i
] = rb_entry(rb_last(root
), struct swap_eb
,
1246 if (d
->curr_write
) {
1248 cwp
= d
->curr_write_pos
;
1249 cwecount
= d
->curr_write
->erase_count
;
1253 for (i
= 0; i
< d
->eblks
; i
++)
1254 sum
+= d
->eb_data
[i
].erase_count
;
1256 use_size
= (uint64_t)d
->eblks
* d
->mtd
->erasesize
;
1257 bb_cnt
= mtdswap_badblocks(d
->mtd
, use_size
);
1260 pages
= d
->mbd_dev
->size
;
1261 for (i
= 0; i
< pages
; i
++)
1262 if (d
->page_data
[i
] != BLOCK_UNDEF
)
1265 mutex_unlock(&d
->mbd_dev
->lock
);
1267 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1271 if (min
[i
] != max
[i
])
1272 seq_printf(s
, "%s:\t%5d erase blocks, erased min %d, "
1274 name
[i
], count
[i
], min
[i
], max
[i
]);
1276 seq_printf(s
, "%s:\t%5d erase blocks, all erased %d "
1277 "times\n", name
[i
], count
[i
], min
[i
]);
1281 seq_printf(s
, "bad:\t%5u erase blocks\n", bb_cnt
);
1284 seq_printf(s
, "current erase block: %u pages used, %u free, "
1285 "erased %u times\n",
1286 cwp
, d
->pages_per_eblk
- cwp
, cwecount
);
1288 seq_printf(s
, "total erasures: %lu\n", sum
);
1292 seq_printf(s
, "mtdswap_readsect count: %llu\n", d
->sect_read_count
);
1293 seq_printf(s
, "mtdswap_writesect count: %llu\n", d
->sect_write_count
);
1294 seq_printf(s
, "mtdswap_discard count: %llu\n", d
->discard_count
);
1295 seq_printf(s
, "mtd read count: %llu\n", d
->mtd_read_count
);
1296 seq_printf(s
, "mtd write count: %llu\n", d
->mtd_write_count
);
1297 seq_printf(s
, "discarded pages count: %llu\n", d
->discard_page_count
);
1300 seq_printf(s
, "total pages: %u\n", pages
);
1301 seq_printf(s
, "pages mapped: %u\n", mapped
);
1306 static int mtdswap_open(struct inode
*inode
, struct file
*file
)
1308 return single_open(file
, mtdswap_show
, inode
->i_private
);
1311 static const struct file_operations mtdswap_fops
= {
1312 .open
= mtdswap_open
,
1314 .llseek
= seq_lseek
,
1315 .release
= single_release
,
1318 static int mtdswap_add_debugfs(struct mtdswap_dev
*d
)
1320 struct gendisk
*gd
= d
->mbd_dev
->disk
;
1321 struct device
*dev
= disk_to_dev(gd
);
1323 struct dentry
*root
;
1324 struct dentry
*dent
;
1326 root
= debugfs_create_dir(gd
->disk_name
, NULL
);
1331 dev_err(dev
, "failed to initialize debugfs\n");
1335 d
->debugfs_root
= root
;
1337 dent
= debugfs_create_file("stats", S_IRUSR
, root
, d
,
1340 dev_err(d
->dev
, "debugfs_create_file failed\n");
1341 debugfs_remove_recursive(root
);
1342 d
->debugfs_root
= NULL
;
1349 static int mtdswap_init(struct mtdswap_dev
*d
, unsigned int eblocks
,
1350 unsigned int spare_cnt
)
1352 struct mtd_info
*mtd
= d
->mbd_dev
->mtd
;
1353 unsigned int i
, eblk_bytes
, pages
, blocks
;
1358 d
->spare_eblks
= spare_cnt
;
1359 d
->pages_per_eblk
= mtd
->erasesize
>> PAGE_SHIFT
;
1361 pages
= d
->mbd_dev
->size
;
1362 blocks
= eblocks
* d
->pages_per_eblk
;
1364 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++)
1365 d
->trees
[i
].root
= RB_ROOT
;
1367 d
->page_data
= vmalloc(sizeof(int)*pages
);
1369 goto page_data_fail
;
1371 d
->revmap
= vmalloc(sizeof(int)*blocks
);
1375 eblk_bytes
= sizeof(struct swap_eb
)*d
->eblks
;
1376 d
->eb_data
= vzalloc(eblk_bytes
);
1380 for (i
= 0; i
< pages
; i
++)
1381 d
->page_data
[i
] = BLOCK_UNDEF
;
1383 for (i
= 0; i
< blocks
; i
++)
1384 d
->revmap
[i
] = PAGE_UNDEF
;
1386 d
->page_buf
= kmalloc(PAGE_SIZE
, GFP_KERNEL
);
1390 d
->oob_buf
= kmalloc(2 * mtd
->oobavail
, GFP_KERNEL
);
1394 mtdswap_scan_eblks(d
);
1405 vfree(d
->page_data
);
1407 printk(KERN_ERR
"%s: init failed (%d)\n", MTDSWAP_PREFIX
, ret
);
1411 static void mtdswap_add_mtd(struct mtd_blktrans_ops
*tr
, struct mtd_info
*mtd
)
1413 struct mtdswap_dev
*d
;
1414 struct mtd_blktrans_dev
*mbd_dev
;
1418 unsigned int eblocks
, eavailable
, bad_blocks
, spare_cnt
;
1419 uint64_t swap_size
, use_size
, size_limit
;
1422 parts
= &partitions
[0];
1426 while ((this_opt
= strsep(&parts
, ",")) != NULL
) {
1427 if (kstrtoul(this_opt
, 0, &part
) < 0)
1430 if (mtd
->index
== part
)
1434 if (mtd
->index
!= part
)
1437 if (mtd
->erasesize
< PAGE_SIZE
|| mtd
->erasesize
% PAGE_SIZE
) {
1438 printk(KERN_ERR
"%s: Erase size %u not multiple of PAGE_SIZE "
1439 "%lu\n", MTDSWAP_PREFIX
, mtd
->erasesize
, PAGE_SIZE
);
1443 if (PAGE_SIZE
% mtd
->writesize
|| mtd
->writesize
> PAGE_SIZE
) {
1444 printk(KERN_ERR
"%s: PAGE_SIZE %lu not multiple of write size"
1445 " %u\n", MTDSWAP_PREFIX
, PAGE_SIZE
, mtd
->writesize
);
1449 if (!mtd
->oobsize
|| mtd
->oobavail
< MTDSWAP_OOBSIZE
) {
1450 printk(KERN_ERR
"%s: Not enough free bytes in OOB, "
1451 "%d available, %zu needed.\n",
1452 MTDSWAP_PREFIX
, mtd
->oobavail
, MTDSWAP_OOBSIZE
);
1456 if (spare_eblocks
> 100)
1457 spare_eblocks
= 100;
1459 use_size
= mtd
->size
;
1460 size_limit
= (uint64_t) BLOCK_MAX
* PAGE_SIZE
;
1462 if (mtd
->size
> size_limit
) {
1463 printk(KERN_WARNING
"%s: Device too large. Limiting size to "
1464 "%llu bytes\n", MTDSWAP_PREFIX
, size_limit
);
1465 use_size
= size_limit
;
1468 eblocks
= mtd_div_by_eb(use_size
, mtd
);
1469 use_size
= (uint64_t)eblocks
* mtd
->erasesize
;
1470 bad_blocks
= mtdswap_badblocks(mtd
, use_size
);
1471 eavailable
= eblocks
- bad_blocks
;
1473 if (eavailable
< MIN_ERASE_BLOCKS
) {
1474 printk(KERN_ERR
"%s: Not enough erase blocks. %u available, "
1475 "%d needed\n", MTDSWAP_PREFIX
, eavailable
,
1480 spare_cnt
= div_u64((uint64_t)eavailable
* spare_eblocks
, 100);
1482 if (spare_cnt
< MIN_SPARE_EBLOCKS
)
1483 spare_cnt
= MIN_SPARE_EBLOCKS
;
1485 if (spare_cnt
> eavailable
- 1)
1486 spare_cnt
= eavailable
- 1;
1488 swap_size
= (uint64_t)(eavailable
- spare_cnt
) * mtd
->erasesize
+
1489 (header
? PAGE_SIZE
: 0);
1491 printk(KERN_INFO
"%s: Enabling MTD swap on device %lu, size %llu KB, "
1492 "%u spare, %u bad blocks\n",
1493 MTDSWAP_PREFIX
, part
, swap_size
/ 1024, spare_cnt
, bad_blocks
);
1495 d
= kzalloc(sizeof(struct mtdswap_dev
), GFP_KERNEL
);
1499 mbd_dev
= kzalloc(sizeof(struct mtd_blktrans_dev
), GFP_KERNEL
);
1505 d
->mbd_dev
= mbd_dev
;
1509 mbd_dev
->devnum
= mtd
->index
;
1510 mbd_dev
->size
= swap_size
>> PAGE_SHIFT
;
1513 if (!(mtd
->flags
& MTD_WRITEABLE
))
1514 mbd_dev
->readonly
= 1;
1516 if (mtdswap_init(d
, eblocks
, spare_cnt
) < 0)
1519 if (add_mtd_blktrans_dev(mbd_dev
) < 0)
1522 d
->dev
= disk_to_dev(mbd_dev
->disk
);
1524 ret
= mtdswap_add_debugfs(d
);
1526 goto debugfs_failed
;
1531 del_mtd_blktrans_dev(mbd_dev
);
1541 static void mtdswap_remove_dev(struct mtd_blktrans_dev
*dev
)
1543 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1545 debugfs_remove_recursive(d
->debugfs_root
);
1546 del_mtd_blktrans_dev(dev
);
1551 static struct mtd_blktrans_ops mtdswap_ops
= {
1555 .blksize
= PAGE_SIZE
,
1556 .flush
= mtdswap_flush
,
1557 .readsect
= mtdswap_readsect
,
1558 .writesect
= mtdswap_writesect
,
1559 .discard
= mtdswap_discard
,
1560 .background
= mtdswap_background
,
1561 .add_mtd
= mtdswap_add_mtd
,
1562 .remove_dev
= mtdswap_remove_dev
,
1563 .owner
= THIS_MODULE
,
1566 static int __init
mtdswap_modinit(void)
1568 return register_mtd_blktrans(&mtdswap_ops
);
1571 static void __exit
mtdswap_modexit(void)
1573 deregister_mtd_blktrans(&mtdswap_ops
);
1576 module_init(mtdswap_modinit
);
1577 module_exit(mtdswap_modexit
);
1580 MODULE_LICENSE("GPL");
1581 MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>");
1582 MODULE_DESCRIPTION("Block device access to an MTD suitable for using as "