2 * Compressed RAM based swap device
4 * Copyright (C) 2008, 2009 Nitin Gupta
6 * This code is released using a dual license strategy: BSD/GPL
7 * You can choose the licence that better fits your requirements.
9 * Released under the terms of 3-clause BSD License
10 * Released under the terms of GNU General Public License Version 2.0
12 * Project home: http://compcache.googlecode.com
15 #define KMSG_COMPONENT "ramzswap"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bitops.h>
21 #include <linux/blkdev.h>
22 #include <linux/buffer_head.h>
23 #include <linux/device.h>
24 #include <linux/genhd.h>
25 #include <linux/highmem.h>
26 #include <linux/lzo.h>
27 #include <linux/mutex.h>
28 #include <linux/string.h>
29 #include <linux/swap.h>
30 #include <linux/swapops.h>
31 #include <linux/vmalloc.h>
32 #include <linux/version.h>
34 #include "ramzswap_drv.h"
37 static int ramzswap_major
;
38 static struct ramzswap
*devices
;
41 * Pages that compress to larger than this size are
42 * forwarded to backing swap, if present or stored
43 * uncompressed in memory otherwise.
45 static unsigned int max_zpage_size
;
47 /* Module params (documentation at end) */
48 static unsigned int num_devices
;
50 static int rzs_test_flag(struct ramzswap
*rzs
, u32 index
,
51 enum rzs_pageflags flag
)
53 return rzs
->table
[index
].flags
& BIT(flag
);
56 static void rzs_set_flag(struct ramzswap
*rzs
, u32 index
,
57 enum rzs_pageflags flag
)
59 rzs
->table
[index
].flags
|= BIT(flag
);
62 static void rzs_clear_flag(struct ramzswap
*rzs
, u32 index
,
63 enum rzs_pageflags flag
)
65 rzs
->table
[index
].flags
&= ~BIT(flag
);
68 static int page_zero_filled(void *ptr
)
73 page
= (unsigned long *)ptr
;
75 for (pos
= 0; pos
!= PAGE_SIZE
/ sizeof(*page
); pos
++) {
84 * memlimit cannot be greater than backing disk size.
86 static void ramzswap_set_memlimit(struct ramzswap
*rzs
, size_t totalram_bytes
)
88 int memlimit_valid
= 1;
91 pr_info("Memory limit not set.\n");
95 if (rzs
->memlimit
> rzs
->disksize
) {
96 pr_info("Memory limit cannot be greater than "
97 "disksize: limit=%zu, disksize=%zu\n",
98 rzs
->memlimit
, rzs
->disksize
);
102 if (!memlimit_valid
) {
103 size_t mempart
, disksize
;
104 pr_info("Using default: smaller of (%u%% of RAM) and "
105 "(backing disk size).\n",
106 default_memlimit_perc_ram
);
107 mempart
= default_memlimit_perc_ram
* (totalram_bytes
/ 100);
108 disksize
= rzs
->disksize
;
109 rzs
->memlimit
= mempart
> disksize
? disksize
: mempart
;
112 if (rzs
->memlimit
> totalram_bytes
/ 2) {
114 "Its not advisable setting limit more than half of "
115 "size of memory since we expect a 2:1 compression ratio. "
116 "Limit represents amount of *compressed* data we can keep "
118 "\tMemory Size: %zu kB\n"
119 "\tLimit you selected: %zu kB\n"
120 "Continuing anyway ...\n",
121 totalram_bytes
>> 10, rzs
->memlimit
>> 10
125 rzs
->memlimit
&= PAGE_MASK
;
126 BUG_ON(!rzs
->memlimit
);
129 static void ramzswap_set_disksize(struct ramzswap
*rzs
, size_t totalram_bytes
)
131 if (!rzs
->disksize
) {
133 "disk size not provided. You can use disksize_kb module "
134 "param to specify size.\nUsing default: (%u%% of RAM).\n",
135 default_disksize_perc_ram
137 rzs
->disksize
= default_disksize_perc_ram
*
138 (totalram_bytes
/ 100);
141 if (rzs
->disksize
> 2 * (totalram_bytes
)) {
143 "There is little point creating a ramzswap of greater than "
144 "twice the size of memory since we expect a 2:1 compression "
145 "ratio. Note that ramzswap uses about 0.1%% of the size of "
146 "the swap device when not in use so a huge ramzswap is "
148 "\tMemory Size: %zu kB\n"
149 "\tSize you selected: %zu kB\n"
150 "Continuing anyway ...\n",
151 totalram_bytes
>> 10, rzs
->disksize
155 rzs
->disksize
&= PAGE_MASK
;
159 * Swap header (1st page of swap device) contains information
160 * to indentify it as a swap partition. Prepare such a header
161 * for ramzswap device (ramzswap0) so that swapon can identify
162 * it as swap partition. In case backing swap device is provided,
163 * copy its swap header.
165 static int setup_swap_header(struct ramzswap
*rzs
, union swap_header
*s
)
169 struct address_space
*mapping
;
170 union swap_header
*backing_swap_header
;
173 * There is no backing swap device. Create a swap header
174 * that is acceptable by swapon.
176 if (!rzs
->backing_swap
) {
178 s
->info
.last_page
= (rzs
->disksize
>> PAGE_SHIFT
) - 1;
179 s
->info
.nr_badpages
= 0;
180 memcpy(s
->magic
.magic
, "SWAPSPACE2", 10);
185 * We have a backing swap device. Copy its swap header
186 * to ramzswap device header. If this header contains
187 * invalid information (backing device not a swap
188 * partition, etc.), swapon will fail for ramzswap
189 * which is correct behavior - we don't want to swap
190 * over filesystem partition!
193 /* Read the backing swap header (code from sys_swapon) */
194 mapping
= rzs
->swap_file
->f_mapping
;
195 if (!mapping
->a_ops
->readpage
) {
200 page
= read_mapping_page(mapping
, 0, rzs
->swap_file
);
206 backing_swap_header
= kmap(page
);
207 memcpy(s
, backing_swap_header
, sizeof(*s
));
208 if (s
->info
.nr_badpages
) {
209 pr_info("Cannot use backing swap with bad pages (%u)\n",
210 s
->info
.nr_badpages
);
214 * ramzswap disksize equals number of usable pages in backing
215 * swap. Set last_page in swap header to match this disksize
216 * ('last_page' means 0-based index of last usable swap page).
218 s
->info
.last_page
= (rzs
->disksize
>> PAGE_SHIFT
) - 1;
225 static void ramzswap_flush_dcache_page(struct page
*page
)
230 * Ugly hack to get flush_dcache_page() work on ARM.
231 * page_mapping(page) == NULL after clearing this swap cache flag.
232 * Without clearing this flag, flush_dcache_page() will simply set
233 * "PG_dcache_dirty" bit and return.
235 if (PageSwapCache(page
)) {
237 ClearPageSwapCache(page
);
240 flush_dcache_page(page
);
243 SetPageSwapCache(page
);
247 void ramzswap_ioctl_get_stats(struct ramzswap
*rzs
,
248 struct ramzswap_ioctl_stats
*s
)
250 strncpy(s
->backing_swap_name
, rzs
->backing_swap_name
,
251 MAX_SWAP_NAME_LEN
- 1);
252 s
->backing_swap_name
[MAX_SWAP_NAME_LEN
- 1] = '\0';
254 s
->disksize
= rzs
->disksize
;
255 s
->memlimit
= rzs
->memlimit
;
257 #if defined(CONFIG_RAMZSWAP_STATS)
259 struct ramzswap_stats
*rs
= &rzs
->stats
;
260 size_t succ_writes
, mem_used
;
261 unsigned int good_compress_perc
= 0, no_compress_perc
= 0;
263 mem_used
= xv_get_total_size_bytes(rzs
->mem_pool
)
264 + (rs
->pages_expand
<< PAGE_SHIFT
);
265 succ_writes
= rs
->num_writes
- rs
->failed_writes
;
267 if (succ_writes
&& rs
->pages_stored
) {
268 good_compress_perc
= rs
->good_compress
* 100
270 no_compress_perc
= rs
->pages_expand
* 100
274 s
->num_reads
= rs
->num_reads
;
275 s
->num_writes
= rs
->num_writes
;
276 s
->failed_reads
= rs
->failed_reads
;
277 s
->failed_writes
= rs
->failed_writes
;
278 s
->invalid_io
= rs
->invalid_io
;
279 s
->pages_zero
= rs
->pages_zero
;
281 s
->good_compress_pct
= good_compress_perc
;
282 s
->pages_expand_pct
= no_compress_perc
;
284 s
->pages_stored
= rs
->pages_stored
;
285 s
->pages_used
= mem_used
>> PAGE_SHIFT
;
286 s
->orig_data_size
= rs
->pages_stored
<< PAGE_SHIFT
;
287 s
->compr_data_size
= rs
->compr_size
;
288 s
->mem_used_total
= mem_used
;
290 s
->bdev_num_reads
= rs
->bdev_num_reads
;
291 s
->bdev_num_writes
= rs
->bdev_num_writes
;
293 #endif /* CONFIG_RAMZSWAP_STATS */
296 static int add_backing_swap_extent(struct ramzswap
*rzs
,
301 struct list_head
*head
;
302 struct page
*curr_page
, *new_page
;
303 unsigned int extents_per_page
= PAGE_SIZE
/
304 sizeof(struct ramzswap_backing_extent
);
306 idx
= rzs
->num_extents
% extents_per_page
;
308 new_page
= alloc_page(__GFP_ZERO
);
312 if (rzs
->num_extents
) {
313 curr_page
= virt_to_page(rzs
->curr_extent
);
314 head
= &curr_page
->lru
;
316 head
= &rzs
->backing_swap_extent_list
;
319 list_add(&new_page
->lru
, head
);
320 rzs
->curr_extent
= page_address(new_page
);
323 rzs
->curr_extent
->phy_pagenum
= phy_pagenum
;
324 rzs
->curr_extent
->num_pages
= num_pages
;
326 pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, "
327 "pg_last=%lu, curr_ext=%p\n", idx
, phy_pagenum
, num_pages
,
328 phy_pagenum
+ num_pages
- 1, rzs
->curr_extent
);
330 if (idx
!= extents_per_page
- 1)
336 static int setup_backing_swap_extents(struct ramzswap
*rzs
,
337 struct inode
*inode
, unsigned long *num_pages
)
341 unsigned blocks_per_page
;
342 pgoff_t contig_pages
= 0, total_pages
= 0;
343 pgoff_t pagenum
= 0, prev_pagenum
= 0;
344 sector_t probe_block
= 0;
347 blkbits
= inode
->i_blkbits
;
348 blocks_per_page
= PAGE_SIZE
>> blkbits
;
350 last_block
= i_size_read(inode
) >> blkbits
;
351 while (probe_block
+ blocks_per_page
<= last_block
) {
352 unsigned block_in_page
;
353 sector_t first_block
;
355 first_block
= bmap(inode
, probe_block
);
356 if (first_block
== 0)
359 /* It must be PAGE_SIZE aligned on-disk */
360 if (first_block
& (blocks_per_page
- 1)) {
365 /* All blocks within this page must be contiguous on disk */
366 for (block_in_page
= 1; block_in_page
< blocks_per_page
;
370 block
= bmap(inode
, probe_block
+ block_in_page
);
373 if (block
!= first_block
+ block_in_page
) {
381 * We found a PAGE_SIZE length, PAGE_SIZE aligned
384 pagenum
= first_block
>> (PAGE_SHIFT
- blkbits
);
386 if (total_pages
&& (pagenum
!= prev_pagenum
+ 1)) {
387 ret
= add_backing_swap_extent(rzs
, prev_pagenum
-
388 (contig_pages
- 1), contig_pages
);
396 prev_pagenum
= pagenum
;
397 probe_block
+= blocks_per_page
;
404 pr_debug("adding last extent: pagenum=%lu, "
405 "contig_pages=%lu\n", pagenum
, contig_pages
);
406 ret
= add_backing_swap_extent(rzs
,
407 prev_pagenum
- (contig_pages
- 1), contig_pages
);
412 if (!rzs
->num_extents
) {
413 pr_err("No swap extents found!\n");
418 *num_pages
= total_pages
;
419 pr_info("Found %lu extents containing %luk\n",
420 rzs
->num_extents
, *num_pages
<< (PAGE_SHIFT
- 10));
425 pr_err("Backing swapfile has holes\n");
428 while (ret
&& !list_empty(&rzs
->backing_swap_extent_list
)) {
430 struct list_head
*entry
= rzs
->backing_swap_extent_list
.next
;
431 page
= list_entry(entry
, struct page
, lru
);
438 static void map_backing_swap_extents(struct ramzswap
*rzs
)
440 struct ramzswap_backing_extent
*se
;
441 struct page
*table_page
, *se_page
;
442 unsigned long num_pages
, num_table_pages
, entry
;
443 unsigned long se_idx
, span
;
444 unsigned entries_per_page
= PAGE_SIZE
/ sizeof(*rzs
->table
);
445 unsigned extents_per_page
= PAGE_SIZE
/ sizeof(*se
);
447 /* True for block device */
448 if (!rzs
->num_extents
)
451 se_page
= list_entry(rzs
->backing_swap_extent_list
.next
,
453 se
= page_address(se_page
);
454 span
= se
->num_pages
;
455 num_pages
= rzs
->disksize
>> PAGE_SHIFT
;
456 num_table_pages
= DIV_ROUND_UP(num_pages
* sizeof(*rzs
->table
),
461 while (num_table_pages
--) {
462 table_page
= vmalloc_to_page(&rzs
->table
[entry
]);
463 while (span
<= entry
) {
465 if (se_idx
== rzs
->num_extents
)
468 if (!(se_idx
% extents_per_page
)) {
469 se_page
= list_entry(se_page
->lru
.next
,
471 se
= page_address(se_page
);
475 span
+= se
->num_pages
;
477 table_page
->mapping
= (struct address_space
*)se
;
478 table_page
->private = se
->num_pages
- (span
- entry
);
479 pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
480 entry
, span
, table_page
->mapping
, table_page
->private);
481 entry
+= entries_per_page
;
486 * Check if value of backing_swap module param is sane.
487 * Claim this device and set ramzswap size equal to
488 * size of this block device.
490 static int setup_backing_swap(struct ramzswap
*rzs
)
494 unsigned long num_pages
= 0;
496 struct file
*swap_file
;
497 struct address_space
*mapping
;
498 struct block_device
*bdev
= NULL
;
500 if (!rzs
->backing_swap_name
[0]) {
501 pr_debug("backing_swap param not given\n");
505 pr_info("Using backing swap device: %s\n", rzs
->backing_swap_name
);
507 swap_file
= filp_open(rzs
->backing_swap_name
,
508 O_RDWR
| O_LARGEFILE
, 0);
509 if (IS_ERR(swap_file
)) {
510 pr_err("Error opening backing device: %s\n",
511 rzs
->backing_swap_name
);
516 mapping
= swap_file
->f_mapping
;
517 inode
= mapping
->host
;
519 if (S_ISBLK(inode
->i_mode
)) {
520 bdev
= I_BDEV(inode
);
521 ret
= bd_claim(bdev
, setup_backing_swap
);
526 disksize
= i_size_read(inode
);
527 } else if (S_ISREG(inode
->i_mode
)) {
528 bdev
= inode
->i_sb
->s_bdev
;
529 if (IS_SWAPFILE(inode
)) {
533 ret
= setup_backing_swap_extents(rzs
, inode
, &num_pages
);
536 disksize
= num_pages
<< PAGE_SHIFT
;
541 rzs
->swap_file
= swap_file
;
542 rzs
->backing_swap
= bdev
;
543 rzs
->disksize
= disksize
;
544 BUG_ON(!rzs
->disksize
);
551 filp_close(swap_file
, NULL
);
554 rzs
->backing_swap
= NULL
;
559 * Map logical page number 'pagenum' to physical page number
560 * on backing swap device. For block device, this is a nop.
562 u32
map_backing_swap_page(struct ramzswap
*rzs
, u32 pagenum
)
564 u32 skip_pages
, entries_per_page
;
565 size_t delta
, se_offset
, skipped
;
566 struct page
*table_page
, *se_page
;
567 struct ramzswap_backing_extent
*se
;
569 if (!rzs
->num_extents
)
572 entries_per_page
= PAGE_SIZE
/ sizeof(*rzs
->table
);
574 table_page
= vmalloc_to_page(&rzs
->table
[pagenum
]);
575 se
= (struct ramzswap_backing_extent
*)table_page
->mapping
;
576 se_page
= virt_to_page(se
);
578 skip_pages
= pagenum
- (pagenum
/ entries_per_page
* entries_per_page
);
579 se_offset
= table_page
->private + skip_pages
;
581 if (se_offset
< se
->num_pages
)
582 return se
->phy_pagenum
+ se_offset
;
584 skipped
= se
->num_pages
- table_page
->private;
586 struct ramzswap_backing_extent
*se_base
;
587 u32 se_entries_per_page
= PAGE_SIZE
/ sizeof(*se
);
589 /* Get next swap extent */
590 se_base
= (struct ramzswap_backing_extent
*)
591 page_address(se_page
);
592 if (se
- se_base
== se_entries_per_page
- 1) {
593 se_page
= list_entry(se_page
->lru
.next
,
595 se
= page_address(se_page
);
600 skipped
+= se
->num_pages
;
601 } while (skipped
< skip_pages
);
603 delta
= skipped
- skip_pages
;
604 se_offset
= se
->num_pages
- delta
;
606 return se
->phy_pagenum
+ se_offset
;
609 static void ramzswap_free_page(struct ramzswap
*rzs
, size_t index
)
614 struct page
*page
= rzs
->table
[index
].page
;
615 u32 offset
= rzs
->table
[index
].offset
;
617 if (unlikely(!page
)) {
618 if (rzs_test_flag(rzs
, index
, RZS_ZERO
)) {
619 rzs_clear_flag(rzs
, index
, RZS_ZERO
);
620 stat_dec(rzs
->stats
.pages_zero
);
625 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
))) {
628 rzs_clear_flag(rzs
, index
, RZS_UNCOMPRESSED
);
629 stat_dec(rzs
->stats
.pages_expand
);
633 obj
= kmap_atomic(page
, KM_USER0
) + offset
;
634 clen
= xv_get_object_size(obj
) - sizeof(struct zobj_header
);
635 kunmap_atomic(obj
, KM_USER0
);
637 xv_free(rzs
->mem_pool
, page
, offset
);
638 if (clen
<= PAGE_SIZE
/ 2)
639 stat_dec(rzs
->stats
.good_compress
);
642 rzs
->stats
.compr_size
-= clen
;
643 stat_dec(rzs
->stats
.pages_stored
);
645 rzs
->table
[index
].page
= NULL
;
646 rzs
->table
[index
].offset
= 0;
649 static int handle_zero_page(struct bio
*bio
)
652 struct page
*page
= bio
->bi_io_vec
[0].bv_page
;
654 user_mem
= kmap_atomic(page
, KM_USER0
);
655 memset(user_mem
, 0, PAGE_SIZE
);
656 kunmap_atomic(user_mem
, KM_USER0
);
658 ramzswap_flush_dcache_page(page
);
660 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
665 static int handle_uncompressed_page(struct ramzswap
*rzs
, struct bio
*bio
)
669 unsigned char *user_mem
, *cmem
;
671 page
= bio
->bi_io_vec
[0].bv_page
;
672 index
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
674 user_mem
= kmap_atomic(page
, KM_USER0
);
675 cmem
= kmap_atomic(rzs
->table
[index
].page
, KM_USER1
) +
676 rzs
->table
[index
].offset
;
678 memcpy(user_mem
, cmem
, PAGE_SIZE
);
679 kunmap_atomic(user_mem
, KM_USER0
);
680 kunmap_atomic(cmem
, KM_USER1
);
682 ramzswap_flush_dcache_page(page
);
684 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
691 * Called when request page is not present in ramzswap.
692 * Its either in backing swap device (if present) or
693 * this is an attempt to read before any previous write
694 * to this location - this happens due to readahead when
695 * swap device is read from user-space (e.g. during swapon)
697 static int handle_ramzswap_fault(struct ramzswap
*rzs
, struct bio
*bio
)
700 * Always forward such requests to backing swap
701 * device (if present)
703 if (rzs
->backing_swap
) {
705 stat_dec(rzs
->stats
.num_reads
);
706 stat_inc(rzs
->stats
.bdev_num_reads
);
707 bio
->bi_bdev
= rzs
->backing_swap
;
710 * In case backing swap is a file, find the right offset within
711 * the file corresponding to logical position 'index'. For block
712 * device, this is a nop.
714 pagenum
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
715 bio
->bi_sector
= map_backing_swap_page(rzs
, pagenum
)
716 << SECTORS_PER_PAGE_SHIFT
;
721 * Its unlikely event in case backing dev is
724 pr_debug("Read before write on swap device: "
725 "sector=%lu, size=%u, offset=%u\n",
726 (ulong
)(bio
->bi_sector
), bio
->bi_size
,
727 bio
->bi_io_vec
[0].bv_offset
);
729 /* Do nothing. Just return success */
730 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
735 static int ramzswap_read(struct ramzswap
*rzs
, struct bio
*bio
)
741 struct zobj_header
*zheader
;
742 unsigned char *user_mem
, *cmem
;
744 stat_inc(rzs
->stats
.num_reads
);
746 page
= bio
->bi_io_vec
[0].bv_page
;
747 index
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
749 if (rzs_test_flag(rzs
, index
, RZS_ZERO
))
750 return handle_zero_page(bio
);
752 /* Requested page is not present in compressed area */
753 if (!rzs
->table
[index
].page
)
754 return handle_ramzswap_fault(rzs
, bio
);
756 /* Page is stored uncompressed since its incompressible */
757 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)))
758 return handle_uncompressed_page(rzs
, bio
);
760 user_mem
= kmap_atomic(page
, KM_USER0
);
763 cmem
= kmap_atomic(rzs
->table
[index
].page
, KM_USER1
) +
764 rzs
->table
[index
].offset
;
766 ret
= lzo1x_decompress_safe(
767 cmem
+ sizeof(*zheader
),
768 xv_get_object_size(cmem
) - sizeof(*zheader
),
771 kunmap_atomic(user_mem
, KM_USER0
);
772 kunmap_atomic(cmem
, KM_USER1
);
774 /* should NEVER happen */
775 if (unlikely(ret
!= LZO_E_OK
)) {
776 pr_err("Decompression failed! err=%d, page=%u\n",
778 stat_inc(rzs
->stats
.failed_reads
);
782 ramzswap_flush_dcache_page(page
);
784 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
793 static int ramzswap_write(struct ramzswap
*rzs
, struct bio
*bio
)
795 int ret
, fwd_write_request
= 0;
798 struct zobj_header
*zheader
;
799 struct page
*page
, *page_store
;
800 unsigned char *user_mem
, *cmem
, *src
;
802 stat_inc(rzs
->stats
.num_writes
);
804 page
= bio
->bi_io_vec
[0].bv_page
;
805 index
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
807 src
= rzs
->compress_buffer
;
810 * System swaps to same sector again when the stored page
811 * is no longer referenced by any process. So, its now safe
812 * to free the memory that was allocated for this page.
814 if (rzs
->table
[index
].page
)
815 ramzswap_free_page(rzs
, index
);
818 * No memory ia allocated for zero filled pages.
819 * Simply clear zero page flag.
821 if (rzs_test_flag(rzs
, index
, RZS_ZERO
)) {
822 stat_dec(rzs
->stats
.pages_zero
);
823 rzs_clear_flag(rzs
, index
, RZS_ZERO
);
826 mutex_lock(&rzs
->lock
);
828 user_mem
= kmap_atomic(page
, KM_USER0
);
829 if (page_zero_filled(user_mem
)) {
830 kunmap_atomic(user_mem
, KM_USER0
);
831 mutex_unlock(&rzs
->lock
);
832 stat_inc(rzs
->stats
.pages_zero
);
833 rzs_set_flag(rzs
, index
, RZS_ZERO
);
835 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
840 if (rzs
->backing_swap
&&
841 (rzs
->stats
.compr_size
> rzs
->memlimit
- PAGE_SIZE
)) {
842 kunmap_atomic(user_mem
, KM_USER0
);
843 mutex_unlock(&rzs
->lock
);
844 fwd_write_request
= 1;
848 ret
= lzo1x_1_compress(user_mem
, PAGE_SIZE
, src
, &clen
,
849 rzs
->compress_workmem
);
851 kunmap_atomic(user_mem
, KM_USER0
);
853 if (unlikely(ret
!= LZO_E_OK
)) {
854 mutex_unlock(&rzs
->lock
);
855 pr_err("Compression failed! err=%d\n", ret
);
856 stat_inc(rzs
->stats
.failed_writes
);
861 * Page is incompressible. Forward it to backing swap
862 * if present. Otherwise, store it as-is (uncompressed)
863 * since we do not want to return too many swap write
864 * errors which has side effect of hanging the system.
866 if (unlikely(clen
> max_zpage_size
)) {
867 if (rzs
->backing_swap
) {
868 mutex_unlock(&rzs
->lock
);
869 fwd_write_request
= 1;
874 page_store
= alloc_page(GFP_NOIO
| __GFP_HIGHMEM
);
875 if (unlikely(!page_store
)) {
876 mutex_unlock(&rzs
->lock
);
877 pr_info("Error allocating memory for incompressible "
878 "page: %u\n", index
);
879 stat_inc(rzs
->stats
.failed_writes
);
884 rzs_set_flag(rzs
, index
, RZS_UNCOMPRESSED
);
885 stat_inc(rzs
->stats
.pages_expand
);
886 rzs
->table
[index
].page
= page_store
;
887 src
= kmap_atomic(page
, KM_USER0
);
891 if (xv_malloc(rzs
->mem_pool
, clen
+ sizeof(*zheader
),
892 &rzs
->table
[index
].page
, &offset
,
893 GFP_NOIO
| __GFP_HIGHMEM
)) {
894 mutex_unlock(&rzs
->lock
);
895 pr_info("Error allocating memory for compressed "
896 "page: %u, size=%zu\n", index
, clen
);
897 stat_inc(rzs
->stats
.failed_writes
);
898 if (rzs
->backing_swap
)
899 fwd_write_request
= 1;
904 rzs
->table
[index
].offset
= offset
;
906 cmem
= kmap_atomic(rzs
->table
[index
].page
, KM_USER1
) +
907 rzs
->table
[index
].offset
;
910 /* Back-reference needed for memory defragmentation */
911 if (!rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)) {
912 zheader
= (struct zobj_header
*)cmem
;
913 zheader
->table_idx
= index
;
914 cmem
+= sizeof(*zheader
);
918 memcpy(cmem
, src
, clen
);
920 kunmap_atomic(cmem
, KM_USER1
);
921 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)))
922 kunmap_atomic(src
, KM_USER0
);
925 rzs
->stats
.compr_size
+= clen
;
926 stat_inc(rzs
->stats
.pages_stored
);
927 if (clen
<= PAGE_SIZE
/ 2)
928 stat_inc(rzs
->stats
.good_compress
);
930 mutex_unlock(&rzs
->lock
);
932 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
937 if (fwd_write_request
) {
938 stat_inc(rzs
->stats
.bdev_num_writes
);
939 bio
->bi_bdev
= rzs
->backing_swap
;
942 * TODO: We currently have linear mapping of ramzswap and
943 * backing swap sectors. This is not desired since we want
944 * to optimize writes to backing swap to minimize disk seeks
945 * or have effective wear leveling (for SSDs). Also, a
946 * non-linear mapping is required to implement compressed
949 bio
->bi_sector
= get_backing_swap_page()
950 << SECTORS_PER_PAGE_SHIFT
;
953 * In case backing swap is a file, find the right offset within
954 * the file corresponding to logical position 'index'. For block
955 * device, this is a nop.
957 bio
->bi_sector
= map_backing_swap_page(rzs
, index
)
958 << SECTORS_PER_PAGE_SHIFT
;
968 * Check if request is within bounds and page aligned.
970 static inline int valid_swap_request(struct ramzswap
*rzs
, struct bio
*bio
)
973 (bio
->bi_sector
>= (rzs
->disksize
>> SECTOR_SHIFT
)) ||
974 (bio
->bi_sector
& (SECTORS_PER_PAGE
- 1)) ||
975 (bio
->bi_vcnt
!= 1) ||
976 (bio
->bi_size
!= PAGE_SIZE
) ||
977 (bio
->bi_io_vec
[0].bv_offset
!= 0))) {
982 /* swap request is valid */
987 * Handler function for all ramzswap I/O requests.
989 static int ramzswap_make_request(struct request_queue
*queue
, struct bio
*bio
)
992 struct ramzswap
*rzs
= queue
->queuedata
;
994 if (unlikely(!rzs
->init_done
)) {
999 if (!valid_swap_request(rzs
, bio
)) {
1000 stat_inc(rzs
->stats
.invalid_io
);
1005 switch (bio_data_dir(bio
)) {
1007 ret
= ramzswap_read(rzs
, bio
);
1011 ret
= ramzswap_write(rzs
, bio
);
1018 static void reset_device(struct ramzswap
*rzs
)
1020 int is_backing_blkdev
= 0;
1021 size_t index
, num_pages
;
1022 unsigned entries_per_page
;
1023 unsigned long num_table_pages
, entry
= 0;
1025 if (rzs
->backing_swap
&& !rzs
->num_extents
)
1026 is_backing_blkdev
= 1;
1028 num_pages
= rzs
->disksize
>> PAGE_SHIFT
;
1030 /* Free various per-device buffers */
1031 kfree(rzs
->compress_workmem
);
1032 free_pages((unsigned long)rzs
->compress_buffer
, 1);
1034 rzs
->compress_workmem
= NULL
;
1035 rzs
->compress_buffer
= NULL
;
1037 /* Free all pages that are still in this ramzswap device */
1038 for (index
= 0; index
< num_pages
; index
++) {
1042 page
= rzs
->table
[index
].page
;
1043 offset
= rzs
->table
[index
].offset
;
1048 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)))
1051 xv_free(rzs
->mem_pool
, page
, offset
);
1054 entries_per_page
= PAGE_SIZE
/ sizeof(*rzs
->table
);
1055 num_table_pages
= DIV_ROUND_UP(num_pages
* sizeof(*rzs
->table
),
1058 * Set page->mapping to NULL for every table page.
1059 * Otherwise, we will hit bad_page() during free.
1061 while (rzs
->num_extents
&& num_table_pages
--) {
1063 page
= vmalloc_to_page(&rzs
->table
[entry
]);
1064 page
->mapping
= NULL
;
1065 entry
+= entries_per_page
;
1070 xv_destroy_pool(rzs
->mem_pool
);
1071 rzs
->mem_pool
= NULL
;
1073 /* Free all swap extent pages */
1074 while (!list_empty(&rzs
->backing_swap_extent_list
)) {
1076 struct list_head
*entry
;
1077 entry
= rzs
->backing_swap_extent_list
.next
;
1078 page
= list_entry(entry
, struct page
, lru
);
1082 INIT_LIST_HEAD(&rzs
->backing_swap_extent_list
);
1083 rzs
->num_extents
= 0;
1085 /* Close backing swap device, if present */
1086 if (rzs
->backing_swap
) {
1087 if (is_backing_blkdev
)
1088 bd_release(rzs
->backing_swap
);
1089 filp_close(rzs
->swap_file
, NULL
);
1090 rzs
->backing_swap
= NULL
;
1094 memset(&rzs
->stats
, 0, sizeof(rzs
->stats
));
1099 /* Back to uninitialized state */
1103 static int ramzswap_ioctl_init_device(struct ramzswap
*rzs
)
1108 union swap_header
*swap_header
;
1110 if (rzs
->init_done
) {
1111 pr_info("Device already initialized!\n");
1115 ret
= setup_backing_swap(rzs
);
1119 if (rzs
->backing_swap
)
1120 ramzswap_set_memlimit(rzs
, totalram_pages
<< PAGE_SHIFT
);
1122 ramzswap_set_disksize(rzs
, totalram_pages
<< PAGE_SHIFT
);
1124 rzs
->compress_workmem
= kzalloc(LZO1X_MEM_COMPRESS
, GFP_KERNEL
);
1125 if (!rzs
->compress_workmem
) {
1126 pr_err("Error allocating compressor working memory!\n");
1131 rzs
->compress_buffer
= (void *)__get_free_pages(__GFP_ZERO
, 1);
1132 if (!rzs
->compress_buffer
) {
1133 pr_err("Error allocating compressor buffer space\n");
1138 num_pages
= rzs
->disksize
>> PAGE_SHIFT
;
1139 rzs
->table
= vmalloc(num_pages
* sizeof(*rzs
->table
));
1141 pr_err("Error allocating ramzswap address table\n");
1142 /* To prevent accessing table entries during cleanup */
1147 memset(rzs
->table
, 0, num_pages
* sizeof(*rzs
->table
));
1149 map_backing_swap_extents(rzs
);
1151 page
= alloc_page(__GFP_ZERO
);
1153 pr_err("Error allocating swap header page\n");
1157 rzs
->table
[0].page
= page
;
1158 rzs_set_flag(rzs
, 0, RZS_UNCOMPRESSED
);
1160 swap_header
= kmap(page
);
1161 ret
= setup_swap_header(rzs
, swap_header
);
1164 pr_err("Error setting swap header\n");
1168 set_capacity(rzs
->disk
, rzs
->disksize
>> SECTOR_SHIFT
);
1171 * We have ident mapping of sectors for ramzswap and
1172 * and the backing swap device. So, this queue flag
1173 * should be according to backing dev.
1175 if (!rzs
->backing_swap
||
1176 blk_queue_nonrot(rzs
->backing_swap
->bd_disk
->queue
))
1177 queue_flag_set_unlocked(QUEUE_FLAG_NONROT
, rzs
->disk
->queue
);
1179 rzs
->mem_pool
= xv_create_pool();
1180 if (!rzs
->mem_pool
) {
1181 pr_err("Error creating memory pool\n");
1187 * Pages that compress to size greater than this are forwarded
1188 * to physical swap disk (if backing dev is provided)
1189 * TODO: make this configurable
1191 if (rzs
->backing_swap
)
1192 max_zpage_size
= max_zpage_size_bdev
;
1194 max_zpage_size
= max_zpage_size_nobdev
;
1195 pr_debug("Max compressed page size: %u bytes\n", max_zpage_size
);
1199 pr_debug("Initialization done!\n");
1205 pr_err("Initialization failed: err=%d\n", ret
);
1209 static int ramzswap_ioctl_reset_device(struct ramzswap
*rzs
)
1217 static int ramzswap_ioctl(struct block_device
*bdev
, fmode_t mode
,
1218 unsigned int cmd
, unsigned long arg
)
1221 size_t disksize_kb
, memlimit_kb
;
1223 struct ramzswap
*rzs
= bdev
->bd_disk
->private_data
;
1226 case RZSIO_SET_DISKSIZE_KB
:
1227 if (rzs
->init_done
) {
1231 if (copy_from_user(&disksize_kb
, (void *)arg
,
1236 rzs
->disksize
= disksize_kb
<< 10;
1237 pr_info("Disk size set to %zu kB\n", disksize_kb
);
1240 case RZSIO_SET_MEMLIMIT_KB
:
1241 if (rzs
->init_done
) {
1242 /* TODO: allow changing memlimit */
1246 if (copy_from_user(&memlimit_kb
, (void *)arg
,
1251 rzs
->memlimit
= memlimit_kb
<< 10;
1252 pr_info("Memory limit set to %zu kB\n", memlimit_kb
);
1255 case RZSIO_SET_BACKING_SWAP
:
1256 if (rzs
->init_done
) {
1261 if (copy_from_user(&rzs
->backing_swap_name
, (void *)arg
,
1266 rzs
->backing_swap_name
[MAX_SWAP_NAME_LEN
- 1] = '\0';
1267 pr_info("Backing swap set to %s\n", rzs
->backing_swap_name
);
1270 case RZSIO_GET_STATS
:
1272 struct ramzswap_ioctl_stats
*stats
;
1273 if (!rzs
->init_done
) {
1277 stats
= kzalloc(sizeof(*stats
), GFP_KERNEL
);
1282 ramzswap_ioctl_get_stats(rzs
, stats
);
1283 if (copy_to_user((void *)arg
, stats
, sizeof(*stats
))) {
1292 ret
= ramzswap_ioctl_init_device(rzs
);
1296 /* Do not reset an active device! */
1297 if (bdev
->bd_holders
) {
1301 ret
= ramzswap_ioctl_reset_device(rzs
);
1305 pr_info("Invalid ioctl %u\n", cmd
);
1313 static struct block_device_operations ramzswap_devops
= {
1314 .ioctl
= ramzswap_ioctl
,
1315 .owner
= THIS_MODULE
,
1318 static void create_device(struct ramzswap
*rzs
, int device_id
)
1320 mutex_init(&rzs
->lock
);
1321 INIT_LIST_HEAD(&rzs
->backing_swap_extent_list
);
1323 rzs
->queue
= blk_alloc_queue(GFP_KERNEL
);
1325 pr_err("Error allocating disk queue for device %d\n",
1330 blk_queue_make_request(rzs
->queue
, ramzswap_make_request
);
1331 rzs
->queue
->queuedata
= rzs
;
1333 /* gendisk structure */
1334 rzs
->disk
= alloc_disk(1);
1336 blk_cleanup_queue(rzs
->queue
);
1337 pr_warning("Error allocating disk structure for device %d\n",
1342 rzs
->disk
->major
= ramzswap_major
;
1343 rzs
->disk
->first_minor
= device_id
;
1344 rzs
->disk
->fops
= &ramzswap_devops
;
1345 rzs
->disk
->queue
= rzs
->queue
;
1346 rzs
->disk
->private_data
= rzs
;
1347 snprintf(rzs
->disk
->disk_name
, 16, "ramzswap%d", device_id
);
1350 * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
1351 * or set equal to backing swap device (if provided)
1353 set_capacity(rzs
->disk
, 0);
1354 add_disk(rzs
->disk
);
1359 static void destroy_device(struct ramzswap
*rzs
)
1362 del_gendisk(rzs
->disk
);
1363 put_disk(rzs
->disk
);
1367 blk_cleanup_queue(rzs
->queue
);
1370 static int __init
ramzswap_init(void)
1374 if (num_devices
> max_num_devices
) {
1375 pr_warning("Invalid value for num_devices: %u\n",
1380 ramzswap_major
= register_blkdev(0, "ramzswap");
1381 if (ramzswap_major
<= 0) {
1382 pr_warning("Unable to get major number\n");
1387 pr_info("num_devices not specified. Using default: 1\n");
1391 /* Allocate the device array and initialize each one */
1392 pr_info("Creating %u devices ...\n", num_devices
);
1393 devices
= kzalloc(num_devices
* sizeof(struct ramzswap
), GFP_KERNEL
);
1399 for (i
= 0; i
< num_devices
; i
++)
1400 create_device(&devices
[i
], i
);
1404 unregister_blkdev(ramzswap_major
, "ramzswap");
1408 static void __exit
ramzswap_exit(void)
1411 struct ramzswap
*rzs
;
1413 for (i
= 0; i
< num_devices
; i
++) {
1416 destroy_device(rzs
);
1421 unregister_blkdev(ramzswap_major
, "ramzswap");
1424 pr_debug("Cleanup done!\n");
1427 module_param(num_devices
, uint
, 0);
1428 MODULE_PARM_DESC(num_devices
, "Number of ramzswap devices");
1430 module_init(ramzswap_init
);
1431 module_exit(ramzswap_exit
);
1433 MODULE_LICENSE("Dual BSD/GPL");
1434 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
1435 MODULE_DESCRIPTION("Compressed RAM Based Swap Device");