drivers/staging/ramzswap/ramzswap_drv.c [mmotm.git]
/*
 * Compressed RAM based swap device
 *
 * Copyright (C) 2008, 2009  Nitin Gupta
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 *
 * Project home: http://compcache.googlecode.com
 */
#define KMSG_COMPONENT "ramzswap"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/lzo.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/vmalloc.h>
#include <linux/version.h>

#include "ramzswap_drv.h"
/* Globals */
static int ramzswap_major;
static struct ramzswap *devices;

/*
 * Pages that compress to a size larger than this are forwarded
 * to the backing swap, if present, or stored uncompressed in
 * memory otherwise.
 */
static unsigned int max_zpage_size;

/* Module params (documentation at end) */
static unsigned int num_devices;
static int rzs_test_flag(struct ramzswap *rzs, u32 index,
                        enum rzs_pageflags flag)
{
        return rzs->table[index].flags & BIT(flag);
}

static void rzs_set_flag(struct ramzswap *rzs, u32 index,
                        enum rzs_pageflags flag)
{
        rzs->table[index].flags |= BIT(flag);
}

static void rzs_clear_flag(struct ramzswap *rzs, u32 index,
                        enum rzs_pageflags flag)
{
        rzs->table[index].flags &= ~BIT(flag);
}
static int page_zero_filled(void *ptr)
{
        unsigned int pos;
        unsigned long *page;

        page = (unsigned long *)ptr;

        for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
                if (page[pos])
                        return 0;
        }

        return 1;
}
/*
 * memlimit cannot be greater than backing disk size.
 */
static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
{
        int memlimit_valid = 1;

        if (!rzs->memlimit) {
                pr_info("Memory limit not set.\n");
                memlimit_valid = 0;
        }

        if (rzs->memlimit > rzs->disksize) {
                pr_info("Memory limit cannot be greater than "
                        "disksize: limit=%zu, disksize=%zu\n",
                        rzs->memlimit, rzs->disksize);
                memlimit_valid = 0;
        }

        if (!memlimit_valid) {
                size_t mempart, disksize;
                pr_info("Using default: smaller of (%u%% of RAM) and "
                        "(backing disk size).\n",
                        default_memlimit_perc_ram);
                mempart = default_memlimit_perc_ram * (totalram_bytes / 100);
                disksize = rzs->disksize;
                rzs->memlimit = mempart > disksize ? disksize : mempart;
        }

        if (rzs->memlimit > totalram_bytes / 2) {
                pr_info(
                "It is not advisable to set a limit of more than half the "
                "size of memory, since we expect a 2:1 compression ratio. "
                "The limit represents the amount of *compressed* data we "
                "can keep in memory!\n"
                "\tMemory Size: %zu kB\n"
                "\tLimit you selected: %zu kB\n"
                "Continuing anyway ...\n",
                totalram_bytes >> 10, rzs->memlimit >> 10
                );
        }

        rzs->memlimit &= PAGE_MASK;
        BUG_ON(!rzs->memlimit);
}
static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
{
        if (!rzs->disksize) {
                pr_info(
                "disk size not provided. You can use disksize_kb module "
                "param to specify size.\nUsing default: (%u%% of RAM).\n",
                default_disksize_perc_ram
                );
                rzs->disksize = default_disksize_perc_ram *
                                        (totalram_bytes / 100);
        }

        if (rzs->disksize > 2 * (totalram_bytes)) {
                pr_info(
                "There is little point creating a ramzswap of greater than "
                "twice the size of memory since we expect a 2:1 compression "
                "ratio. Note that ramzswap uses about 0.1%% of the size of "
                "the swap device when not in use so a huge ramzswap is "
                "wasteful.\n"
                "\tMemory Size: %zu kB\n"
                "\tSize you selected: %zu kB\n"
                "Continuing anyway ...\n",
                totalram_bytes >> 10, rzs->disksize >> 10
                );
        }

        rzs->disksize &= PAGE_MASK;
}
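/*
 * Worked example (illustrative, not from the driver): with
 * totalram_bytes = 1 GiB and no disksize given, the default is
 * default_disksize_perc_ram percent of RAM. The constant comes from
 * ramzswap_drv.h; assuming a value of 25, this yields
 * 25 * (2^30 / 100) bytes, roughly 256 MiB, which the PAGE_MASK step
 * above then rounds down to a PAGE_SIZE multiple.
 */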
/*
 * Swap header (1st page of swap device) contains information
 * to identify it as a swap partition. Prepare such a header
 * for the ramzswap device (ramzswap0) so that swapon can identify
 * it as a swap partition. In case a backing swap device is provided,
 * copy its swap header.
 */
static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
{
        int ret = 0;
        struct page *page;
        struct address_space *mapping;
        union swap_header *backing_swap_header;

        /*
         * There is no backing swap device. Create a swap header
         * that is acceptable by swapon.
         */
        if (!rzs->backing_swap) {
                s->info.version = 1;
                s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
                s->info.nr_badpages = 0;
                memcpy(s->magic.magic, "SWAPSPACE2", 10);
                return 0;
        }

        /*
         * We have a backing swap device. Copy its swap header
         * to the ramzswap device header. If this header contains
         * invalid information (backing device not a swap
         * partition, etc.), swapon will fail for ramzswap,
         * which is correct behavior - we don't want to swap
         * over a filesystem partition!
         */

        /* Read the backing swap header (code from sys_swapon) */
        mapping = rzs->swap_file->f_mapping;
        if (!mapping->a_ops->readpage) {
                ret = -EINVAL;
                goto out;
        }

        page = read_mapping_page(mapping, 0, rzs->swap_file);
        if (IS_ERR(page)) {
                ret = PTR_ERR(page);
                goto out;
        }

        backing_swap_header = kmap(page);
        memcpy(s, backing_swap_header, sizeof(*s));
        if (s->info.nr_badpages) {
                pr_info("Cannot use backing swap with bad pages (%u)\n",
                        s->info.nr_badpages);
                ret = -EINVAL;
        }
        /*
         * ramzswap disksize equals number of usable pages in backing
         * swap. Set last_page in swap header to match this disksize
         * ('last_page' means 0-based index of last usable swap page).
         */
        s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
        kunmap(page);

out:
        return ret;
}
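/*
 * Illustrative example (not from the driver): for a 64 MiB ramzswap
 * with 4 KiB pages and no backing swap, the generated header has
 * version = 1, nr_badpages = 0, last_page = (64 MiB >> 12) - 1 = 16383,
 * plus the "SWAPSPACE2" magic that swapon looks for.
 */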
static void ramzswap_flush_dcache_page(struct page *page)
{
#ifdef CONFIG_ARM
        int flag = 0;
        /*
         * Ugly hack to get flush_dcache_page() to work on ARM.
         * page_mapping(page) == NULL after clearing this swap cache flag.
         * Without clearing this flag, flush_dcache_page() will simply set
         * "PG_dcache_dirty" bit and return.
         */
        if (PageSwapCache(page)) {
                flag = 1;
                ClearPageSwapCache(page);
        }
#endif
        flush_dcache_page(page);
#ifdef CONFIG_ARM
        if (flag)
                SetPageSwapCache(page);
#endif
}
void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
                        struct ramzswap_ioctl_stats *s)
{
        strncpy(s->backing_swap_name, rzs->backing_swap_name,
                MAX_SWAP_NAME_LEN - 1);
        s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';

        s->disksize = rzs->disksize;
        s->memlimit = rzs->memlimit;

#if defined(CONFIG_RAMZSWAP_STATS)
        {
        struct ramzswap_stats *rs = &rzs->stats;
        size_t succ_writes, mem_used;
        unsigned int good_compress_perc = 0, no_compress_perc = 0;

        mem_used = xv_get_total_size_bytes(rzs->mem_pool)
                        + (rs->pages_expand << PAGE_SHIFT);
        succ_writes = rs->num_writes - rs->failed_writes;

        if (succ_writes && rs->pages_stored) {
                good_compress_perc = rs->good_compress * 100
                                        / rs->pages_stored;
                no_compress_perc = rs->pages_expand * 100
                                        / rs->pages_stored;
        }

        s->num_reads = rs->num_reads;
        s->num_writes = rs->num_writes;
        s->failed_reads = rs->failed_reads;
        s->failed_writes = rs->failed_writes;
        s->invalid_io = rs->invalid_io;
        s->pages_zero = rs->pages_zero;

        s->good_compress_pct = good_compress_perc;
        s->pages_expand_pct = no_compress_perc;

        s->pages_stored = rs->pages_stored;
        s->pages_used = mem_used >> PAGE_SHIFT;
        s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
        s->compr_data_size = rs->compr_size;
        s->mem_used_total = mem_used;

        s->bdev_num_reads = rs->bdev_num_reads;
        s->bdev_num_writes = rs->bdev_num_writes;
        }
#endif /* CONFIG_RAMZSWAP_STATS */
}
static int add_backing_swap_extent(struct ramzswap *rzs,
                                pgoff_t phy_pagenum,
                                pgoff_t num_pages)
{
        unsigned int idx;
        struct list_head *head;
        struct page *curr_page, *new_page;
        unsigned int extents_per_page = PAGE_SIZE /
                                sizeof(struct ramzswap_backing_extent);

        idx = rzs->num_extents % extents_per_page;
        if (!idx) {
                new_page = alloc_page(__GFP_ZERO);
                if (!new_page)
                        return -ENOMEM;

                if (rzs->num_extents) {
                        curr_page = virt_to_page(rzs->curr_extent);
                        head = &curr_page->lru;
                } else {
                        head = &rzs->backing_swap_extent_list;
                }

                list_add(&new_page->lru, head);
                rzs->curr_extent = page_address(new_page);
        }

        rzs->curr_extent->phy_pagenum = phy_pagenum;
        rzs->curr_extent->num_pages = num_pages;

        pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, "
                "pg_last=%lu, curr_ext=%p\n", idx, phy_pagenum, num_pages,
                phy_pagenum + num_pages - 1, rzs->curr_extent);

        if (idx != extents_per_page - 1)
                rzs->curr_extent++;

        return 0;
}
static int setup_backing_swap_extents(struct ramzswap *rzs,
                struct inode *inode, unsigned long *num_pages)
{
        int ret = 0;
        unsigned blkbits;
        unsigned blocks_per_page;
        pgoff_t contig_pages = 0, total_pages = 0;
        pgoff_t pagenum = 0, prev_pagenum = 0;
        sector_t probe_block = 0;
        sector_t last_block;

        blkbits = inode->i_blkbits;
        blocks_per_page = PAGE_SIZE >> blkbits;

        last_block = i_size_read(inode) >> blkbits;
        while (probe_block + blocks_per_page <= last_block) {
                unsigned block_in_page;
                sector_t first_block;

                first_block = bmap(inode, probe_block);
                if (first_block == 0)
                        goto bad_bmap;

                /* It must be PAGE_SIZE aligned on-disk */
                if (first_block & (blocks_per_page - 1)) {
                        probe_block++;
                        goto probe_next;
                }

                /* All blocks within this page must be contiguous on disk */
                for (block_in_page = 1; block_in_page < blocks_per_page;
                                        block_in_page++) {
                        sector_t block;

                        block = bmap(inode, probe_block + block_in_page);
                        if (block == 0)
                                goto bad_bmap;
                        if (block != first_block + block_in_page) {
                                /* Discontiguity */
                                probe_block++;
                                goto probe_next;
                        }
                }

                /*
                 * We found a PAGE_SIZE length, PAGE_SIZE aligned
                 * run of blocks.
                 */
                pagenum = first_block >> (PAGE_SHIFT - blkbits);

                if (total_pages && (pagenum != prev_pagenum + 1)) {
                        ret = add_backing_swap_extent(rzs, prev_pagenum -
                                        (contig_pages - 1), contig_pages);
                        if (ret < 0)
                                goto out;
                        rzs->num_extents++;
                        contig_pages = 0;
                }
                total_pages++;
                contig_pages++;
                prev_pagenum = pagenum;
                probe_block += blocks_per_page;

probe_next:
                continue;
        }

        if (contig_pages) {
                pr_debug("adding last extent: pagenum=%lu, "
                        "contig_pages=%lu\n", pagenum, contig_pages);
                ret = add_backing_swap_extent(rzs,
                        prev_pagenum - (contig_pages - 1), contig_pages);
                if (ret < 0)
                        goto out;
                rzs->num_extents++;
        }
        if (!rzs->num_extents) {
                pr_err("No swap extents found!\n");
                ret = -EINVAL;
        }

        if (!ret) {
                *num_pages = total_pages;
                pr_info("Found %lu extents containing %luk\n",
                        rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
        }
        goto out;

bad_bmap:
        pr_err("Backing swapfile has holes\n");
        ret = -EINVAL;
out:
        while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
                struct page *page;
                struct list_head *entry = rzs->backing_swap_extent_list.next;
                page = list_entry(entry, struct page, lru);
                list_del(entry);
                __free_page(page);
        }
        return ret;
}
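/*
 * Worked example (illustrative only): with 4 KiB pages and 1 KiB
 * blocks (blocks_per_page == 4), a swap file whose bmap() runs are
 * blocks 40..47 followed by blocks 120..123 yields two extents:
 *   {phy_pagenum = 10, num_pages = 2}   (pagenum = 40 >> 2)
 *   {phy_pagenum = 30, num_pages = 1}   (pagenum = 120 >> 2)
 * Both runs start PAGE_SIZE aligned (40 and 120 are multiples of 4);
 * an unaligned or discontiguous run would simply be skipped.
 */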
static void map_backing_swap_extents(struct ramzswap *rzs)
{
        struct ramzswap_backing_extent *se;
        struct page *table_page, *se_page;
        unsigned long num_pages, num_table_pages, entry;
        unsigned long se_idx, span;
        unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
        unsigned extents_per_page = PAGE_SIZE / sizeof(*se);

        /* True for block device */
        if (!rzs->num_extents)
                return;

        se_page = list_entry(rzs->backing_swap_extent_list.next,
                        struct page, lru);
        se = page_address(se_page);
        span = se->num_pages;
        num_pages = rzs->disksize >> PAGE_SHIFT;
        num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
                                                        PAGE_SIZE);

        entry = 0;
        se_idx = 0;
        while (num_table_pages--) {
                table_page = vmalloc_to_page(&rzs->table[entry]);
                while (span <= entry) {
                        se_idx++;
                        if (se_idx == rzs->num_extents)
                                BUG();

                        if (!(se_idx % extents_per_page)) {
                                se_page = list_entry(se_page->lru.next,
                                                struct page, lru);
                                se = page_address(se_page);
                        } else
                                se++;
                        span += se->num_pages;
                }

                table_page->mapping = (struct address_space *)se;
                table_page->private = se->num_pages - (span - entry);
                pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
                        entry, span, table_page->mapping, table_page->private);
                entry += entries_per_page;
        }
}
/*
 * Check if value of backing_swap module param is sane.
 * Claim this device and set ramzswap size equal to
 * size of this block device.
 */
static int setup_backing_swap(struct ramzswap *rzs)
{
        int ret = 0;
        size_t disksize;
        unsigned long num_pages = 0;
        struct inode *inode;
        struct file *swap_file;
        struct address_space *mapping;
        struct block_device *bdev = NULL;

        if (!rzs->backing_swap_name[0]) {
                pr_debug("backing_swap param not given\n");
                goto out;
        }

        pr_info("Using backing swap device: %s\n", rzs->backing_swap_name);

        swap_file = filp_open(rzs->backing_swap_name,
                                O_RDWR | O_LARGEFILE, 0);
        if (IS_ERR(swap_file)) {
                pr_err("Error opening backing device: %s\n",
                        rzs->backing_swap_name);
                ret = -EINVAL;
                goto out;
        }

        mapping = swap_file->f_mapping;
        inode = mapping->host;

        if (S_ISBLK(inode->i_mode)) {
                bdev = I_BDEV(inode);
                ret = bd_claim(bdev, setup_backing_swap);
                if (ret < 0) {
                        bdev = NULL;
                        goto bad_param;
                }
                disksize = i_size_read(inode);
        } else if (S_ISREG(inode->i_mode)) {
                bdev = inode->i_sb->s_bdev;
                if (IS_SWAPFILE(inode)) {
                        ret = -EBUSY;
                        goto bad_param;
                }
                ret = setup_backing_swap_extents(rzs, inode, &num_pages);
                if (ret < 0)
                        goto bad_param;
                disksize = num_pages << PAGE_SHIFT;
        } else {
                goto bad_param;
        }

        rzs->swap_file = swap_file;
        rzs->backing_swap = bdev;
        rzs->disksize = disksize;
        BUG_ON(!rzs->disksize);

        return 0;

bad_param:
        if (bdev)
                bd_release(bdev);
        filp_close(swap_file, NULL);

out:
        rzs->backing_swap = NULL;
        return ret;
}
/*
 * Map logical page number 'pagenum' to physical page number
 * on backing swap device. For block device, this is a nop.
 */
u32 map_backing_swap_page(struct ramzswap *rzs, u32 pagenum)
{
        u32 skip_pages, entries_per_page;
        size_t delta, se_offset, skipped;
        struct page *table_page, *se_page;
        struct ramzswap_backing_extent *se;

        if (!rzs->num_extents)
                return pagenum;

        entries_per_page = PAGE_SIZE / sizeof(*rzs->table);

        table_page = vmalloc_to_page(&rzs->table[pagenum]);
        se = (struct ramzswap_backing_extent *)table_page->mapping;
        se_page = virt_to_page(se);

        skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
        se_offset = table_page->private + skip_pages;

        if (se_offset < se->num_pages)
                return se->phy_pagenum + se_offset;

        skipped = se->num_pages - table_page->private;
        do {
                struct ramzswap_backing_extent *se_base;
                u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);

                /* Get next swap extent */
                se_base = (struct ramzswap_backing_extent *)
                                page_address(se_page);
                if (se - se_base == se_entries_per_page - 1) {
                        se_page = list_entry(se_page->lru.next,
                                        struct page, lru);
                        se = page_address(se_page);
                } else {
                        se++;
                }

                skipped += se->num_pages;
        } while (skipped < skip_pages);

        delta = skipped - skip_pages;
        se_offset = se->num_pages - delta;

        return se->phy_pagenum + se_offset;
}
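/*
 * Illustrative example (not from the driver): with the two extents
 * from the sketch above, {phy_pagenum = 10, num_pages = 2} and
 * {phy_pagenum = 30, num_pages = 1}, logical pages 0, 1 and 2 map to
 * physical pages 10, 11 and 30 respectively. On a raw block device
 * (num_extents == 0) the function returns pagenum unchanged.
 */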
static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
{
        u32 clen;
        void *obj;

        struct page *page = rzs->table[index].page;
        u32 offset = rzs->table[index].offset;

        if (unlikely(!page)) {
                if (rzs_test_flag(rzs, index, RZS_ZERO)) {
                        rzs_clear_flag(rzs, index, RZS_ZERO);
                        stat_dec(rzs->stats.pages_zero);
                }
                return;
        }

        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
                clen = PAGE_SIZE;
                __free_page(page);
                rzs_clear_flag(rzs, index, RZS_UNCOMPRESSED);
                stat_dec(rzs->stats.pages_expand);
                goto out;
        }

        obj = kmap_atomic(page, KM_USER0) + offset;
        clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
        kunmap_atomic(obj, KM_USER0);

        xv_free(rzs->mem_pool, page, offset);
        if (clen <= PAGE_SIZE / 2)
                stat_dec(rzs->stats.good_compress);

out:
        rzs->stats.compr_size -= clen;
        stat_dec(rzs->stats.pages_stored);

        rzs->table[index].page = NULL;
        rzs->table[index].offset = 0;
}
static int handle_zero_page(struct bio *bio)
{
        void *user_mem;
        struct page *page = bio->bi_io_vec[0].bv_page;

        user_mem = kmap_atomic(page, KM_USER0);
        memset(user_mem, 0, PAGE_SIZE);
        kunmap_atomic(user_mem, KM_USER0);

        ramzswap_flush_dcache_page(page);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;
}

static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
{
        u32 index;
        struct page *page;
        unsigned char *user_mem, *cmem;

        page = bio->bi_io_vec[0].bv_page;
        index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

        user_mem = kmap_atomic(page, KM_USER0);
        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
                rzs->table[index].offset;

        memcpy(user_mem, cmem, PAGE_SIZE);
        kunmap_atomic(user_mem, KM_USER0);
        kunmap_atomic(cmem, KM_USER1);

        ramzswap_flush_dcache_page(page);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;
}
/*
 * Called when the requested page is not present in ramzswap.
 * It is either in the backing swap device (if present), or
 * this is an attempt to read before any previous write
 * to this location - this happens due to readahead when
 * the swap device is read from user-space (e.g. during swapon).
 */
static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
{
        /*
         * Always forward such requests to backing swap
         * device (if present)
         */
        if (rzs->backing_swap) {
                u32 pagenum;
                stat_dec(rzs->stats.num_reads);
                stat_inc(rzs->stats.bdev_num_reads);
                bio->bi_bdev = rzs->backing_swap;

                /*
                 * In case backing swap is a file, find the right offset
                 * within the file corresponding to logical position
                 * 'index'. For a block device, this is a nop.
                 */
                pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
                bio->bi_sector = map_backing_swap_page(rzs, pagenum)
                                        << SECTORS_PER_PAGE_SHIFT;
                return 1;
        }

        /*
         * This is an unlikely event; it can happen only when a
         * backing device is not present.
         */
        pr_debug("Read before write on swap device: "
                "sector=%lu, size=%u, offset=%u\n",
                (ulong)(bio->bi_sector), bio->bi_size,
                bio->bi_io_vec[0].bv_offset);

        /* Do nothing. Just return success */
        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;
}
static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
{
        int ret;
        u32 index;
        size_t clen;
        struct page *page;
        struct zobj_header *zheader;
        unsigned char *user_mem, *cmem;

        stat_inc(rzs->stats.num_reads);

        page = bio->bi_io_vec[0].bv_page;
        index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

        if (rzs_test_flag(rzs, index, RZS_ZERO))
                return handle_zero_page(bio);

        /* Requested page is not present in compressed area */
        if (!rzs->table[index].page)
                return handle_ramzswap_fault(rzs, bio);

        /* Page is stored uncompressed since it is incompressible */
        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
                return handle_uncompressed_page(rzs, bio);

        user_mem = kmap_atomic(page, KM_USER0);
        clen = PAGE_SIZE;

        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
                rzs->table[index].offset;

        ret = lzo1x_decompress_safe(
                cmem + sizeof(*zheader),
                xv_get_object_size(cmem) - sizeof(*zheader),
                user_mem, &clen);

        kunmap_atomic(user_mem, KM_USER0);
        kunmap_atomic(cmem, KM_USER1);

        /* should NEVER happen */
        if (unlikely(ret != LZO_E_OK)) {
                pr_err("Decompression failed! err=%d, page=%u\n",
                        ret, index);
                stat_inc(rzs->stats.failed_reads);
                goto out;
        }

        ramzswap_flush_dcache_page(page);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;

out:
        bio_io_error(bio);
        return 0;
}
static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
{
        int ret, fwd_write_request = 0;
        u32 offset, index;
        size_t clen;
        struct zobj_header *zheader;
        struct page *page, *page_store;
        unsigned char *user_mem, *cmem, *src;

        stat_inc(rzs->stats.num_writes);

        page = bio->bi_io_vec[0].bv_page;
        index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

        src = rzs->compress_buffer;

        /*
         * System swaps to the same sector again when the stored page
         * is no longer referenced by any process. So, it is now safe
         * to free the memory that was allocated for this page.
         */
        if (rzs->table[index].page)
                ramzswap_free_page(rzs, index);

        /*
         * No memory is allocated for zero filled pages.
         * Simply clear the zero page flag.
         */
        if (rzs_test_flag(rzs, index, RZS_ZERO)) {
                stat_dec(rzs->stats.pages_zero);
                rzs_clear_flag(rzs, index, RZS_ZERO);
        }

        mutex_lock(&rzs->lock);

        user_mem = kmap_atomic(page, KM_USER0);
        if (page_zero_filled(user_mem)) {
                kunmap_atomic(user_mem, KM_USER0);
                mutex_unlock(&rzs->lock);
                stat_inc(rzs->stats.pages_zero);
                rzs_set_flag(rzs, index, RZS_ZERO);

                set_bit(BIO_UPTODATE, &bio->bi_flags);
                bio_endio(bio, 0);
                return 0;
        }

        if (rzs->backing_swap &&
                (rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
                kunmap_atomic(user_mem, KM_USER0);
                mutex_unlock(&rzs->lock);
                fwd_write_request = 1;
                goto out;
        }

        ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
                                rzs->compress_workmem);

        kunmap_atomic(user_mem, KM_USER0);

        if (unlikely(ret != LZO_E_OK)) {
                mutex_unlock(&rzs->lock);
                pr_err("Compression failed! err=%d\n", ret);
                stat_inc(rzs->stats.failed_writes);
                goto out;
        }

        /*
         * Page is incompressible. Forward it to backing swap
         * if present. Otherwise, store it as-is (uncompressed)
         * since we do not want to return too many swap write
         * errors, which have the side effect of hanging the system.
         */
        if (unlikely(clen > max_zpage_size)) {
                if (rzs->backing_swap) {
                        mutex_unlock(&rzs->lock);
                        fwd_write_request = 1;
                        goto out;
                }

                clen = PAGE_SIZE;
                page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
                if (unlikely(!page_store)) {
                        mutex_unlock(&rzs->lock);
                        pr_info("Error allocating memory for incompressible "
                                "page: %u\n", index);
                        stat_inc(rzs->stats.failed_writes);
                        goto out;
                }

                offset = 0;
                rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
                stat_inc(rzs->stats.pages_expand);
                rzs->table[index].page = page_store;
                src = kmap_atomic(page, KM_USER0);
                goto memstore;
        }

        if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
                        &rzs->table[index].page, &offset,
                        GFP_NOIO | __GFP_HIGHMEM)) {
                mutex_unlock(&rzs->lock);
                pr_info("Error allocating memory for compressed "
                        "page: %u, size=%zu\n", index, clen);
                stat_inc(rzs->stats.failed_writes);
                if (rzs->backing_swap)
                        fwd_write_request = 1;
                goto out;
        }

memstore:
        rzs->table[index].offset = offset;

        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
                rzs->table[index].offset;

#if 0
        /* Back-reference needed for memory defragmentation */
        if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
                zheader = (struct zobj_header *)cmem;
                zheader->table_idx = index;
                cmem += sizeof(*zheader);
        }
#endif

        memcpy(cmem, src, clen);

        kunmap_atomic(cmem, KM_USER1);
        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
                kunmap_atomic(src, KM_USER0);

        /* Update stats */
        rzs->stats.compr_size += clen;
        stat_inc(rzs->stats.pages_stored);
        if (clen <= PAGE_SIZE / 2)
                stat_inc(rzs->stats.good_compress);

        mutex_unlock(&rzs->lock);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;

out:
        if (fwd_write_request) {
                stat_inc(rzs->stats.bdev_num_writes);
                bio->bi_bdev = rzs->backing_swap;
#if 0
                /*
                 * TODO: We currently have a linear mapping of ramzswap and
                 * backing swap sectors. This is not desired since we want
                 * to optimize writes to backing swap to minimize disk seeks
                 * or have effective wear leveling (for SSDs). Also, a
                 * non-linear mapping is required to implement compressed
                 * on-disk swapping.
                 */
                bio->bi_sector = get_backing_swap_page()
                                        << SECTORS_PER_PAGE_SHIFT;
#endif
                /*
                 * In case backing swap is a file, find the right offset
                 * within the file corresponding to logical position
                 * 'index'. For a block device, this is a nop.
                 */
                bio->bi_sector = map_backing_swap_page(rzs, index)
                                        << SECTORS_PER_PAGE_SHIFT;
                return 1;
        }

        bio_io_error(bio);
        return 0;
}
/*
 * Check if request is within bounds and page aligned.
 */
static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
{
        if (unlikely(
                (bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
                (bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
                (bio->bi_vcnt != 1) ||
                (bio->bi_size != PAGE_SIZE) ||
                (bio->bi_io_vec[0].bv_offset != 0))) {

                return 0;
        }

        /* swap request is valid */
        return 1;
}
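/*
 * Worked example (illustrative only): with 4 KiB pages and 512-byte
 * sectors, SECTORS_PER_PAGE is 8. A single-segment bio for sector 16
 * with bi_size == 4096 and bv_offset == 0 is valid, and the handlers
 * above derive index = 16 >> SECTORS_PER_PAGE_SHIFT = 2. A bio for
 * sector 17 would fail the alignment check.
 */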
/*
 * Handler function for all ramzswap I/O requests.
 */
static int ramzswap_make_request(struct request_queue *queue, struct bio *bio)
{
        int ret = 0;
        struct ramzswap *rzs = queue->queuedata;

        if (unlikely(!rzs->init_done)) {
                bio_io_error(bio);
                return 0;
        }

        if (!valid_swap_request(rzs, bio)) {
                stat_inc(rzs->stats.invalid_io);
                bio_io_error(bio);
                return 0;
        }

        switch (bio_data_dir(bio)) {
        case READ:
                ret = ramzswap_read(rzs, bio);
                break;

        case WRITE:
                ret = ramzswap_write(rzs, bio);
                break;
        }

        return ret;
}
static void reset_device(struct ramzswap *rzs)
{
        int is_backing_blkdev = 0;
        size_t index, num_pages;
        unsigned entries_per_page;
        unsigned long num_table_pages, entry = 0;

        if (rzs->backing_swap && !rzs->num_extents)
                is_backing_blkdev = 1;

        num_pages = rzs->disksize >> PAGE_SHIFT;

        /* Free various per-device buffers */
        kfree(rzs->compress_workmem);
        free_pages((unsigned long)rzs->compress_buffer, 1);

        rzs->compress_workmem = NULL;
        rzs->compress_buffer = NULL;

        /* Free all pages that are still in this ramzswap device */
        for (index = 0; index < num_pages; index++) {
                struct page *page;
                u16 offset;

                page = rzs->table[index].page;
                offset = rzs->table[index].offset;

                if (!page)
                        continue;

                if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
                        __free_page(page);
                else
                        xv_free(rzs->mem_pool, page, offset);
        }

        entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
        num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
                                        PAGE_SIZE);
        /*
         * Set page->mapping to NULL for every table page.
         * Otherwise, we will hit bad_page() during free.
         */
        while (rzs->num_extents && num_table_pages--) {
                struct page *page;
                page = vmalloc_to_page(&rzs->table[entry]);
                page->mapping = NULL;
                entry += entries_per_page;
        }
        vfree(rzs->table);
        rzs->table = NULL;

        xv_destroy_pool(rzs->mem_pool);
        rzs->mem_pool = NULL;

        /* Free all swap extent pages */
        while (!list_empty(&rzs->backing_swap_extent_list)) {
                struct page *page;
                struct list_head *entry;
                entry = rzs->backing_swap_extent_list.next;
                page = list_entry(entry, struct page, lru);
                list_del(entry);
                __free_page(page);
        }
        INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
        rzs->num_extents = 0;

        /* Close backing swap device, if present */
        if (rzs->backing_swap) {
                if (is_backing_blkdev)
                        bd_release(rzs->backing_swap);
                filp_close(rzs->swap_file, NULL);
                rzs->backing_swap = NULL;
        }

        /* Reset stats */
        memset(&rzs->stats, 0, sizeof(rzs->stats));

        rzs->disksize = 0;
        rzs->memlimit = 0;

        /* Back to uninitialized state */
        rzs->init_done = 0;
}
static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
{
        int ret;
        size_t num_pages;
        struct page *page;
        union swap_header *swap_header;

        if (rzs->init_done) {
                pr_info("Device already initialized!\n");
                return -EBUSY;
        }

        ret = setup_backing_swap(rzs);
        if (ret)
                goto fail;

        if (rzs->backing_swap)
                ramzswap_set_memlimit(rzs, totalram_pages << PAGE_SHIFT);
        else
                ramzswap_set_disksize(rzs, totalram_pages << PAGE_SHIFT);

        rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
        if (!rzs->compress_workmem) {
                pr_err("Error allocating compressor working memory!\n");
                ret = -ENOMEM;
                goto fail;
        }

        rzs->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1);
        if (!rzs->compress_buffer) {
                pr_err("Error allocating compressor buffer space\n");
                ret = -ENOMEM;
                goto fail;
        }

        num_pages = rzs->disksize >> PAGE_SHIFT;
        rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
        if (!rzs->table) {
                pr_err("Error allocating ramzswap address table\n");
                /* To prevent accessing table entries during cleanup */
                rzs->disksize = 0;
                ret = -ENOMEM;
                goto fail;
        }
        memset(rzs->table, 0, num_pages * sizeof(*rzs->table));

        map_backing_swap_extents(rzs);

        page = alloc_page(__GFP_ZERO);
        if (!page) {
                pr_err("Error allocating swap header page\n");
                ret = -ENOMEM;
                goto fail;
        }
        rzs->table[0].page = page;
        rzs_set_flag(rzs, 0, RZS_UNCOMPRESSED);

        swap_header = kmap(page);
        ret = setup_swap_header(rzs, swap_header);
        kunmap(page);
        if (ret) {
                pr_err("Error setting swap header\n");
                goto fail;
        }

        set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);

        /*
         * We have an identity mapping of sectors for ramzswap and
         * the backing swap device. So, this queue flag should be
         * set according to the backing dev.
         */
        if (!rzs->backing_swap ||
                        blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
                queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);

        rzs->mem_pool = xv_create_pool();
        if (!rzs->mem_pool) {
                pr_err("Error creating memory pool\n");
                ret = -ENOMEM;
                goto fail;
        }

        /*
         * Pages that compress to a size greater than this are forwarded
         * to the physical swap disk (if a backing dev is provided).
         * TODO: make this configurable
         */
        if (rzs->backing_swap)
                max_zpage_size = max_zpage_size_bdev;
        else
                max_zpage_size = max_zpage_size_nobdev;
        pr_debug("Max compressed page size: %u bytes\n", max_zpage_size);

        rzs->init_done = 1;

        pr_debug("Initialization done!\n");
        return 0;

fail:
        reset_device(rzs);

        pr_err("Initialization failed: err=%d\n", ret);
        return ret;
}
static int ramzswap_ioctl_reset_device(struct ramzswap *rzs)
{
        if (rzs->init_done)
                reset_device(rzs);

        return 0;
}
static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
{
        int ret = 0;
        size_t disksize_kb, memlimit_kb;

        struct ramzswap *rzs = bdev->bd_disk->private_data;

        switch (cmd) {
        case RZSIO_SET_DISKSIZE_KB:
                if (rzs->init_done) {
                        ret = -EBUSY;
                        goto out;
                }
                if (copy_from_user(&disksize_kb, (void *)arg,
                                                _IOC_SIZE(cmd))) {
                        ret = -EFAULT;
                        goto out;
                }
                rzs->disksize = disksize_kb << 10;
                pr_info("Disk size set to %zu kB\n", disksize_kb);
                break;

        case RZSIO_SET_MEMLIMIT_KB:
                if (rzs->init_done) {
                        /* TODO: allow changing memlimit */
                        ret = -EBUSY;
                        goto out;
                }
                if (copy_from_user(&memlimit_kb, (void *)arg,
                                                _IOC_SIZE(cmd))) {
                        ret = -EFAULT;
                        goto out;
                }
                rzs->memlimit = memlimit_kb << 10;
                pr_info("Memory limit set to %zu kB\n", memlimit_kb);
                break;

        case RZSIO_SET_BACKING_SWAP:
                if (rzs->init_done) {
                        ret = -EBUSY;
                        goto out;
                }

                if (copy_from_user(&rzs->backing_swap_name, (void *)arg,
                                                _IOC_SIZE(cmd))) {
                        ret = -EFAULT;
                        goto out;
                }
                rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
                pr_info("Backing swap set to %s\n", rzs->backing_swap_name);
                break;

        case RZSIO_GET_STATS:
        {
                struct ramzswap_ioctl_stats *stats;
                if (!rzs->init_done) {
                        ret = -ENOTTY;
                        goto out;
                }
                stats = kzalloc(sizeof(*stats), GFP_KERNEL);
                if (!stats) {
                        ret = -ENOMEM;
                        goto out;
                }
                ramzswap_ioctl_get_stats(rzs, stats);
                if (copy_to_user((void *)arg, stats, sizeof(*stats))) {
                        kfree(stats);
                        ret = -EFAULT;
                        goto out;
                }
                kfree(stats);
                break;
        }
        case RZSIO_INIT:
                ret = ramzswap_ioctl_init_device(rzs);
                break;

        case RZSIO_RESET:
                /* Do not reset an active device! */
                if (bdev->bd_holders) {
                        ret = -EBUSY;
                        goto out;
                }
                ret = ramzswap_ioctl_reset_device(rzs);
                break;

        default:
                pr_info("Invalid ioctl %u\n", cmd);
                ret = -ENOTTY;
        }

out:
        return ret;
}
static struct block_device_operations ramzswap_devops = {
        .ioctl = ramzswap_ioctl,
        .owner = THIS_MODULE,
};
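#if 0
/*
 * Illustrative user-space sketch (not part of the driver): how a
 * control utility might size and initialize /dev/ramzswap0 before
 * running swapon. It assumes the RZSIO_* ioctl definitions above are
 * made available to user-space via a copy of the ramzswap ioctl
 * header; the exact header path is an assumption here.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        size_t disksize_kb = 64 * 1024; /* 64 MiB, passed in kB */
        int fd = open("/dev/ramzswap0", O_RDWR);

        if (fd < 0) {
                perror("open");
                return EXIT_FAILURE;
        }

        /* Size must be set while the device is still uninitialized ... */
        if (ioctl(fd, RZSIO_SET_DISKSIZE_KB, &disksize_kb) < 0) {
                perror("RZSIO_SET_DISKSIZE_KB");
                close(fd);
                return EXIT_FAILURE;
        }

        /* ... then RZSIO_INIT allocates the table and swap header. */
        if (ioctl(fd, RZSIO_INIT, 0) < 0) {
                perror("RZSIO_INIT");
                close(fd);
                return EXIT_FAILURE;
        }

        close(fd);
        return EXIT_SUCCESS;    /* now: swapon /dev/ramzswap0 */
}
#endif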
static void create_device(struct ramzswap *rzs, int device_id)
{
        mutex_init(&rzs->lock);
        INIT_LIST_HEAD(&rzs->backing_swap_extent_list);

        rzs->queue = blk_alloc_queue(GFP_KERNEL);
        if (!rzs->queue) {
                pr_err("Error allocating disk queue for device %d\n",
                        device_id);
                return;
        }

        blk_queue_make_request(rzs->queue, ramzswap_make_request);
        rzs->queue->queuedata = rzs;

        /* gendisk structure */
        rzs->disk = alloc_disk(1);
        if (!rzs->disk) {
                blk_cleanup_queue(rzs->queue);
                pr_warning("Error allocating disk structure for device %d\n",
                        device_id);
                return;
        }

        rzs->disk->major = ramzswap_major;
        rzs->disk->first_minor = device_id;
        rzs->disk->fops = &ramzswap_devops;
        rzs->disk->queue = rzs->queue;
        rzs->disk->private_data = rzs;
        snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);

        /*
         * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
         * or set equal to backing swap device (if provided)
         */
        set_capacity(rzs->disk, 0);
        add_disk(rzs->disk);

        rzs->init_done = 0;
}
static void destroy_device(struct ramzswap *rzs)
{
        if (rzs->disk) {
                del_gendisk(rzs->disk);
                put_disk(rzs->disk);
        }

        if (rzs->queue)
                blk_cleanup_queue(rzs->queue);
}
static int __init ramzswap_init(void)
{
        int i, ret;

        if (num_devices > max_num_devices) {
                pr_warning("Invalid value for num_devices: %u\n",
                                num_devices);
                return -EINVAL;
        }

        ramzswap_major = register_blkdev(0, "ramzswap");
        if (ramzswap_major <= 0) {
                pr_warning("Unable to get major number\n");
                return -EBUSY;
        }

        if (!num_devices) {
                pr_info("num_devices not specified. Using default: 1\n");
                num_devices = 1;
        }

        /* Allocate the device array and initialize each one */
        pr_info("Creating %u devices ...\n", num_devices);
        devices = kzalloc(num_devices * sizeof(struct ramzswap), GFP_KERNEL);
        if (!devices) {
                ret = -ENOMEM;
                goto out;
        }

        for (i = 0; i < num_devices; i++)
                create_device(&devices[i], i);

        return 0;

out:
        unregister_blkdev(ramzswap_major, "ramzswap");
        return ret;
}
static void __exit ramzswap_exit(void)
{
        int i;
        struct ramzswap *rzs;

        for (i = 0; i < num_devices; i++) {
                rzs = &devices[i];

                destroy_device(rzs);
                if (rzs->init_done)
                        reset_device(rzs);
        }

        unregister_blkdev(ramzswap_major, "ramzswap");

        kfree(devices);
        pr_debug("Cleanup done!\n");
}
module_param(num_devices, uint, 0);
MODULE_PARM_DESC(num_devices, "Number of ramzswap devices");
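/*
 * Usage example (illustrative): load the module with two devices,
 * then configure each /dev/ramzswapN through the RZSIO_* ioctls
 * before enabling it with swapon:
 *
 *   modprobe ramzswap num_devices=2
 */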
module_init(ramzswap_init);
module_exit(ramzswap_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("Compressed RAM Based Swap Device");