net: move procfs code to net/core/net-procfs.c
[linux/fpc-iii.git] / drivers / staging / ramster / zbud.c
bloba7c436127aa121064a195936979ca01eada57733
1 /*
2 * zbud.c - Compression buddies allocator
4 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
6 * Compression buddies ("zbud") provides for efficiently packing two
7 * (or, possibly in the future, more) compressed pages ("zpages") into
8 * a single "raw" pageframe and for tracking both zpages and pageframes
9 * so that whole pageframes can be easily reclaimed in LRU-like order.
10 * It is designed to be used in conjunction with transcendent memory
11 * ("tmem"); for example separate LRU lists are maintained for persistent
12 * vs. ephemeral pages.
14 * A zbudpage is an overlay for a struct page and thus each zbudpage
15 * refers to a physical pageframe of RAM. When the caller passes a
16 * struct page from the kernel's page allocator, zbud "transforms" it
17 * to a zbudpage which sets/uses a different set of fields than the
18 * struct-page and thus must "untransform" it back by reinitializing
19 * certain fields before the struct-page can be freed. The fields
20 * of a zbudpage include a page lock for controlling access to the
21 * corresponding pageframe, and there is a size field for each zpage.
22 * Each zbudpage also lives on two linked lists: a "budlist" which is
23 * used to support efficient buddying of zpages; and an "lru" which
24 * is used for reclaiming pageframes in approximately least-recently-used
25 * order.
27 * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
28 * which contain the compressed data for zero, one, or two zbuds. Contained
29 * with the compressed data is a tmem_handle which is a key to allow
30 * the same data to be found via the tmem interface so the zpage can
31 * be invalidated (for ephemeral pages) or repatriated to the swap cache
32 * (for persistent pages). The contents of a zbudpageframe must never
33 * be accessed without holding the page lock for the corresponding
34 * zbudpage and, to accommodate highmem machines, the contents may
35 * only be examined or changed when kmapped. Thus, when in use, a
36 * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
38 * Note that the term "zbud" refers to the combination of a zpage and
39 * a tmem_handle that is stored as one of possibly two "buddied" zpages;
40 * it also generically refers to this allocator... sorry for any confusion.
42 * A zbudref is a pointer to a struct zbudpage (which can be cast to a
43 * struct page), with the LSB either cleared or set to indicate, respectively,
44 * the first or second zpage in the zbudpageframe. Since a zbudref can be
45 * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
46 * references a stored tmem page and so is the only zbud data structure
47 * externally visible to zbud.c/zbud.h.
49 * Since we wish to reclaim entire pageframes but zpages may be randomly
50 * added and deleted to any given pageframe, we approximate LRU by
51 * promoting a pageframe to MRU when a zpage is added to it, but
52 * leaving it at the current place in the list when a zpage is deleted
53 * from it. As a side effect, zpages that are difficult to buddy (e.g.
54 * very large pages) will be reclaimed faster than average, which seems
55 * reasonable.
57 * In the current implementation, no more than two zpages may be stored in
58 * any pageframe and no zpage ever crosses a pageframe boundary. While
59 * other zpage allocation mechanisms may allow greater density, this two
60 * zpage-per-pageframe limit both ensures simple reclaim of pageframes
61 * (including garbage collection of references to the contents of those
62 * pageframes from tmem data structures) AND avoids the need for compaction.
63 * With additional complexity, zbud could be modified to support storing
64 * up to three zpages per pageframe or, to handle larger average zpages,
65 * up to three zpages per pair of pageframes, but it is not clear if the
66 * additional complexity would be worth it. So consider it an exercise
67 * for future developers.
69 * Note also that zbud does no page allocation or freeing. This is so
70 * that the caller has complete control over and, for accounting, visibility
71 * into if/when pages are allocated and freed.
73 * Finally, note that zbud limits the size of zpages it can store; the
74 * caller must check the zpage size with zbud_max_buddy_size before
75 * storing it, else BUGs will result. User beware.
78 #include <linux/module.h>
79 #include <linux/highmem.h>
80 #include <linux/list.h>
81 #include <linux/spinlock.h>
82 #include <linux/pagemap.h>
83 #include <linux/atomic.h>
84 #include <linux/bug.h>
85 #include "tmem.h"
86 #include "zcache.h"
87 #include "zbud.h"
90 * We need to ensure that a struct zbudpage is never larger than a
91 * struct page. This is checked with a BUG_ON in zbud_init.
93 * The unevictable field indicates that a zbud is being added to the
94 * zbudpage. Since this is a two-phase process (due to tmem locking),
95 * this field locks the zbudpage against eviction when a zbud match
96 * or creation is in process. Since this addition process may occur
97 * in parallel for two zbuds in one zbudpage, the field is a counter
98 * that must not exceed two.
100 struct zbudpage {
101 union {
102 struct page page;
103 struct {
104 unsigned long space_for_flags;
105 struct {
106 unsigned zbud0_size:12;
107 unsigned zbud1_size:12;
108 unsigned unevictable:2;
110 struct list_head budlist;
111 struct list_head lru;
/*
 * A zbudref is a zbudpage pointer whose least significant bit selects
 * the first (0) or second (1) zbud in the pageframe.  The union lets the
 * same storage be viewed either as a pointer or as an integer.
 */
struct zbudref {
	union {
		struct zbudpage *zbudpage;
		unsigned long zbudref;
	};
};
/*
 * Chunk geometry: a pageframe is divided into CHUNK_SIZE-byte chunks.
 * CHUNK_MASK rounds a byte count down to a chunk boundary, NCHUNKS is the
 * number of chunks in one page and MAX_CHUNK is the largest chunk count a
 * single zbud may occupy (see zbud_max_size()).
 */
#define CHUNK_SHIFT 6
#define CHUNK_SIZE (1 << CHUNK_SHIFT)
#define CHUNK_MASK (~(CHUNK_SIZE-1))
#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
#define MAX_CHUNK (NCHUNKS-1)
130 * The following functions deal with the difference between struct
131 * page and struct zbudpage. Note the hack of using the pageflags
132 * from struct page; this is to avoid duplicating all the complex
133 * pageflag macros.
135 static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
137 struct page *page = (struct page *)zbudpage;
139 while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
140 do {
141 cpu_relax();
142 } while (test_bit(PG_locked, &page->flags));
146 static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
148 struct page *page = (struct page *)zbudpage;
150 clear_bit(PG_locked, &page->flags);
/*
 * Single attempt to take the zbudpage lock; forwards to trylock_page()
 * on the underlying struct page and returns its result (callers treat
 * nonzero as "lock acquired").
 */
static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	return trylock_page(page);
}
/* Report whether the underlying struct page currently has PageLocked set. */
static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	return PageLocked(page);
}
/*
 * Map the pageframe behind a zbudpage with kmap_atomic() and return the
 * mapping; undo with kunmap_zbudpage_atomic().
 */
static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
{
	struct page *page = (struct page *)zbudpage;

	return kmap_atomic(page);
}
169 * A dying zbudpage is an ephemeral page in the process of being evicted.
170 * Any data contained in the zbudpage is invalid and we are just waiting for
171 * the tmem pampds to be invalidated before freeing the page
173 static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
175 struct page *page = (struct page *)zbudpage;
177 return test_bit(PG_reclaim, &page->flags);
180 static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
182 struct page *page = (struct page *)zbudpage;
184 set_bit(PG_reclaim, &page->flags);
187 static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
189 struct page *page = (struct page *)zbudpage;
191 clear_bit(PG_reclaim, &page->flags);
195 * A zombie zbudpage is a persistent page in the process of being evicted.
196 * The data contained in the zbudpage is valid and we are just waiting for
197 * the tmem pampds to be invalidated before freeing the page
199 static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
201 struct page *page = (struct page *)zbudpage;
203 return test_bit(PG_dirty, &page->flags);
206 static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
208 struct page *page = (struct page *)zbudpage;
210 set_bit(PG_dirty, &page->flags);
213 static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
215 struct page *page = (struct page *)zbudpage;
217 clear_bit(PG_dirty, &page->flags);
/* Release a mapping previously obtained from kmap_zbudpage_atomic(). */
static inline void kunmap_zbudpage_atomic(void *zbpg)
{
	void *mapping = zbpg;

	kunmap_atomic(mapping);
}
226 * zbud "translation" and helper functions
/*
 * Recover the zbudpage pointer from a zbudref by masking off the low
 * bit, which carries the bud number rather than address information.
 */
static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
{
	return (struct zbudpage *)((unsigned long)zref & ~1UL);
}
/*
 * Build a zbudref from a zbudpage pointer and a bud number by OR-ing the
 * bud number into the low bit of the pointer.  BUGs if budnum is not
 * 0 or 1.
 */
static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
							unsigned budnum)
{
	BUG_ON(budnum > 1);
	return (struct zbudref *)((unsigned long)zbudpage | budnum);
}
/* Extract the bud number (0 or 1) stored in the low bit of a zbudref. */
static inline int zbudref_budnum(struct zbudref *zbudref)
{
	return (unsigned long)zbudref & 1UL;
}
251 static inline unsigned zbud_max_size(void)
253 return MAX_CHUNK << CHUNK_SHIFT;
256 static inline unsigned zbud_size_to_chunks(unsigned size)
258 BUG_ON(size == 0 || size > zbud_max_size());
259 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
262 /* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
263 static inline char *zbud_data(void *zbpg,
264 unsigned budnum, unsigned size)
266 char *p;
268 BUG_ON(size == 0 || size > zbud_max_size());
269 p = (char *)zbpg;
270 if (budnum == 1)
271 p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
272 return p;
/*
 * These are all informative and exposed through debugfs... except for
 * the arrays... anyone know how to do that?  To avoid confusion for
 * debugfs viewers, some of these should also be atomic_long_t, but
 * I don't know how to expose atomics via debugfs either...
 *
 * Each statistic exists twice: "eph" for ephemeral pages and "pers" for
 * persistent pages.
 */
/* pageframes currently transformed into zbudpages */
static unsigned long zbud_eph_pageframes;
static unsigned long zbud_pers_pageframes;
/* zbuds currently stored, and their total byte size */
static unsigned long zbud_eph_zpages;
static unsigned long zbud_pers_zpages;
static u64 zbud_eph_zbytes;
static u64 zbud_pers_zbytes;
static unsigned long zbud_eph_evicted_pageframes;
static unsigned long zbud_pers_evicted_pageframes;
/* cumulative totals: only ever incremented in zbud_init_zbud() */
static unsigned long zbud_eph_cumul_zpages;
static unsigned long zbud_pers_cumul_zpages;
static u64 zbud_eph_cumul_zbytes;
static u64 zbud_pers_cumul_zbytes;
/* histogram of stored zbuds, indexed by chunk count */
static unsigned long zbud_eph_cumul_chunk_counts[NCHUNKS];
static unsigned long zbud_pers_cumul_chunk_counts[NCHUNKS];
/* zbudpages holding two zbuds vs. one zbud */
static unsigned long zbud_eph_buddied_count;
static unsigned long zbud_pers_buddied_count;
static unsigned long zbud_eph_unbuddied_count;
static unsigned long zbud_pers_unbuddied_count;
/* plain copies of the zombie atomics below, refreshed on each inc/dec */
static unsigned long zbud_eph_zombie_count;
static unsigned long zbud_pers_zombie_count;
static atomic_t zbud_eph_zombie_atomic;
static atomic_t zbud_pers_zombie_atomic;
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define zdfs debugfs_create_size_t
#define zdfs64 debugfs_create_u64
/*
 * Create the "zbud" debugfs directory and one read-only file per
 * statistic.  Returns 0 on success or -ENXIO if the directory could
 * not be created.
 *
 * zbud_eph_zombie_count is exported alongside zbud_pers_zombie_count so
 * that both zombie counters, which are maintained in parallel, are
 * observable.
 */
static int zbud_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("zbud", NULL);
	if (root == NULL)
		return -ENXIO;

	/*
	 * would be nice to dump the sizes of the unbuddied
	 * arrays, like was done with sysfs, but it doesn't
	 * look like debugfs is flexible enough to do that
	 */
	zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
	zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
	zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
	zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
	zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
	zdfs("eph_evicted_pageframes", S_IRUGO, root,
				&zbud_eph_evicted_pageframes);
	zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
	zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
	zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
	zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
	zdfs("eph_zombie_count", S_IRUGO, root, &zbud_eph_zombie_count);
	zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
	zdfs("pers_evicted_pageframes", S_IRUGO, root,
				&zbud_pers_evicted_pageframes);
	zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
	zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
	zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
	zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
	zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
	return 0;
}
#undef zdfs
#undef zdfs64
#endif
/*
 * Protects the buddied list and all unbuddied lists; there is one lock
 * for the ephemeral lists and one for the persistent lists.
 */
static DEFINE_SPINLOCK(zbud_eph_lists_lock);
static DEFINE_SPINLOCK(zbud_pers_lists_lock);
348 struct zbud_unbuddied {
349 struct list_head list;
350 unsigned count;
/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
/* reclaim order lists; new and re-buddied pages are added at the tail */
static LIST_HEAD(zbud_eph_lru_list);
static LIST_HEAD(zbud_pers_lru_list);
/* pages currently holding two zbuds */
static LIST_HEAD(zbud_eph_buddied_list);
static LIST_HEAD(zbud_pers_buddied_list);
/* pages moved off the LRU by zbud_make_zombie_lru() */
static LIST_HEAD(zbud_eph_zombie_list);
static LIST_HEAD(zbud_pers_zombie_list);
365 * Given a struct page, transform it to a zbudpage so that it can be
366 * used by zbud and initialize fields as necessary.
368 static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
370 struct zbudpage *zbudpage = (struct zbudpage *)page;
372 BUG_ON(page == NULL);
373 INIT_LIST_HEAD(&zbudpage->budlist);
374 INIT_LIST_HEAD(&zbudpage->lru);
375 zbudpage->zbud0_size = 0;
376 zbudpage->zbud1_size = 0;
377 zbudpage->unevictable = 0;
378 if (eph)
379 zbud_eph_pageframes++;
380 else
381 zbud_pers_pageframes++;
382 return zbudpage;
385 /* "Transform" a zbudpage back to a struct page suitable to free. */
386 static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
387 bool eph)
389 struct page *page = (struct page *)zbudpage;
391 BUG_ON(!list_empty(&zbudpage->budlist));
392 BUG_ON(!list_empty(&zbudpage->lru));
393 BUG_ON(zbudpage->zbud0_size != 0);
394 BUG_ON(zbudpage->zbud1_size != 0);
395 BUG_ON(!PageLocked(page));
396 BUG_ON(zbudpage->unevictable != 0);
397 BUG_ON(zbudpage_is_dying(zbudpage));
398 BUG_ON(zbudpage_is_zombie(zbudpage));
399 if (eph)
400 zbud_eph_pageframes--;
401 else
402 zbud_pers_pageframes--;
403 zbudpage_spin_unlock(zbudpage);
404 reset_page_mapcount(page);
405 init_page_count(page);
406 page->index = 0;
407 return page;
410 /* Mark a zbud as unused and do accounting */
411 static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
412 int budnum, bool eph)
414 unsigned size;
416 BUG_ON(!zbudpage_is_locked(zbudpage));
417 if (budnum == 0) {
418 size = zbudpage->zbud0_size;
419 zbudpage->zbud0_size = 0;
420 } else {
421 size = zbudpage->zbud1_size;
422 zbudpage->zbud1_size = 0;
424 if (eph) {
425 zbud_eph_zbytes -= size;
426 zbud_eph_zpages--;
427 } else {
428 zbud_pers_zbytes -= size;
429 zbud_pers_zpages--;
434 * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
435 * to some data, set up the zbud appropriately including data copying
436 * and accounting. Note that if cdata is NULL, the data copying is
437 * skipped. (This is useful for lazy writes such as for RAMster.)
439 static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
440 bool eph, void *cdata,
441 unsigned budnum, unsigned size)
443 char *to;
444 void *zbpg;
445 struct tmem_handle *to_th;
446 unsigned nchunks = zbud_size_to_chunks(size);
448 BUG_ON(!zbudpage_is_locked(zbudpage));
449 zbpg = kmap_zbudpage_atomic(zbudpage);
450 to = zbud_data(zbpg, budnum, size);
451 to_th = (struct tmem_handle *)to;
452 to_th->index = th->index;
453 to_th->oid = th->oid;
454 to_th->pool_id = th->pool_id;
455 to_th->client_id = th->client_id;
456 to += sizeof(struct tmem_handle);
457 if (cdata != NULL)
458 memcpy(to, cdata, size - sizeof(struct tmem_handle));
459 kunmap_zbudpage_atomic(zbpg);
460 if (budnum == 0)
461 zbudpage->zbud0_size = size;
462 else
463 zbudpage->zbud1_size = size;
464 if (eph) {
465 zbud_eph_cumul_chunk_counts[nchunks]++;
466 zbud_eph_zpages++;
467 zbud_eph_cumul_zpages++;
468 zbud_eph_zbytes += size;
469 zbud_eph_cumul_zbytes += size;
470 } else {
471 zbud_pers_cumul_chunk_counts[nchunks]++;
472 zbud_pers_zpages++;
473 zbud_pers_cumul_zpages++;
474 zbud_pers_zbytes += size;
475 zbud_pers_cumul_zbytes += size;
480 * Given a locked dying zbudpage, read out the tmem handles from the data,
481 * unlock the page, then use the handles to tell tmem to flush out its
482 * references
484 static void zbud_evict_tmem(struct zbudpage *zbudpage)
486 int i, j;
487 uint32_t pool_id[2], client_id[2];
488 uint32_t index[2];
489 struct tmem_oid oid[2];
490 struct tmem_pool *pool;
491 void *zbpg;
492 struct tmem_handle *th;
493 unsigned size;
495 /* read out the tmem handles from the data and set aside */
496 zbpg = kmap_zbudpage_atomic(zbudpage);
497 for (i = 0, j = 0; i < 2; i++) {
498 size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
499 if (size) {
500 th = (struct tmem_handle *)zbud_data(zbpg, i, size);
501 client_id[j] = th->client_id;
502 pool_id[j] = th->pool_id;
503 oid[j] = th->oid;
504 index[j] = th->index;
505 j++;
506 zbud_unuse_zbud(zbudpage, i, true);
509 kunmap_zbudpage_atomic(zbpg);
510 zbudpage_spin_unlock(zbudpage);
511 /* zbudpage is now an unlocked dying... tell tmem to flush pointers */
512 for (i = 0; i < j; i++) {
513 pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
514 if (pool != NULL) {
515 tmem_flush_page(pool, &oid[i], index[i]);
516 zcache_put_pool(pool);
522 * Externally callable zbud handling routines.
526 * Return the maximum size compressed page that can be stored (secretly
527 * setting aside space for the tmem handle.
529 unsigned int zbud_max_buddy_size(void)
531 return zbud_max_size() - sizeof(struct tmem_handle);
535 * Given a zbud reference, free the corresponding zbud from all lists,
536 * mark it as unused, do accounting, and if the freeing of the zbud
537 * frees up an entire pageframe, return it to the caller (else NULL).
539 struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
540 unsigned int *zsize, unsigned int *zpages)
542 unsigned long budnum = zbudref_budnum(zref);
543 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
544 struct page *page = NULL;
545 unsigned chunks, bud_size, other_bud_size;
546 spinlock_t *lists_lock =
547 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
548 struct zbud_unbuddied *unbud =
549 eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
552 spin_lock(lists_lock);
553 zbudpage_spin_lock(zbudpage);
554 if (zbudpage_is_dying(zbudpage)) {
555 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
556 zbudpage_spin_unlock(zbudpage);
557 spin_unlock(lists_lock);
558 *zpages = 0;
559 *zsize = 0;
560 goto out;
562 if (budnum == 0) {
563 bud_size = zbudpage->zbud0_size;
564 other_bud_size = zbudpage->zbud1_size;
565 } else {
566 bud_size = zbudpage->zbud1_size;
567 other_bud_size = zbudpage->zbud0_size;
569 *zsize = bud_size - sizeof(struct tmem_handle);
570 *zpages = 1;
571 zbud_unuse_zbud(zbudpage, budnum, eph);
572 if (other_bud_size == 0) { /* was unbuddied: unlist and free */
573 chunks = zbud_size_to_chunks(bud_size) ;
574 if (zbudpage_is_zombie(zbudpage)) {
575 if (eph)
576 zbud_pers_zombie_count =
577 atomic_dec_return(&zbud_eph_zombie_atomic);
578 else
579 zbud_pers_zombie_count =
580 atomic_dec_return(&zbud_pers_zombie_atomic);
581 zbudpage_clear_zombie(zbudpage);
582 } else {
583 BUG_ON(list_empty(&unbud[chunks].list));
584 list_del_init(&zbudpage->budlist);
585 unbud[chunks].count--;
587 list_del_init(&zbudpage->lru);
588 spin_unlock(lists_lock);
589 if (eph)
590 zbud_eph_unbuddied_count--;
591 else
592 zbud_pers_unbuddied_count--;
593 page = zbud_unuse_zbudpage(zbudpage, eph);
594 } else { /* was buddied: move remaining buddy to unbuddied list */
595 chunks = zbud_size_to_chunks(other_bud_size) ;
596 if (!zbudpage_is_zombie(zbudpage)) {
597 list_del_init(&zbudpage->budlist);
598 list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
599 unbud[chunks].count++;
601 if (eph) {
602 zbud_eph_buddied_count--;
603 zbud_eph_unbuddied_count++;
604 } else {
605 zbud_pers_unbuddied_count++;
606 zbud_pers_buddied_count--;
608 /* don't mess with lru, no need to move it */
609 zbudpage_spin_unlock(zbudpage);
610 spin_unlock(lists_lock);
612 out:
613 return page;
617 * Given a tmem handle, and a kmapped pointer to compressed data of
618 * the given size, try to find an unbuddied zbudpage in which to
619 * create a zbud. If found, put it there, mark the zbudpage unevictable,
620 * and return a zbudref to it. Else return NULL.
622 struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
623 void *cdata, unsigned size)
625 struct zbudpage *zbudpage = NULL, *zbudpage2;
626 unsigned long budnum = 0UL;
627 unsigned nchunks;
628 int i, found_good_buddy = 0;
629 spinlock_t *lists_lock =
630 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
631 struct zbud_unbuddied *unbud =
632 eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
634 size += sizeof(struct tmem_handle);
635 nchunks = zbud_size_to_chunks(size);
636 for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
637 spin_lock(lists_lock);
638 if (!list_empty(&unbud[i].list)) {
639 list_for_each_entry_safe(zbudpage, zbudpage2,
640 &unbud[i].list, budlist) {
641 if (zbudpage_spin_trylock(zbudpage)) {
642 found_good_buddy = i;
643 goto found_unbuddied;
647 spin_unlock(lists_lock);
649 zbudpage = NULL;
650 goto out;
652 found_unbuddied:
653 BUG_ON(!zbudpage_is_locked(zbudpage));
654 BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
655 if (zbudpage->zbud0_size == 0)
656 budnum = 0UL;
657 else if (zbudpage->zbud1_size == 0)
658 budnum = 1UL;
659 list_del_init(&zbudpage->budlist);
660 if (eph) {
661 list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
662 unbud[found_good_buddy].count--;
663 zbud_eph_unbuddied_count--;
664 zbud_eph_buddied_count++;
665 /* "promote" raw zbudpage to most-recently-used */
666 list_del_init(&zbudpage->lru);
667 list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
668 } else {
669 list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
670 unbud[found_good_buddy].count--;
671 zbud_pers_unbuddied_count--;
672 zbud_pers_buddied_count++;
673 /* "promote" raw zbudpage to most-recently-used */
674 list_del_init(&zbudpage->lru);
675 list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
677 zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
678 zbudpage->unevictable++;
679 BUG_ON(zbudpage->unevictable == 3);
680 zbudpage_spin_unlock(zbudpage);
681 spin_unlock(lists_lock);
682 out:
683 return zbudpage_to_zbudref(zbudpage, budnum);
688 * Given a tmem handle, and a kmapped pointer to compressed data of
689 * the given size, and a newly allocated struct page, create an unevictable
690 * zbud in that new page and return a zbudref to it.
692 struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
693 void *cdata, unsigned size,
694 struct page *newpage)
696 struct zbudpage *zbudpage;
697 unsigned long budnum = 0;
698 unsigned nchunks;
699 spinlock_t *lists_lock =
700 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
701 struct zbud_unbuddied *unbud =
702 eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
704 #if 0
705 /* this may be worth it later to support decompress-in-place? */
706 static unsigned long counter;
707 budnum = counter++ & 1; /* alternate using zbud0 and zbud1 */
708 #endif
710 if (size > zbud_max_buddy_size())
711 return NULL;
712 if (newpage == NULL)
713 return NULL;
715 size += sizeof(struct tmem_handle);
716 nchunks = zbud_size_to_chunks(size) ;
717 spin_lock(lists_lock);
718 zbudpage = zbud_init_zbudpage(newpage, eph);
719 zbudpage_spin_lock(zbudpage);
720 list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
721 if (eph) {
722 list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
723 zbud_eph_unbuddied_count++;
724 } else {
725 list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
726 zbud_pers_unbuddied_count++;
728 unbud[nchunks].count++;
729 zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
730 zbudpage->unevictable++;
731 BUG_ON(zbudpage->unevictable == 3);
732 zbudpage_spin_unlock(zbudpage);
733 spin_unlock(lists_lock);
734 return zbudpage_to_zbudref(zbudpage, budnum);
738 * Finish creation of a zbud by, assuming another zbud isn't being created
739 * in parallel, marking it evictable.
741 void zbud_create_finish(struct zbudref *zref, bool eph)
743 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
744 spinlock_t *lists_lock =
745 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
747 spin_lock(lists_lock);
748 zbudpage_spin_lock(zbudpage);
749 BUG_ON(zbudpage_is_dying(zbudpage));
750 zbudpage->unevictable--;
751 BUG_ON((int)zbudpage->unevictable < 0);
752 zbudpage_spin_unlock(zbudpage);
753 spin_unlock(lists_lock);
757 * Given a zbudref and a struct page, decompress the data from
758 * the zbud into the physical page represented by the struct page
759 * by upcalling to zcache_decompress
761 int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
762 void (*decompress)(char *, unsigned int, char *))
764 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
765 unsigned long budnum = zbudref_budnum(zref);
766 void *zbpg;
767 char *to_va, *from_va;
768 unsigned size;
769 int ret = -1;
770 spinlock_t *lists_lock =
771 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
773 spin_lock(lists_lock);
774 zbudpage_spin_lock(zbudpage);
775 if (zbudpage_is_dying(zbudpage)) {
776 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
777 goto out;
779 zbpg = kmap_zbudpage_atomic(zbudpage);
780 to_va = kmap_atomic(data_page);
781 if (budnum == 0)
782 size = zbudpage->zbud0_size;
783 else
784 size = zbudpage->zbud1_size;
785 BUG_ON(size == 0 || size > zbud_max_size());
786 from_va = zbud_data(zbpg, budnum, size);
787 from_va += sizeof(struct tmem_handle);
788 size -= sizeof(struct tmem_handle);
789 decompress(from_va, size, to_va);
790 kunmap_atomic(to_va);
791 kunmap_zbudpage_atomic(zbpg);
792 ret = 0;
793 out:
794 zbudpage_spin_unlock(zbudpage);
795 spin_unlock(lists_lock);
796 return ret;
800 * Given a zbudref and a kernel pointer, copy the data from
801 * the zbud to the kernel pointer.
803 int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
804 size_t *sizep, bool eph)
806 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
807 unsigned long budnum = zbudref_budnum(zref);
808 void *zbpg;
809 char *from_va;
810 unsigned size;
811 int ret = -1;
812 spinlock_t *lists_lock =
813 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
815 spin_lock(lists_lock);
816 zbudpage_spin_lock(zbudpage);
817 if (zbudpage_is_dying(zbudpage)) {
818 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
819 goto out;
821 zbpg = kmap_zbudpage_atomic(zbudpage);
822 if (budnum == 0)
823 size = zbudpage->zbud0_size;
824 else
825 size = zbudpage->zbud1_size;
826 BUG_ON(size == 0 || size > zbud_max_size());
827 from_va = zbud_data(zbpg, budnum, size);
828 from_va += sizeof(struct tmem_handle);
829 size -= sizeof(struct tmem_handle);
830 *sizep = size;
831 memcpy(to_va, from_va, size);
833 kunmap_zbudpage_atomic(zbpg);
834 ret = 0;
835 out:
836 zbudpage_spin_unlock(zbudpage);
837 spin_unlock(lists_lock);
838 return ret;
842 * Given a zbudref and a kernel pointer, copy the data from
843 * the kernel pointer to the zbud.
845 int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
847 struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
848 unsigned long budnum = zbudref_budnum(zref);
849 void *zbpg;
850 char *to_va;
851 unsigned size;
852 int ret = -1;
853 spinlock_t *lists_lock =
854 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
856 spin_lock(lists_lock);
857 zbudpage_spin_lock(zbudpage);
858 if (zbudpage_is_dying(zbudpage)) {
859 /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
860 goto out;
862 zbpg = kmap_zbudpage_atomic(zbudpage);
863 if (budnum == 0)
864 size = zbudpage->zbud0_size;
865 else
866 size = zbudpage->zbud1_size;
867 BUG_ON(size == 0 || size > zbud_max_size());
868 to_va = zbud_data(zbpg, budnum, size);
869 to_va += sizeof(struct tmem_handle);
870 size -= sizeof(struct tmem_handle);
871 memcpy(to_va, from_va, size);
873 kunmap_zbudpage_atomic(zbpg);
874 ret = 0;
875 out:
876 zbudpage_spin_unlock(zbudpage);
877 spin_unlock(lists_lock);
878 return ret;
882 * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
883 * there are no references to it remaining, and return the now unused
884 * (and re-init'ed) struct page and the total amount of compressed
885 * data that was evicted.
887 struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
889 struct zbudpage *zbudpage = NULL, *zbudpage2;
890 struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
891 struct page *page = NULL;
892 bool irqs_disabled = irqs_disabled();
895 * Since this can be called indirectly from cleancache_put, which
896 * has interrupts disabled, as well as frontswap_put, which does not,
897 * we need to be able to handle both cases, even though it is ugly.
899 if (irqs_disabled)
900 spin_lock(&zbud_eph_lists_lock);
901 else
902 spin_lock_bh(&zbud_eph_lists_lock);
903 *zsize = 0;
904 if (list_empty(&zbud_eph_lru_list))
905 goto unlock_out;
906 list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
907 /* skip a locked zbudpage */
908 if (unlikely(!zbudpage_spin_trylock(zbudpage)))
909 continue;
910 /* skip an unevictable zbudpage */
911 if (unlikely(zbudpage->unevictable != 0)) {
912 zbudpage_spin_unlock(zbudpage);
913 continue;
915 /* got a locked evictable page */
916 goto evict_page;
919 unlock_out:
920 /* no unlocked evictable pages, give up */
921 if (irqs_disabled)
922 spin_unlock(&zbud_eph_lists_lock);
923 else
924 spin_unlock_bh(&zbud_eph_lists_lock);
925 goto out;
927 evict_page:
928 list_del_init(&zbudpage->budlist);
929 list_del_init(&zbudpage->lru);
930 zbudpage_set_dying(zbudpage);
932 * the zbudpage is now "dying" and attempts to read, write,
933 * or delete data from it will be ignored
935 if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
936 *zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
937 (2 * sizeof(struct tmem_handle));
938 *zpages = 2;
939 } else if (zbudpage->zbud0_size != 0) {
940 unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
941 *zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
942 *zpages = 1;
943 } else if (zbudpage->zbud1_size != 0) {
944 unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
945 *zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
946 *zpages = 1;
947 } else {
948 BUG();
950 spin_unlock(&zbud_eph_lists_lock);
951 zbud_eph_evicted_pageframes++;
952 if (*zpages == 1)
953 zbud_eph_unbuddied_count--;
954 else
955 zbud_eph_buddied_count--;
956 zbud_evict_tmem(zbudpage);
957 zbudpage_spin_lock(zbudpage);
958 zbudpage_clear_dying(zbudpage);
959 page = zbud_unuse_zbudpage(zbudpage, true);
960 if (!irqs_disabled)
961 local_bh_enable();
962 out:
963 return page;
967 * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
968 * read the tmem_handle(s) out of it into the passed array, and return the
969 * number of zbuds. Caller must perform necessary tmem functions and,
970 * indirectly, zbud functions to fetch any valid data and cause the
971 * now-zombified zbudpage to eventually be freed. We track the zombified
972 * zbudpage count so it is possible to observe if there is a leak.
973 FIXME: describe (ramster) case where data pointers are passed in for memcpy
975 unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
976 unsigned int *zsize, bool eph)
978 struct zbudpage *zbudpage = NULL, *zbudpag2;
979 struct tmem_handle *thfrom;
980 char *from_va;
981 void *zbpg;
982 unsigned size;
983 int ret = 0, i;
984 spinlock_t *lists_lock =
985 eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
986 struct list_head *lru_list =
987 eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;
989 spin_lock_bh(lists_lock);
990 if (list_empty(lru_list))
991 goto out;
992 list_for_each_entry_safe(zbudpage, zbudpag2, lru_list, lru) {
993 /* skip a locked zbudpage */
994 if (unlikely(!zbudpage_spin_trylock(zbudpage)))
995 continue;
996 /* skip an unevictable zbudpage */
997 if (unlikely(zbudpage->unevictable != 0)) {
998 zbudpage_spin_unlock(zbudpage);
999 continue;
1001 /* got a locked evictable page */
1002 goto zombify_page;
1004 /* no unlocked evictable pages, give up */
1005 goto out;
1007 zombify_page:
1008 /* got an unlocked evictable page, zombify it */
1009 list_del_init(&zbudpage->budlist);
1010 zbudpage_set_zombie(zbudpage);
1011 /* FIXME what accounting do I need to do here? */
1012 list_del_init(&zbudpage->lru);
1013 if (eph) {
1014 list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
1015 zbud_eph_zombie_count =
1016 atomic_inc_return(&zbud_eph_zombie_atomic);
1017 } else {
1018 list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
1019 zbud_pers_zombie_count =
1020 atomic_inc_return(&zbud_pers_zombie_atomic);
1022 /* FIXME what accounting do I need to do here? */
1023 zbpg = kmap_zbudpage_atomic(zbudpage);
1024 for (i = 0; i < 2; i++) {
1025 size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
1026 if (size) {
1027 from_va = zbud_data(zbpg, i, size);
1028 thfrom = (struct tmem_handle *)from_va;
1029 from_va += sizeof(struct tmem_handle);
1030 size -= sizeof(struct tmem_handle);
1031 if (th != NULL)
1032 th[ret] = *thfrom;
1033 if (data != NULL)
1034 memcpy(data[ret], from_va, size);
1035 if (zsize != NULL)
1036 *zsize++ = size;
1037 ret++;
1040 kunmap_zbudpage_atomic(zbpg);
1041 zbudpage_spin_unlock(zbudpage);
1042 out:
1043 spin_unlock_bh(lists_lock);
1044 return ret;
1047 void __init zbud_init(void)
1049 int i;
1051 #ifdef CONFIG_DEBUG_FS
1052 zbud_debugfs_init();
1053 #endif
1054 BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
1055 BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
1056 for (i = 0; i < NCHUNKS; i++) {
1057 INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
1058 INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);