4 * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
5 * Copyright (c) 2010,2011, Nitin Gupta
7 * Zcache provides an in-kernel "host implementation" for transcendent memory
8 * and, thus indirectly, for cleancache and frontswap. Zcache includes two
9 * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
10 * 1) "compression buddies" ("zbud") is used for ephemeral pages
11 * 2) xvmalloc is used for persistent pages.
12 * Xvmalloc (based on the TLSF allocator) has very low fragmentation
13 * so maximizes space efficiency, while zbud allows pairs (and potentially,
14 * in the future, more than a pair of) compressed pages to be closely linked
15 * so that reclaiming can be done via the kernel's physical-page-oriented
16 * "shrinker" interface.
18 * [1] For a definition of page-accessible memory (aka PAM), see:
19 * http://marc.info/?l=linux-mm&m=127811271605009
22 #include <linux/module.h>
23 #include <linux/cpu.h>
24 #include <linux/highmem.h>
25 #include <linux/list.h>
26 #include <linux/lzo.h>
27 #include <linux/slab.h>
28 #include <linux/spinlock.h>
29 #include <linux/types.h>
30 #include <linux/atomic.h>
31 #include <linux/math64.h>
34 #include "../zram/xvmalloc.h" /* if built in drivers/staging */
36 #if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
37 #error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
39 #ifdef CONFIG_CLEANCACHE
40 #include <linux/cleancache.h>
42 #ifdef CONFIG_FRONTSWAP
43 #include <linux/frontswap.h>
47 /* this is more aggressive but may cause other problems? */
48 #define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
50 #define ZCACHE_GFP_MASK \
51 (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
54 #define MAX_POOLS_PER_CLIENT 16
56 #define MAX_CLIENTS 16
57 #define LOCAL_CLIENT ((uint16_t)-1)
59 MODULE_LICENSE("GPL");
61 struct zcache_client
{
62 struct tmem_pool
*tmem_pools
[MAX_POOLS_PER_CLIENT
];
63 struct xv_pool
*xvpool
;
68 static struct zcache_client zcache_host
;
69 static struct zcache_client zcache_clients
[MAX_CLIENTS
];
71 static inline uint16_t get_client_id_from_client(struct zcache_client
*cli
)
74 if (cli
== &zcache_host
)
76 return cli
- &zcache_clients
[0];
79 static inline bool is_local_client(struct zcache_client
*cli
)
81 return cli
== &zcache_host
;
85 * Compression buddies ("zbud") provides for packing two (or, possibly
86 * in the future, more) compressed ephemeral pages into a single "raw"
87 * (physical) page and tracking them with data structures so that
88 * the raw pages can be easily reclaimed.
90 * A zbud page ("zbpg") is an aligned page containing a list_head,
91 * a lock, and two "zbud headers". The remainder of the physical
92 * page is divided up into aligned 64-byte "chunks" which contain
93 * the compressed data for zero, one, or two zbuds. Each zbpg
94 * resides on: (1) an "unused list" if it has no zbuds; (2) a
95 * "buddied" list if it is fully populated with two zbuds; or
96 * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
97 * the one unbuddied zbud uses. The data inside a zbpg cannot be
98 * read or written unless the zbpg's lock is held.
101 #define ZBH_SENTINEL 0x43214321
102 #define ZBPG_SENTINEL 0xdeadbeef
104 #define ZBUD_MAX_BUDS 2
111 uint16_t size
; /* compressed size in bytes, zero means unused */
116 struct list_head bud_list
;
118 struct zbud_hdr buddy
[ZBUD_MAX_BUDS
];
120 /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
123 #define CHUNK_SHIFT 6
124 #define CHUNK_SIZE (1 << CHUNK_SHIFT)
125 #define CHUNK_MASK (~(CHUNK_SIZE-1))
126 #define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \
127 CHUNK_MASK) >> CHUNK_SHIFT)
128 #define MAX_CHUNK (NCHUNKS-1)
131 struct list_head list
;
133 } zbud_unbuddied
[NCHUNKS
];
134 /* list N contains pages with N chunks USED and NCHUNKS-N unused */
135 /* element 0 is never used but optimizing that isn't worth it */
136 static unsigned long zbud_cumul_chunk_counts
[NCHUNKS
];
138 struct list_head zbud_buddied_list
;
139 static unsigned long zcache_zbud_buddied_count
;
141 /* protects the buddied list and all unbuddied lists */
142 static DEFINE_SPINLOCK(zbud_budlists_spinlock
);
144 static LIST_HEAD(zbpg_unused_list
);
145 static unsigned long zcache_zbpg_unused_list_count
;
147 /* protects the unused page list */
148 static DEFINE_SPINLOCK(zbpg_unused_list_spinlock
);
150 static atomic_t zcache_zbud_curr_raw_pages
;
151 static atomic_t zcache_zbud_curr_zpages
;
152 static unsigned long zcache_zbud_curr_zbytes
;
153 static unsigned long zcache_zbud_cumul_zpages
;
154 static unsigned long zcache_zbud_cumul_zbytes
;
155 static unsigned long zcache_compress_poor
;
156 static unsigned long zcache_mean_compress_poor
;
158 /* forward references */
159 static void *zcache_get_free_page(void);
160 static void zcache_free_page(void *p
);
163 * zbud helper functions
166 static inline unsigned zbud_max_buddy_size(void)
168 return MAX_CHUNK
<< CHUNK_SHIFT
;
171 static inline unsigned zbud_size_to_chunks(unsigned size
)
173 BUG_ON(size
== 0 || size
> zbud_max_buddy_size());
174 return (size
+ CHUNK_SIZE
- 1) >> CHUNK_SHIFT
;
177 static inline int zbud_budnum(struct zbud_hdr
*zh
)
179 unsigned offset
= (unsigned long)zh
& (PAGE_SIZE
- 1);
180 struct zbud_page
*zbpg
= NULL
;
181 unsigned budnum
= -1U;
184 for (i
= 0; i
< ZBUD_MAX_BUDS
; i
++)
185 if (offset
== offsetof(typeof(*zbpg
), buddy
[i
])) {
189 BUG_ON(budnum
== -1U);
193 static char *zbud_data(struct zbud_hdr
*zh
, unsigned size
)
195 struct zbud_page
*zbpg
;
199 ASSERT_SENTINEL(zh
, ZBH
);
200 budnum
= zbud_budnum(zh
);
201 BUG_ON(size
== 0 || size
> zbud_max_buddy_size());
202 zbpg
= container_of(zh
, struct zbud_page
, buddy
[budnum
]);
203 ASSERT_SPINLOCK(&zbpg
->lock
);
206 p
+= ((sizeof(struct zbud_page
) + CHUNK_SIZE
- 1) &
208 else if (budnum
== 1)
209 p
+= PAGE_SIZE
- ((size
+ CHUNK_SIZE
- 1) & CHUNK_MASK
);
214 * zbud raw page management
217 static struct zbud_page
*zbud_alloc_raw_page(void)
219 struct zbud_page
*zbpg
= NULL
;
220 struct zbud_hdr
*zh0
, *zh1
;
223 /* if any pages on the zbpg list, use one */
224 spin_lock(&zbpg_unused_list_spinlock
);
225 if (!list_empty(&zbpg_unused_list
)) {
226 zbpg
= list_first_entry(&zbpg_unused_list
,
227 struct zbud_page
, bud_list
);
228 list_del_init(&zbpg
->bud_list
);
229 zcache_zbpg_unused_list_count
--;
232 spin_unlock(&zbpg_unused_list_spinlock
);
234 /* none on zbpg list, try to get a kernel page */
235 zbpg
= zcache_get_free_page();
236 if (likely(zbpg
!= NULL
)) {
237 INIT_LIST_HEAD(&zbpg
->bud_list
);
238 zh0
= &zbpg
->buddy
[0]; zh1
= &zbpg
->buddy
[1];
239 spin_lock_init(&zbpg
->lock
);
241 ASSERT_INVERTED_SENTINEL(zbpg
, ZBPG
);
242 SET_SENTINEL(zbpg
, ZBPG
);
243 BUG_ON(zh0
->size
!= 0 || tmem_oid_valid(&zh0
->oid
));
244 BUG_ON(zh1
->size
!= 0 || tmem_oid_valid(&zh1
->oid
));
246 atomic_inc(&zcache_zbud_curr_raw_pages
);
247 INIT_LIST_HEAD(&zbpg
->bud_list
);
248 SET_SENTINEL(zbpg
, ZBPG
);
249 zh0
->size
= 0; zh1
->size
= 0;
250 tmem_oid_set_invalid(&zh0
->oid
);
251 tmem_oid_set_invalid(&zh1
->oid
);
257 static void zbud_free_raw_page(struct zbud_page
*zbpg
)
259 struct zbud_hdr
*zh0
= &zbpg
->buddy
[0], *zh1
= &zbpg
->buddy
[1];
261 ASSERT_SENTINEL(zbpg
, ZBPG
);
262 BUG_ON(!list_empty(&zbpg
->bud_list
));
263 ASSERT_SPINLOCK(&zbpg
->lock
);
264 BUG_ON(zh0
->size
!= 0 || tmem_oid_valid(&zh0
->oid
));
265 BUG_ON(zh1
->size
!= 0 || tmem_oid_valid(&zh1
->oid
));
266 INVERT_SENTINEL(zbpg
, ZBPG
);
267 spin_unlock(&zbpg
->lock
);
268 spin_lock(&zbpg_unused_list_spinlock
);
269 list_add(&zbpg
->bud_list
, &zbpg_unused_list
);
270 zcache_zbpg_unused_list_count
++;
271 spin_unlock(&zbpg_unused_list_spinlock
);
275 * core zbud handling routines
278 static unsigned zbud_free(struct zbud_hdr
*zh
)
282 ASSERT_SENTINEL(zh
, ZBH
);
283 BUG_ON(!tmem_oid_valid(&zh
->oid
));
285 BUG_ON(zh
->size
== 0 || zh
->size
> zbud_max_buddy_size());
287 tmem_oid_set_invalid(&zh
->oid
);
288 INVERT_SENTINEL(zh
, ZBH
);
289 zcache_zbud_curr_zbytes
-= size
;
290 atomic_dec(&zcache_zbud_curr_zpages
);
294 static void zbud_free_and_delist(struct zbud_hdr
*zh
)
297 struct zbud_hdr
*zh_other
;
298 unsigned budnum
= zbud_budnum(zh
), size
;
299 struct zbud_page
*zbpg
=
300 container_of(zh
, struct zbud_page
, buddy
[budnum
]);
302 spin_lock(&zbud_budlists_spinlock
);
303 spin_lock(&zbpg
->lock
);
304 if (list_empty(&zbpg
->bud_list
)) {
305 /* ignore zombie page... see zbud_evict_pages() */
306 spin_unlock(&zbpg
->lock
);
307 spin_unlock(&zbud_budlists_spinlock
);
310 size
= zbud_free(zh
);
311 ASSERT_SPINLOCK(&zbpg
->lock
);
312 zh_other
= &zbpg
->buddy
[(budnum
== 0) ? 1 : 0];
313 if (zh_other
->size
== 0) { /* was unbuddied: unlist and free */
314 chunks
= zbud_size_to_chunks(size
) ;
315 BUG_ON(list_empty(&zbud_unbuddied
[chunks
].list
));
316 list_del_init(&zbpg
->bud_list
);
317 zbud_unbuddied
[chunks
].count
--;
318 spin_unlock(&zbud_budlists_spinlock
);
319 zbud_free_raw_page(zbpg
);
320 } else { /* was buddied: move remaining buddy to unbuddied list */
321 chunks
= zbud_size_to_chunks(zh_other
->size
) ;
322 list_del_init(&zbpg
->bud_list
);
323 zcache_zbud_buddied_count
--;
324 list_add_tail(&zbpg
->bud_list
, &zbud_unbuddied
[chunks
].list
);
325 zbud_unbuddied
[chunks
].count
++;
326 spin_unlock(&zbud_budlists_spinlock
);
327 spin_unlock(&zbpg
->lock
);
331 static struct zbud_hdr
*zbud_create(uint16_t client_id
, uint16_t pool_id
,
332 struct tmem_oid
*oid
,
333 uint32_t index
, struct page
*page
,
334 void *cdata
, unsigned size
)
336 struct zbud_hdr
*zh0
, *zh1
, *zh
= NULL
;
337 struct zbud_page
*zbpg
= NULL
, *ztmp
;
340 int i
, found_good_buddy
= 0;
342 nchunks
= zbud_size_to_chunks(size
) ;
343 for (i
= MAX_CHUNK
- nchunks
+ 1; i
> 0; i
--) {
344 spin_lock(&zbud_budlists_spinlock
);
345 if (!list_empty(&zbud_unbuddied
[i
].list
)) {
346 list_for_each_entry_safe(zbpg
, ztmp
,
347 &zbud_unbuddied
[i
].list
, bud_list
) {
348 if (spin_trylock(&zbpg
->lock
)) {
349 found_good_buddy
= i
;
350 goto found_unbuddied
;
354 spin_unlock(&zbud_budlists_spinlock
);
356 /* didn't find a good buddy, try allocating a new page */
357 zbpg
= zbud_alloc_raw_page();
358 if (unlikely(zbpg
== NULL
))
360 /* ok, have a page, now compress the data before taking locks */
361 spin_lock(&zbud_budlists_spinlock
);
362 spin_lock(&zbpg
->lock
);
363 list_add_tail(&zbpg
->bud_list
, &zbud_unbuddied
[nchunks
].list
);
364 zbud_unbuddied
[nchunks
].count
++;
365 zh
= &zbpg
->buddy
[0];
369 ASSERT_SPINLOCK(&zbpg
->lock
);
370 zh0
= &zbpg
->buddy
[0]; zh1
= &zbpg
->buddy
[1];
371 BUG_ON(!((zh0
->size
== 0) ^ (zh1
->size
== 0)));
372 if (zh0
->size
!= 0) { /* buddy0 in use, buddy1 is vacant */
373 ASSERT_SENTINEL(zh0
, ZBH
);
375 } else if (zh1
->size
!= 0) { /* buddy1 in use, buddy0 is vacant */
376 ASSERT_SENTINEL(zh1
, ZBH
);
380 list_del_init(&zbpg
->bud_list
);
381 zbud_unbuddied
[found_good_buddy
].count
--;
382 list_add_tail(&zbpg
->bud_list
, &zbud_buddied_list
);
383 zcache_zbud_buddied_count
++;
386 SET_SENTINEL(zh
, ZBH
);
390 zh
->pool_id
= pool_id
;
391 zh
->client_id
= client_id
;
392 to
= zbud_data(zh
, size
);
393 memcpy(to
, cdata
, size
);
394 spin_unlock(&zbpg
->lock
);
395 spin_unlock(&zbud_budlists_spinlock
);
397 zbud_cumul_chunk_counts
[nchunks
]++;
398 atomic_inc(&zcache_zbud_curr_zpages
);
399 zcache_zbud_cumul_zpages
++;
400 zcache_zbud_curr_zbytes
+= size
;
401 zcache_zbud_cumul_zbytes
+= size
;
406 static int zbud_decompress(struct page
*page
, struct zbud_hdr
*zh
)
408 struct zbud_page
*zbpg
;
409 unsigned budnum
= zbud_budnum(zh
);
410 size_t out_len
= PAGE_SIZE
;
411 char *to_va
, *from_va
;
415 zbpg
= container_of(zh
, struct zbud_page
, buddy
[budnum
]);
416 spin_lock(&zbpg
->lock
);
417 if (list_empty(&zbpg
->bud_list
)) {
418 /* ignore zombie page... see zbud_evict_pages() */
422 ASSERT_SENTINEL(zh
, ZBH
);
423 BUG_ON(zh
->size
== 0 || zh
->size
> zbud_max_buddy_size());
424 to_va
= kmap_atomic(page
, KM_USER0
);
426 from_va
= zbud_data(zh
, size
);
427 ret
= lzo1x_decompress_safe(from_va
, size
, to_va
, &out_len
);
428 BUG_ON(ret
!= LZO_E_OK
);
429 BUG_ON(out_len
!= PAGE_SIZE
);
430 kunmap_atomic(to_va
, KM_USER0
);
432 spin_unlock(&zbpg
->lock
);
437 * The following routines handle shrinking of ephemeral pages by evicting
438 * pages "least valuable" first.
441 static unsigned long zcache_evicted_raw_pages
;
442 static unsigned long zcache_evicted_buddied_pages
;
443 static unsigned long zcache_evicted_unbuddied_pages
;
445 static struct tmem_pool
*zcache_get_pool_by_id(uint16_t cli_id
,
447 static void zcache_put_pool(struct tmem_pool
*pool
);
450 * Flush and free all zbuds in a zbpg, then free the pageframe
452 static void zbud_evict_zbpg(struct zbud_page
*zbpg
)
456 uint32_t pool_id
[ZBUD_MAX_BUDS
], client_id
[ZBUD_MAX_BUDS
];
457 uint32_t index
[ZBUD_MAX_BUDS
];
458 struct tmem_oid oid
[ZBUD_MAX_BUDS
];
459 struct tmem_pool
*pool
;
461 ASSERT_SPINLOCK(&zbpg
->lock
);
462 BUG_ON(!list_empty(&zbpg
->bud_list
));
463 for (i
= 0, j
= 0; i
< ZBUD_MAX_BUDS
; i
++) {
464 zh
= &zbpg
->buddy
[i
];
466 client_id
[j
] = zh
->client_id
;
467 pool_id
[j
] = zh
->pool_id
;
469 index
[j
] = zh
->index
;
474 spin_unlock(&zbpg
->lock
);
475 for (i
= 0; i
< j
; i
++) {
476 pool
= zcache_get_pool_by_id(client_id
[i
], pool_id
[i
]);
478 tmem_flush_page(pool
, &oid
[i
], index
[i
]);
479 zcache_put_pool(pool
);
482 ASSERT_SENTINEL(zbpg
, ZBPG
);
483 spin_lock(&zbpg
->lock
);
484 zbud_free_raw_page(zbpg
);
488 * Free nr pages. This code is funky because we want to hold the locks
489 * protecting various lists for as short a time as possible, and in some
490 * circumstances the list may change asynchronously when the list lock is
491 * not held. In some cases we also trylock not only to avoid waiting on a
492 * page in use by another cpu, but also to avoid potential deadlock due to
495 static void zbud_evict_pages(int nr
)
497 struct zbud_page
*zbpg
;
500 /* first try freeing any pages on unused list */
502 spin_lock_bh(&zbpg_unused_list_spinlock
);
503 if (!list_empty(&zbpg_unused_list
)) {
504 /* can't walk list here, since it may change when unlocked */
505 zbpg
= list_first_entry(&zbpg_unused_list
,
506 struct zbud_page
, bud_list
);
507 list_del_init(&zbpg
->bud_list
);
508 zcache_zbpg_unused_list_count
--;
509 atomic_dec(&zcache_zbud_curr_raw_pages
);
510 spin_unlock_bh(&zbpg_unused_list_spinlock
);
511 zcache_free_page(zbpg
);
512 zcache_evicted_raw_pages
++;
515 goto retry_unused_list
;
517 spin_unlock_bh(&zbpg_unused_list_spinlock
);
519 /* now try freeing unbuddied pages, starting with least space avail */
520 for (i
= 0; i
< MAX_CHUNK
; i
++) {
522 spin_lock_bh(&zbud_budlists_spinlock
);
523 if (list_empty(&zbud_unbuddied
[i
].list
)) {
524 spin_unlock_bh(&zbud_budlists_spinlock
);
527 list_for_each_entry(zbpg
, &zbud_unbuddied
[i
].list
, bud_list
) {
528 if (unlikely(!spin_trylock(&zbpg
->lock
)))
530 list_del_init(&zbpg
->bud_list
);
531 zbud_unbuddied
[i
].count
--;
532 spin_unlock(&zbud_budlists_spinlock
);
533 zcache_evicted_unbuddied_pages
++;
534 /* want budlists unlocked when doing zbpg eviction */
535 zbud_evict_zbpg(zbpg
);
539 goto retry_unbud_list_i
;
541 spin_unlock_bh(&zbud_budlists_spinlock
);
544 /* as a last resort, free buddied pages */
546 spin_lock_bh(&zbud_budlists_spinlock
);
547 if (list_empty(&zbud_buddied_list
)) {
548 spin_unlock_bh(&zbud_budlists_spinlock
);
551 list_for_each_entry(zbpg
, &zbud_buddied_list
, bud_list
) {
552 if (unlikely(!spin_trylock(&zbpg
->lock
)))
554 list_del_init(&zbpg
->bud_list
);
555 zcache_zbud_buddied_count
--;
556 spin_unlock(&zbud_budlists_spinlock
);
557 zcache_evicted_buddied_pages
++;
558 /* want budlists unlocked when doing zbpg eviction */
559 zbud_evict_zbpg(zbpg
);
565 spin_unlock_bh(&zbud_budlists_spinlock
);
570 static void zbud_init(void)
574 INIT_LIST_HEAD(&zbud_buddied_list
);
575 zcache_zbud_buddied_count
= 0;
576 for (i
= 0; i
< NCHUNKS
; i
++) {
577 INIT_LIST_HEAD(&zbud_unbuddied
[i
].list
);
578 zbud_unbuddied
[i
].count
= 0;
584 * These sysfs routines show a nice distribution of how many zbpg's are
585 * currently (and have ever been placed) in each unbuddied list. It's fun
586 * to watch but can probably go away before final merge.
588 static int zbud_show_unbuddied_list_counts(char *buf
)
593 for (i
= 0; i
< NCHUNKS
; i
++)
594 p
+= sprintf(p
, "%u ", zbud_unbuddied
[i
].count
);
598 static int zbud_show_cumul_chunk_counts(char *buf
)
600 unsigned long i
, chunks
= 0, total_chunks
= 0, sum_total_chunks
= 0;
601 unsigned long total_chunks_lte_21
= 0, total_chunks_lte_32
= 0;
602 unsigned long total_chunks_lte_42
= 0;
605 for (i
= 0; i
< NCHUNKS
; i
++) {
606 p
+= sprintf(p
, "%lu ", zbud_cumul_chunk_counts
[i
]);
607 chunks
+= zbud_cumul_chunk_counts
[i
];
608 total_chunks
+= zbud_cumul_chunk_counts
[i
];
609 sum_total_chunks
+= i
* zbud_cumul_chunk_counts
[i
];
611 total_chunks_lte_21
= total_chunks
;
613 total_chunks_lte_32
= total_chunks
;
615 total_chunks_lte_42
= total_chunks
;
617 p
+= sprintf(p
, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
618 total_chunks_lte_21
, total_chunks_lte_32
, total_chunks_lte_42
,
619 chunks
== 0 ? 0 : sum_total_chunks
/ chunks
);
625 * This "zv" PAM implementation combines the TLSF-based xvMalloc
626 * with lzo1x compression to maximize the amount of data that can
627 * be packed into a physical page.
629 * Zv represents a PAM page with the index and object (plus a "size" value
630 * necessary for decompression) immediately preceding the compressed data.
633 #define ZVH_SENTINEL 0x43214321
642 /* rudimentary policy limits */
643 /* total number of persistent pages may not exceed this percentage */
644 static unsigned int zv_page_count_policy_percent
= 75;
646 * byte count defining poor compression; pages with greater zsize will be
649 static unsigned int zv_max_zsize
= (PAGE_SIZE
/ 8) * 7;
651 * byte count defining poor *mean* compression; pages with greater zsize
652 * will be rejected until sufficient better-compressed pages are accepted
653 * driving the mean below this threshold
655 static unsigned int zv_max_mean_zsize
= (PAGE_SIZE
/ 8) * 5;
657 static atomic_t zv_curr_dist_counts
[NCHUNKS
];
658 static atomic_t zv_cumul_dist_counts
[NCHUNKS
];
660 static struct zv_hdr
*zv_create(struct xv_pool
*xvpool
, uint32_t pool_id
,
661 struct tmem_oid
*oid
, uint32_t index
,
662 void *cdata
, unsigned clen
)
665 struct zv_hdr
*zv
= NULL
;
667 int alloc_size
= clen
+ sizeof(struct zv_hdr
);
668 int chunks
= (alloc_size
+ (CHUNK_SIZE
- 1)) >> CHUNK_SHIFT
;
671 BUG_ON(!irqs_disabled());
672 BUG_ON(chunks
>= NCHUNKS
);
673 ret
= xv_malloc(xvpool
, alloc_size
,
674 &page
, &offset
, ZCACHE_GFP_MASK
);
677 atomic_inc(&zv_curr_dist_counts
[chunks
]);
678 atomic_inc(&zv_cumul_dist_counts
[chunks
]);
679 zv
= kmap_atomic(page
, KM_USER0
) + offset
;
682 zv
->pool_id
= pool_id
;
683 SET_SENTINEL(zv
, ZVH
);
684 memcpy((char *)zv
+ sizeof(struct zv_hdr
), cdata
, clen
);
685 kunmap_atomic(zv
, KM_USER0
);
690 static void zv_free(struct xv_pool
*xvpool
, struct zv_hdr
*zv
)
695 uint16_t size
= xv_get_object_size(zv
);
696 int chunks
= (size
+ (CHUNK_SIZE
- 1)) >> CHUNK_SHIFT
;
698 ASSERT_SENTINEL(zv
, ZVH
);
699 BUG_ON(chunks
>= NCHUNKS
);
700 atomic_dec(&zv_curr_dist_counts
[chunks
]);
703 INVERT_SENTINEL(zv
, ZVH
);
704 page
= virt_to_page(zv
);
705 offset
= (unsigned long)zv
& ~PAGE_MASK
;
706 local_irq_save(flags
);
707 xv_free(xvpool
, page
, offset
);
708 local_irq_restore(flags
);
711 static void zv_decompress(struct page
*page
, struct zv_hdr
*zv
)
713 size_t clen
= PAGE_SIZE
;
718 ASSERT_SENTINEL(zv
, ZVH
);
719 size
= xv_get_object_size(zv
) - sizeof(*zv
);
721 to_va
= kmap_atomic(page
, KM_USER0
);
722 ret
= lzo1x_decompress_safe((char *)zv
+ sizeof(*zv
),
724 kunmap_atomic(to_va
, KM_USER0
);
725 BUG_ON(ret
!= LZO_E_OK
);
726 BUG_ON(clen
!= PAGE_SIZE
);
731 * show a distribution of compression stats for zv pages.
734 static int zv_curr_dist_counts_show(char *buf
)
736 unsigned long i
, n
, chunks
= 0, sum_total_chunks
= 0;
739 for (i
= 0; i
< NCHUNKS
; i
++) {
740 n
= atomic_read(&zv_curr_dist_counts
[i
]);
741 p
+= sprintf(p
, "%lu ", n
);
743 sum_total_chunks
+= i
* n
;
745 p
+= sprintf(p
, "mean:%lu\n",
746 chunks
== 0 ? 0 : sum_total_chunks
/ chunks
);
750 static int zv_cumul_dist_counts_show(char *buf
)
752 unsigned long i
, n
, chunks
= 0, sum_total_chunks
= 0;
755 for (i
= 0; i
< NCHUNKS
; i
++) {
756 n
= atomic_read(&zv_cumul_dist_counts
[i
]);
757 p
+= sprintf(p
, "%lu ", n
);
759 sum_total_chunks
+= i
* n
;
761 p
+= sprintf(p
, "mean:%lu\n",
762 chunks
== 0 ? 0 : sum_total_chunks
/ chunks
);
767 * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
768 * pages that don't compress to less than this value (including metadata
769 * overhead) to be rejected. We don't allow the value to get too close
772 static ssize_t
zv_max_zsize_show(struct kobject
*kobj
,
773 struct kobj_attribute
*attr
,
776 return sprintf(buf
, "%u\n", zv_max_zsize
);
779 static ssize_t
zv_max_zsize_store(struct kobject
*kobj
,
780 struct kobj_attribute
*attr
,
781 const char *buf
, size_t count
)
786 if (!capable(CAP_SYS_ADMIN
))
789 err
= kstrtoul(buf
, 10, &val
);
790 if (err
|| (val
== 0) || (val
> (PAGE_SIZE
/ 8) * 7))
797 * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
798 * pages that don't compress to less than this value (including metadata
799 * overhead) to be rejected UNLESS the mean compression is also smaller
800 * than this value. In other words, we are load-balancing-by-zsize the
801 * accepted pages. Again, we don't allow the value to get too close
804 static ssize_t
zv_max_mean_zsize_show(struct kobject
*kobj
,
805 struct kobj_attribute
*attr
,
808 return sprintf(buf
, "%u\n", zv_max_mean_zsize
);
811 static ssize_t
zv_max_mean_zsize_store(struct kobject
*kobj
,
812 struct kobj_attribute
*attr
,
813 const char *buf
, size_t count
)
818 if (!capable(CAP_SYS_ADMIN
))
821 err
= kstrtoul(buf
, 10, &val
);
822 if (err
|| (val
== 0) || (val
> (PAGE_SIZE
/ 8) * 7))
824 zv_max_mean_zsize
= val
;
829 * setting zv_page_count_policy_percent via sysfs sets an upper bound of
830 * persistent (e.g. swap) pages that will be retained according to:
831 * (zv_page_count_policy_percent * totalram_pages) / 100)
832 * when that limit is reached, further puts will be rejected (until
833 * some pages have been flushed). Note that, due to compression,
834 * this number may exceed 100; it defaults to 75 and we set an
835 * arbitrary limit of 150. A poor choice will almost certainly result
836 * in OOM's, so this value should only be changed prudently.
838 static ssize_t
zv_page_count_policy_percent_show(struct kobject
*kobj
,
839 struct kobj_attribute
*attr
,
842 return sprintf(buf
, "%u\n", zv_page_count_policy_percent
);
845 static ssize_t
zv_page_count_policy_percent_store(struct kobject
*kobj
,
846 struct kobj_attribute
*attr
,
847 const char *buf
, size_t count
)
852 if (!capable(CAP_SYS_ADMIN
))
855 err
= kstrtoul(buf
, 10, &val
);
856 if (err
|| (val
== 0) || (val
> 150))
858 zv_page_count_policy_percent
= val
;
862 static struct kobj_attribute zcache_zv_max_zsize_attr
= {
863 .attr
= { .name
= "zv_max_zsize", .mode
= 0644 },
864 .show
= zv_max_zsize_show
,
865 .store
= zv_max_zsize_store
,
868 static struct kobj_attribute zcache_zv_max_mean_zsize_attr
= {
869 .attr
= { .name
= "zv_max_mean_zsize", .mode
= 0644 },
870 .show
= zv_max_mean_zsize_show
,
871 .store
= zv_max_mean_zsize_store
,
874 static struct kobj_attribute zcache_zv_page_count_policy_percent_attr
= {
875 .attr
= { .name
= "zv_page_count_policy_percent",
877 .show
= zv_page_count_policy_percent_show
,
878 .store
= zv_page_count_policy_percent_store
,
883 * zcache core code starts here
886 /* useful stats not collected by cleancache or frontswap */
887 static unsigned long zcache_flush_total
;
888 static unsigned long zcache_flush_found
;
889 static unsigned long zcache_flobj_total
;
890 static unsigned long zcache_flobj_found
;
891 static unsigned long zcache_failed_eph_puts
;
892 static unsigned long zcache_failed_pers_puts
;
895 * Tmem operations assume the poolid implies the invoking client.
896 * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
897 * RAMster has each client numbered by cluster node, and a KVM version
898 * of zcache would have one client per guest and each client might
901 static struct tmem_pool
*zcache_get_pool_by_id(uint16_t cli_id
, uint16_t poolid
)
903 struct tmem_pool
*pool
= NULL
;
904 struct zcache_client
*cli
= NULL
;
906 if (cli_id
== LOCAL_CLIENT
)
909 if (cli_id
>= MAX_CLIENTS
)
911 cli
= &zcache_clients
[cli_id
];
914 atomic_inc(&cli
->refcount
);
916 if (poolid
< MAX_POOLS_PER_CLIENT
) {
917 pool
= cli
->tmem_pools
[poolid
];
919 atomic_inc(&pool
->refcount
);
925 static void zcache_put_pool(struct tmem_pool
*pool
)
927 struct zcache_client
*cli
= NULL
;
932 atomic_dec(&pool
->refcount
);
933 atomic_dec(&cli
->refcount
);
936 int zcache_new_client(uint16_t cli_id
)
938 struct zcache_client
*cli
= NULL
;
941 if (cli_id
== LOCAL_CLIENT
)
943 else if ((unsigned int)cli_id
< MAX_CLIENTS
)
944 cli
= &zcache_clients
[cli_id
];
950 #ifdef CONFIG_FRONTSWAP
951 cli
->xvpool
= xv_create_pool();
952 if (cli
->xvpool
== NULL
)
960 /* counters for debugging */
961 static unsigned long zcache_failed_get_free_pages
;
962 static unsigned long zcache_failed_alloc
;
963 static unsigned long zcache_put_to_flush
;
966 * for now, use named slabs so that usage can easily be tracked; later can
967 * either just use kmalloc, or perhaps add a slab-like allocator
968 * to more carefully manage total memory utilization
970 static struct kmem_cache
*zcache_objnode_cache
;
971 static struct kmem_cache
*zcache_obj_cache
;
972 static atomic_t zcache_curr_obj_count
= ATOMIC_INIT(0);
973 static unsigned long zcache_curr_obj_count_max
;
974 static atomic_t zcache_curr_objnode_count
= ATOMIC_INIT(0);
975 static unsigned long zcache_curr_objnode_count_max
;
978 * to avoid memory allocation recursion (e.g. due to direct reclaim), we
979 * preload all necessary data structures so the hostops callbacks never
980 * actually do a malloc
982 struct zcache_preload
{
984 struct tmem_obj
*obj
;
986 struct tmem_objnode
*objnodes
[OBJNODE_TREE_MAX_PATH
];
988 static DEFINE_PER_CPU(struct zcache_preload
, zcache_preloads
) = { 0, };
990 static int zcache_do_preload(struct tmem_pool
*pool
)
992 struct zcache_preload
*kp
;
993 struct tmem_objnode
*objnode
;
994 struct tmem_obj
*obj
;
998 if (unlikely(zcache_objnode_cache
== NULL
))
1000 if (unlikely(zcache_obj_cache
== NULL
))
1003 kp
= &__get_cpu_var(zcache_preloads
);
1004 while (kp
->nr
< ARRAY_SIZE(kp
->objnodes
)) {
1005 preempt_enable_no_resched();
1006 objnode
= kmem_cache_alloc(zcache_objnode_cache
,
1008 if (unlikely(objnode
== NULL
)) {
1009 zcache_failed_alloc
++;
1013 kp
= &__get_cpu_var(zcache_preloads
);
1014 if (kp
->nr
< ARRAY_SIZE(kp
->objnodes
))
1015 kp
->objnodes
[kp
->nr
++] = objnode
;
1017 kmem_cache_free(zcache_objnode_cache
, objnode
);
1019 preempt_enable_no_resched();
1020 obj
= kmem_cache_alloc(zcache_obj_cache
, ZCACHE_GFP_MASK
);
1021 if (unlikely(obj
== NULL
)) {
1022 zcache_failed_alloc
++;
1025 page
= (void *)__get_free_page(ZCACHE_GFP_MASK
);
1026 if (unlikely(page
== NULL
)) {
1027 zcache_failed_get_free_pages
++;
1028 kmem_cache_free(zcache_obj_cache
, obj
);
1032 kp
= &__get_cpu_var(zcache_preloads
);
1033 if (kp
->obj
== NULL
)
1036 kmem_cache_free(zcache_obj_cache
, obj
);
1037 if (kp
->page
== NULL
)
1040 free_page((unsigned long)page
);
1046 static void *zcache_get_free_page(void)
1048 struct zcache_preload
*kp
;
1051 kp
= &__get_cpu_var(zcache_preloads
);
1053 BUG_ON(page
== NULL
);
/*
 * Hand a page previously obtained for zcache back to the kernel page
 * allocator.  @p is the page's kernel virtual address.
 */
static void zcache_free_page(void *p)
{
	unsigned long addr = (unsigned long)p;

	free_page(addr);
}
1064 * zcache implementation for tmem host ops
1067 static struct tmem_objnode
*zcache_objnode_alloc(struct tmem_pool
*pool
)
1069 struct tmem_objnode
*objnode
= NULL
;
1070 unsigned long count
;
1071 struct zcache_preload
*kp
;
1073 kp
= &__get_cpu_var(zcache_preloads
);
1076 objnode
= kp
->objnodes
[kp
->nr
- 1];
1077 BUG_ON(objnode
== NULL
);
1078 kp
->objnodes
[kp
->nr
- 1] = NULL
;
1080 count
= atomic_inc_return(&zcache_curr_objnode_count
);
1081 if (count
> zcache_curr_objnode_count_max
)
1082 zcache_curr_objnode_count_max
= count
;
1087 static void zcache_objnode_free(struct tmem_objnode
*objnode
,
1088 struct tmem_pool
*pool
)
1090 atomic_dec(&zcache_curr_objnode_count
);
1091 BUG_ON(atomic_read(&zcache_curr_objnode_count
) < 0);
1092 kmem_cache_free(zcache_objnode_cache
, objnode
);
1095 static struct tmem_obj
*zcache_obj_alloc(struct tmem_pool
*pool
)
1097 struct tmem_obj
*obj
= NULL
;
1098 unsigned long count
;
1099 struct zcache_preload
*kp
;
1101 kp
= &__get_cpu_var(zcache_preloads
);
1103 BUG_ON(obj
== NULL
);
1105 count
= atomic_inc_return(&zcache_curr_obj_count
);
1106 if (count
> zcache_curr_obj_count_max
)
1107 zcache_curr_obj_count_max
= count
;
1111 static void zcache_obj_free(struct tmem_obj
*obj
, struct tmem_pool
*pool
)
1113 atomic_dec(&zcache_curr_obj_count
);
1114 BUG_ON(atomic_read(&zcache_curr_obj_count
) < 0);
1115 kmem_cache_free(zcache_obj_cache
, obj
);
1118 static struct tmem_hostops zcache_hostops
= {
1119 .obj_alloc
= zcache_obj_alloc
,
1120 .obj_free
= zcache_obj_free
,
1121 .objnode_alloc
= zcache_objnode_alloc
,
1122 .objnode_free
= zcache_objnode_free
,
1126 * zcache implementations for PAM page descriptor ops
1129 static atomic_t zcache_curr_eph_pampd_count
= ATOMIC_INIT(0);
1130 static unsigned long zcache_curr_eph_pampd_count_max
;
1131 static atomic_t zcache_curr_pers_pampd_count
= ATOMIC_INIT(0);
1132 static unsigned long zcache_curr_pers_pampd_count_max
;
1134 /* forward reference */
1135 static int zcache_compress(struct page
*from
, void **out_va
, size_t *out_len
);
1137 static void *zcache_pampd_create(char *data
, size_t size
, bool raw
, int eph
,
1138 struct tmem_pool
*pool
, struct tmem_oid
*oid
,
1141 void *pampd
= NULL
, *cdata
;
1144 unsigned long count
;
1145 struct page
*page
= (struct page
*)(data
);
1146 struct zcache_client
*cli
= pool
->client
;
1147 uint16_t client_id
= get_client_id_from_client(cli
);
1148 unsigned long zv_mean_zsize
;
1149 unsigned long curr_pers_pampd_count
;
1153 ret
= zcache_compress(page
, &cdata
, &clen
);
1156 if (clen
== 0 || clen
> zbud_max_buddy_size()) {
1157 zcache_compress_poor
++;
1160 pampd
= (void *)zbud_create(client_id
, pool
->pool_id
, oid
,
1161 index
, page
, cdata
, clen
);
1162 if (pampd
!= NULL
) {
1163 count
= atomic_inc_return(&zcache_curr_eph_pampd_count
);
1164 if (count
> zcache_curr_eph_pampd_count_max
)
1165 zcache_curr_eph_pampd_count_max
= count
;
1168 curr_pers_pampd_count
=
1169 atomic_read(&zcache_curr_pers_pampd_count
);
1170 if (curr_pers_pampd_count
>
1171 (zv_page_count_policy_percent
* totalram_pages
) / 100)
1173 ret
= zcache_compress(page
, &cdata
, &clen
);
1176 /* reject if compression is too poor */
1177 if (clen
> zv_max_zsize
) {
1178 zcache_compress_poor
++;
1181 /* reject if mean compression is too poor */
1182 if ((clen
> zv_max_mean_zsize
) && (curr_pers_pampd_count
> 0)) {
1183 total_zsize
= xv_get_total_size_bytes(cli
->xvpool
);
1184 zv_mean_zsize
= div_u64(total_zsize
,
1185 curr_pers_pampd_count
);
1186 if (zv_mean_zsize
> zv_max_mean_zsize
) {
1187 zcache_mean_compress_poor
++;
1191 pampd
= (void *)zv_create(cli
->xvpool
, pool
->pool_id
,
1192 oid
, index
, cdata
, clen
);
1195 count
= atomic_inc_return(&zcache_curr_pers_pampd_count
);
1196 if (count
> zcache_curr_pers_pampd_count_max
)
1197 zcache_curr_pers_pampd_count_max
= count
;
1204 * fill the pageframe corresponding to the struct page with the data
1205 * from the passed pampd
1207 static int zcache_pampd_get_data(char *data
, size_t *bufsize
, bool raw
,
1208 void *pampd
, struct tmem_pool
*pool
,
1209 struct tmem_oid
*oid
, uint32_t index
)
1213 BUG_ON(is_ephemeral(pool
));
1214 zv_decompress((struct page
*)(data
), pampd
);
1219 * fill the pageframe corresponding to the struct page with the data
1220 * from the passed pampd, then free the pampd and remove it from its list
1222 static int zcache_pampd_get_data_and_free(char *data
, size_t *bufsize
, bool raw
,
1223 void *pampd
, struct tmem_pool
*pool
,
1224 struct tmem_oid
*oid
, uint32_t index
)
1228 BUG_ON(!is_ephemeral(pool
));
1229 zbud_decompress((struct page
*)(data
), pampd
);
1230 zbud_free_and_delist((struct zbud_hdr
*)pampd
);
1231 atomic_dec(&zcache_curr_eph_pampd_count
);
1236 * free the pampd and remove it from any zcache lists
1237 * pampd must no longer be pointed to from any tmem data structures!
1239 static void zcache_pampd_free(void *pampd
, struct tmem_pool
*pool
,
1240 struct tmem_oid
*oid
, uint32_t index
)
1242 struct zcache_client
*cli
= pool
->client
;
1244 if (is_ephemeral(pool
)) {
1245 zbud_free_and_delist((struct zbud_hdr
*)pampd
);
1246 atomic_dec(&zcache_curr_eph_pampd_count
);
1247 BUG_ON(atomic_read(&zcache_curr_eph_pampd_count
) < 0);
1249 zv_free(cli
->xvpool
, (struct zv_hdr
*)pampd
);
1250 atomic_dec(&zcache_curr_pers_pampd_count
);
1251 BUG_ON(atomic_read(&zcache_curr_pers_pampd_count
) < 0);
/* pamops hook called when a tmem object is freed; zcache has nothing to do */
static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)
{
}
/* pamops hook called when a tmem object is created; zcache has nothing to do */
static void zcache_pampd_new_obj(struct tmem_obj *obj)
{
}
/* pamops hook: in-place replacement is not supported, always fails with -1 */
static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)
{
	return -1;
}
1268 static bool zcache_pampd_is_remote(void *pampd
)
1273 static struct tmem_pamops zcache_pamops
= {
1274 .create
= zcache_pampd_create
,
1275 .get_data
= zcache_pampd_get_data
,
1276 .get_data_and_free
= zcache_pampd_get_data_and_free
,
1277 .free
= zcache_pampd_free
,
1278 .free_obj
= zcache_pampd_free_obj
,
1279 .new_obj
= zcache_pampd_new_obj
,
1280 .replace_in_obj
= zcache_pampd_replace_in_obj
,
1281 .is_remote
= zcache_pampd_is_remote
,
1285 * zcache compression/decompression and related per-cpu stuff
1288 #define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
1289 #define LZO_DSTMEM_PAGE_ORDER 1
1290 static DEFINE_PER_CPU(unsigned char *, zcache_workmem
);
1291 static DEFINE_PER_CPU(unsigned char *, zcache_dstmem
);
1293 static int zcache_compress(struct page
*from
, void **out_va
, size_t *out_len
)
1296 unsigned char *dmem
= __get_cpu_var(zcache_dstmem
);
1297 unsigned char *wmem
= __get_cpu_var(zcache_workmem
);
1300 BUG_ON(!irqs_disabled());
1301 if (unlikely(dmem
== NULL
|| wmem
== NULL
))
1302 goto out
; /* no buffer, so can't compress */
1303 from_va
= kmap_atomic(from
, KM_USER0
);
1305 ret
= lzo1x_1_compress(from_va
, PAGE_SIZE
, dmem
, out_len
, wmem
);
1306 BUG_ON(ret
!= LZO_E_OK
);
1308 kunmap_atomic(from_va
, KM_USER0
);
1315 static int zcache_cpu_notifier(struct notifier_block
*nb
,
1316 unsigned long action
, void *pcpu
)
1318 int cpu
= (long)pcpu
;
1319 struct zcache_preload
*kp
;
1322 case CPU_UP_PREPARE
:
1323 per_cpu(zcache_dstmem
, cpu
) = (void *)__get_free_pages(
1324 GFP_KERNEL
| __GFP_REPEAT
,
1325 LZO_DSTMEM_PAGE_ORDER
),
1326 per_cpu(zcache_workmem
, cpu
) =
1327 kzalloc(LZO1X_MEM_COMPRESS
,
1328 GFP_KERNEL
| __GFP_REPEAT
);
1331 case CPU_UP_CANCELED
:
1332 free_pages((unsigned long)per_cpu(zcache_dstmem
, cpu
),
1333 LZO_DSTMEM_PAGE_ORDER
);
1334 per_cpu(zcache_dstmem
, cpu
) = NULL
;
1335 kfree(per_cpu(zcache_workmem
, cpu
));
1336 per_cpu(zcache_workmem
, cpu
) = NULL
;
1337 kp
= &per_cpu(zcache_preloads
, cpu
);
1339 kmem_cache_free(zcache_objnode_cache
,
1340 kp
->objnodes
[kp
->nr
- 1]);
1341 kp
->objnodes
[kp
->nr
- 1] = NULL
;
1345 kmem_cache_free(zcache_obj_cache
, kp
->obj
);
1349 free_page((unsigned long)kp
->page
);
1359 static struct notifier_block zcache_cpu_notifier_block
= {
1360 .notifier_call
= zcache_cpu_notifier
1364 #define ZCACHE_SYSFS_RO(_name) \
1365 static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1366 struct kobj_attribute *attr, char *buf) \
1368 return sprintf(buf, "%lu\n", zcache_##_name); \
1370 static struct kobj_attribute zcache_##_name##_attr = { \
1371 .attr = { .name = __stringify(_name), .mode = 0444 }, \
1372 .show = zcache_##_name##_show, \
1375 #define ZCACHE_SYSFS_RO_ATOMIC(_name) \
1376 static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1377 struct kobj_attribute *attr, char *buf) \
1379 return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
1381 static struct kobj_attribute zcache_##_name##_attr = { \
1382 .attr = { .name = __stringify(_name), .mode = 0444 }, \
1383 .show = zcache_##_name##_show, \
1386 #define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
1387 static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1388 struct kobj_attribute *attr, char *buf) \
1390 return _func(buf); \
1392 static struct kobj_attribute zcache_##_name##_attr = { \
1393 .attr = { .name = __stringify(_name), .mode = 0444 }, \
1394 .show = zcache_##_name##_show, \

/* plain unsigned long counters */
ZCACHE_SYSFS_RO(curr_obj_count_max);
ZCACHE_SYSFS_RO(curr_objnode_count_max);
ZCACHE_SYSFS_RO(flush_total);
ZCACHE_SYSFS_RO(flush_found);
ZCACHE_SYSFS_RO(flobj_total);
ZCACHE_SYSFS_RO(flobj_found);
ZCACHE_SYSFS_RO(failed_eph_puts);
ZCACHE_SYSFS_RO(failed_pers_puts);
ZCACHE_SYSFS_RO(zbud_curr_zbytes);
ZCACHE_SYSFS_RO(zbud_cumul_zpages);
ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
ZCACHE_SYSFS_RO(zbud_buddied_count);
ZCACHE_SYSFS_RO(zbpg_unused_list_count);
ZCACHE_SYSFS_RO(evicted_raw_pages);
ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
ZCACHE_SYSFS_RO(evicted_buddied_pages);
ZCACHE_SYSFS_RO(failed_get_free_pages);
ZCACHE_SYSFS_RO(failed_alloc);
ZCACHE_SYSFS_RO(put_to_flush);
ZCACHE_SYSFS_RO(compress_poor);
ZCACHE_SYSFS_RO(mean_compress_poor);
/* atomic_t counters */
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
/* attributes formatted by a dedicated show helper */
ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
			zbud_show_unbuddied_list_counts);
ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
			zbud_show_cumul_chunk_counts);
ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
			zv_curr_dist_counts_show);
ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
			zv_cumul_dist_counts_show);
1431 static struct attribute
*zcache_attrs
[] = {
1432 &zcache_curr_obj_count_attr
.attr
,
1433 &zcache_curr_obj_count_max_attr
.attr
,
1434 &zcache_curr_objnode_count_attr
.attr
,
1435 &zcache_curr_objnode_count_max_attr
.attr
,
1436 &zcache_flush_total_attr
.attr
,
1437 &zcache_flobj_total_attr
.attr
,
1438 &zcache_flush_found_attr
.attr
,
1439 &zcache_flobj_found_attr
.attr
,
1440 &zcache_failed_eph_puts_attr
.attr
,
1441 &zcache_failed_pers_puts_attr
.attr
,
1442 &zcache_compress_poor_attr
.attr
,
1443 &zcache_mean_compress_poor_attr
.attr
,
1444 &zcache_zbud_curr_raw_pages_attr
.attr
,
1445 &zcache_zbud_curr_zpages_attr
.attr
,
1446 &zcache_zbud_curr_zbytes_attr
.attr
,
1447 &zcache_zbud_cumul_zpages_attr
.attr
,
1448 &zcache_zbud_cumul_zbytes_attr
.attr
,
1449 &zcache_zbud_buddied_count_attr
.attr
,
1450 &zcache_zbpg_unused_list_count_attr
.attr
,
1451 &zcache_evicted_raw_pages_attr
.attr
,
1452 &zcache_evicted_unbuddied_pages_attr
.attr
,
1453 &zcache_evicted_buddied_pages_attr
.attr
,
1454 &zcache_failed_get_free_pages_attr
.attr
,
1455 &zcache_failed_alloc_attr
.attr
,
1456 &zcache_put_to_flush_attr
.attr
,
1457 &zcache_zbud_unbuddied_list_counts_attr
.attr
,
1458 &zcache_zbud_cumul_chunk_counts_attr
.attr
,
1459 &zcache_zv_curr_dist_counts_attr
.attr
,
1460 &zcache_zv_cumul_dist_counts_attr
.attr
,
1461 &zcache_zv_max_zsize_attr
.attr
,
1462 &zcache_zv_max_mean_zsize_attr
.attr
,
1463 &zcache_zv_page_count_policy_percent_attr
.attr
,
1467 static struct attribute_group zcache_attr_group
= {
1468 .attrs
= zcache_attrs
,
1472 #endif /* CONFIG_SYSFS */
1474 * When zcache is disabled ("frozen"), pools can be created and destroyed,
1475 * but all puts (and thus all other operations that require memory allocation)
1476 * must fail. If zcache is unfrozen, accepts puts, then frozen again,
1477 * data consistency requires all puts while frozen to be converted into
1480 static bool zcache_freeze
;
1483 * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
1485 static int shrink_zcache_memory(struct shrinker
*shrink
,
1486 struct shrink_control
*sc
)
1489 int nr
= sc
->nr_to_scan
;
1490 gfp_t gfp_mask
= sc
->gfp_mask
;
1493 if (!(gfp_mask
& __GFP_FS
))
1494 /* does this case really need to be skipped? */
1496 zbud_evict_pages(nr
);
1498 ret
= (int)atomic_read(&zcache_zbud_curr_raw_pages
);
1503 static struct shrinker zcache_shrinker
= {
1504 .shrink
= shrink_zcache_memory
,
1505 .seeks
= DEFAULT_SEEKS
,
1509 * zcache shims between cleancache/frontswap ops and tmem
1512 static int zcache_put_page(int cli_id
, int pool_id
, struct tmem_oid
*oidp
,
1513 uint32_t index
, struct page
*page
)
1515 struct tmem_pool
*pool
;
1518 BUG_ON(!irqs_disabled());
1519 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1520 if (unlikely(pool
== NULL
))
1522 if (!zcache_freeze
&& zcache_do_preload(pool
) == 0) {
1523 /* preload does preempt_disable on success */
1524 ret
= tmem_put(pool
, oidp
, index
, (char *)(page
),
1525 PAGE_SIZE
, 0, is_ephemeral(pool
));
1527 if (is_ephemeral(pool
))
1528 zcache_failed_eph_puts
++;
1530 zcache_failed_pers_puts
++;
1532 zcache_put_pool(pool
);
1533 preempt_enable_no_resched();
1535 zcache_put_to_flush
++;
1536 if (atomic_read(&pool
->obj_count
) > 0)
1537 /* the put fails whether the flush succeeds or not */
1538 (void)tmem_flush_page(pool
, oidp
, index
);
1539 zcache_put_pool(pool
);
1545 static int zcache_get_page(int cli_id
, int pool_id
, struct tmem_oid
*oidp
,
1546 uint32_t index
, struct page
*page
)
1548 struct tmem_pool
*pool
;
1550 unsigned long flags
;
1551 size_t size
= PAGE_SIZE
;
1553 local_irq_save(flags
);
1554 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1555 if (likely(pool
!= NULL
)) {
1556 if (atomic_read(&pool
->obj_count
) > 0)
1557 ret
= tmem_get(pool
, oidp
, index
, (char *)(page
),
1558 &size
, 0, is_ephemeral(pool
));
1559 zcache_put_pool(pool
);
1561 local_irq_restore(flags
);
1565 static int zcache_flush_page(int cli_id
, int pool_id
,
1566 struct tmem_oid
*oidp
, uint32_t index
)
1568 struct tmem_pool
*pool
;
1570 unsigned long flags
;
1572 local_irq_save(flags
);
1573 zcache_flush_total
++;
1574 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1575 if (likely(pool
!= NULL
)) {
1576 if (atomic_read(&pool
->obj_count
) > 0)
1577 ret
= tmem_flush_page(pool
, oidp
, index
);
1578 zcache_put_pool(pool
);
1581 zcache_flush_found
++;
1582 local_irq_restore(flags
);
1586 static int zcache_flush_object(int cli_id
, int pool_id
,
1587 struct tmem_oid
*oidp
)
1589 struct tmem_pool
*pool
;
1591 unsigned long flags
;
1593 local_irq_save(flags
);
1594 zcache_flobj_total
++;
1595 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1596 if (likely(pool
!= NULL
)) {
1597 if (atomic_read(&pool
->obj_count
) > 0)
1598 ret
= tmem_flush_object(pool
, oidp
);
1599 zcache_put_pool(pool
);
1602 zcache_flobj_found
++;
1603 local_irq_restore(flags
);
1607 static int zcache_destroy_pool(int cli_id
, int pool_id
)
1609 struct tmem_pool
*pool
= NULL
;
1610 struct zcache_client
*cli
= NULL
;
1615 if (cli_id
== LOCAL_CLIENT
)
1617 else if ((unsigned int)cli_id
< MAX_CLIENTS
)
1618 cli
= &zcache_clients
[cli_id
];
1621 atomic_inc(&cli
->refcount
);
1622 pool
= cli
->tmem_pools
[pool_id
];
1625 cli
->tmem_pools
[pool_id
] = NULL
;
1626 /* wait for pool activity on other cpus to quiesce */
1627 while (atomic_read(&pool
->refcount
) != 0)
1629 atomic_dec(&cli
->refcount
);
1631 ret
= tmem_destroy_pool(pool
);
1634 pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
1640 static int zcache_new_pool(uint16_t cli_id
, uint32_t flags
)
1643 struct tmem_pool
*pool
;
1644 struct zcache_client
*cli
= NULL
;
1646 if (cli_id
== LOCAL_CLIENT
)
1648 else if ((unsigned int)cli_id
< MAX_CLIENTS
)
1649 cli
= &zcache_clients
[cli_id
];
1652 atomic_inc(&cli
->refcount
);
1653 pool
= kmalloc(sizeof(struct tmem_pool
), GFP_ATOMIC
);
1655 pr_info("zcache: pool creation failed: out of memory\n");
1659 for (poolid
= 0; poolid
< MAX_POOLS_PER_CLIENT
; poolid
++)
1660 if (cli
->tmem_pools
[poolid
] == NULL
)
1662 if (poolid
>= MAX_POOLS_PER_CLIENT
) {
1663 pr_info("zcache: pool creation failed: max exceeded\n");
1668 atomic_set(&pool
->refcount
, 0);
1670 pool
->pool_id
= poolid
;
1671 tmem_new_pool(pool
, flags
);
1672 cli
->tmem_pools
[poolid
] = pool
;
1673 pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
1674 flags
& TMEM_POOL_PERSIST
? "persistent" : "ephemeral",
1678 atomic_dec(&cli
->refcount
);

/*
 * Two kernel functionalities currently can be layered on top of tmem.
 * These are "cleancache", which is used as a second-chance cache for clean
 * page cache pages, and "frontswap", which is used for swap pages
 * to avoid writes to disk.  A generic "shim" is provided here for each
 * to translate in-kernel semantics to zcache semantics.
 */

1690 #ifdef CONFIG_CLEANCACHE
1691 static void zcache_cleancache_put_page(int pool_id
,
1692 struct cleancache_filekey key
,
1693 pgoff_t index
, struct page
*page
)
1695 u32 ind
= (u32
) index
;
1696 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1698 if (likely(ind
== index
))
1699 (void)zcache_put_page(LOCAL_CLIENT
, pool_id
, &oid
, index
, page
);
1702 static int zcache_cleancache_get_page(int pool_id
,
1703 struct cleancache_filekey key
,
1704 pgoff_t index
, struct page
*page
)
1706 u32 ind
= (u32
) index
;
1707 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1710 if (likely(ind
== index
))
1711 ret
= zcache_get_page(LOCAL_CLIENT
, pool_id
, &oid
, index
, page
);
1715 static void zcache_cleancache_flush_page(int pool_id
,
1716 struct cleancache_filekey key
,
1719 u32 ind
= (u32
) index
;
1720 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1722 if (likely(ind
== index
))
1723 (void)zcache_flush_page(LOCAL_CLIENT
, pool_id
, &oid
, ind
);
1726 static void zcache_cleancache_flush_inode(int pool_id
,
1727 struct cleancache_filekey key
)
1729 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1731 (void)zcache_flush_object(LOCAL_CLIENT
, pool_id
, &oid
);
1734 static void zcache_cleancache_flush_fs(int pool_id
)
1737 (void)zcache_destroy_pool(LOCAL_CLIENT
, pool_id
);
1740 static int zcache_cleancache_init_fs(size_t pagesize
)
1742 BUG_ON(sizeof(struct cleancache_filekey
) !=
1743 sizeof(struct tmem_oid
));
1744 BUG_ON(pagesize
!= PAGE_SIZE
);
1745 return zcache_new_pool(LOCAL_CLIENT
, 0);
1748 static int zcache_cleancache_init_shared_fs(char *uuid
, size_t pagesize
)
1750 /* shared pools are unsupported and map to private */
1751 BUG_ON(sizeof(struct cleancache_filekey
) !=
1752 sizeof(struct tmem_oid
));
1753 BUG_ON(pagesize
!= PAGE_SIZE
);
1754 return zcache_new_pool(LOCAL_CLIENT
, 0);
1757 static struct cleancache_ops zcache_cleancache_ops
= {
1758 .put_page
= zcache_cleancache_put_page
,
1759 .get_page
= zcache_cleancache_get_page
,
1760 .flush_page
= zcache_cleancache_flush_page
,
1761 .flush_inode
= zcache_cleancache_flush_inode
,
1762 .flush_fs
= zcache_cleancache_flush_fs
,
1763 .init_shared_fs
= zcache_cleancache_init_shared_fs
,
1764 .init_fs
= zcache_cleancache_init_fs
1767 struct cleancache_ops
zcache_cleancache_register_ops(void)
1769 struct cleancache_ops old_ops
=
1770 cleancache_register_ops(&zcache_cleancache_ops
);
1776 #ifdef CONFIG_FRONTSWAP
1777 /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1778 static int zcache_frontswap_poolid
= -1;
/*
 * Swizzling increases objects per swaptype, increasing tmem concurrency
 * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
 * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
 * frontswap_get_page(), but has side-effects. Hence using 8.
 *
 * _oswiz() builds the object id from the swap type and the low
 * SWIZ_BITS of the offset; iswiz() yields the remaining high bits of
 * the offset, used as the index within that object.  Arguments are
 * parenthesized so that compound expressions expand correctly.
 */
#define SWIZ_BITS		8
#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind)	(((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
#define iswiz(_ind)		((_ind) >> SWIZ_BITS)
1791 static inline struct tmem_oid
oswiz(unsigned type
, u32 ind
)
1793 struct tmem_oid oid
= { .oid
= { 0 } };
1794 oid
.oid
[0] = _oswiz(type
, ind
);
1798 static int zcache_frontswap_put_page(unsigned type
, pgoff_t offset
,
1801 u64 ind64
= (u64
)offset
;
1802 u32 ind
= (u32
)offset
;
1803 struct tmem_oid oid
= oswiz(type
, ind
);
1805 unsigned long flags
;
1807 BUG_ON(!PageLocked(page
));
1808 if (likely(ind64
== ind
)) {
1809 local_irq_save(flags
);
1810 ret
= zcache_put_page(LOCAL_CLIENT
, zcache_frontswap_poolid
,
1811 &oid
, iswiz(ind
), page
);
1812 local_irq_restore(flags
);
1817 /* returns 0 if the page was successfully gotten from frontswap, -1 if
1818 * was not present (should never happen!) */
1819 static int zcache_frontswap_get_page(unsigned type
, pgoff_t offset
,
1822 u64 ind64
= (u64
)offset
;
1823 u32 ind
= (u32
)offset
;
1824 struct tmem_oid oid
= oswiz(type
, ind
);
1827 BUG_ON(!PageLocked(page
));
1828 if (likely(ind64
== ind
))
1829 ret
= zcache_get_page(LOCAL_CLIENT
, zcache_frontswap_poolid
,
1830 &oid
, iswiz(ind
), page
);
1834 /* flush a single page from frontswap */
1835 static void zcache_frontswap_flush_page(unsigned type
, pgoff_t offset
)
1837 u64 ind64
= (u64
)offset
;
1838 u32 ind
= (u32
)offset
;
1839 struct tmem_oid oid
= oswiz(type
, ind
);
1841 if (likely(ind64
== ind
))
1842 (void)zcache_flush_page(LOCAL_CLIENT
, zcache_frontswap_poolid
,
1846 /* flush all pages from the passed swaptype */
1847 static void zcache_frontswap_flush_area(unsigned type
)
1849 struct tmem_oid oid
;
1852 for (ind
= SWIZ_MASK
; ind
>= 0; ind
--) {
1853 oid
= oswiz(type
, ind
);
1854 (void)zcache_flush_object(LOCAL_CLIENT
,
1855 zcache_frontswap_poolid
, &oid
);
1859 static void zcache_frontswap_init(unsigned ignored
)
1861 /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1862 if (zcache_frontswap_poolid
< 0)
1863 zcache_frontswap_poolid
=
1864 zcache_new_pool(LOCAL_CLIENT
, TMEM_POOL_PERSIST
);
1867 static struct frontswap_ops zcache_frontswap_ops
= {
1868 .put_page
= zcache_frontswap_put_page
,
1869 .get_page
= zcache_frontswap_get_page
,
1870 .flush_page
= zcache_frontswap_flush_page
,
1871 .flush_area
= zcache_frontswap_flush_area
,
1872 .init
= zcache_frontswap_init
1875 struct frontswap_ops
zcache_frontswap_register_ops(void)
1877 struct frontswap_ops old_ops
=
1878 frontswap_register_ops(&zcache_frontswap_ops
);
1885 * zcache initialization
1886 * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
1890 static int zcache_enabled
;
1892 static int __init
enable_zcache(char *s
)
1897 __setup("zcache", enable_zcache
);
1899 /* allow independent dynamic disabling of cleancache and frontswap */
1901 static int use_cleancache
= 1;
1903 static int __init
no_cleancache(char *s
)
1909 __setup("nocleancache", no_cleancache
);
1911 static int use_frontswap
= 1;
1913 static int __init
no_frontswap(char *s
)
1919 __setup("nofrontswap", no_frontswap
);
1921 static int __init
zcache_init(void)
1926 ret
= sysfs_create_group(mm_kobj
, &zcache_attr_group
);
1928 pr_err("zcache: can't create sysfs\n");
1931 #endif /* CONFIG_SYSFS */
1932 #if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
1933 if (zcache_enabled
) {
1936 tmem_register_hostops(&zcache_hostops
);
1937 tmem_register_pamops(&zcache_pamops
);
1938 ret
= register_cpu_notifier(&zcache_cpu_notifier_block
);
1940 pr_err("zcache: can't register cpu notifier\n");
1943 for_each_online_cpu(cpu
) {
1944 void *pcpu
= (void *)(long)cpu
;
1945 zcache_cpu_notifier(&zcache_cpu_notifier_block
,
1946 CPU_UP_PREPARE
, pcpu
);
1949 zcache_objnode_cache
= kmem_cache_create("zcache_objnode",
1950 sizeof(struct tmem_objnode
), 0, 0, NULL
);
1951 zcache_obj_cache
= kmem_cache_create("zcache_obj",
1952 sizeof(struct tmem_obj
), 0, 0, NULL
);
1953 ret
= zcache_new_client(LOCAL_CLIENT
);
1955 pr_err("zcache: can't create client\n");
1959 #ifdef CONFIG_CLEANCACHE
1960 if (zcache_enabled
&& use_cleancache
) {
1961 struct cleancache_ops old_ops
;
1964 register_shrinker(&zcache_shrinker
);
1965 old_ops
= zcache_cleancache_register_ops();
1966 pr_info("zcache: cleancache enabled using kernel "
1967 "transcendent memory and compression buddies\n");
1968 if (old_ops
.init_fs
!= NULL
)
1969 pr_warning("zcache: cleancache_ops overridden");
1972 #ifdef CONFIG_FRONTSWAP
1973 if (zcache_enabled
&& use_frontswap
) {
1974 struct frontswap_ops old_ops
;
1976 old_ops
= zcache_frontswap_register_ops();
1977 pr_info("zcache: frontswap enabled using kernel "
1978 "transcendent memory and xvmalloc\n");
1979 if (old_ops
.init
!= NULL
)
1980 pr_warning("zcache: frontswap_ops overridden");
1987 module_init(zcache_init
)