/*
 * zcache.c
 *
 * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
 * Copyright (c) 2010,2011, Nitin Gupta
 *
 * Zcache provides an in-kernel "host implementation" for transcendent memory
 * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
 * page-accessible memory [1] interfaces, both utilizing the crypto compression
 * API:
 * 1) "compression buddies" ("zbud") is used for ephemeral pages
 * 2) zsmalloc is used for persistent pages.
 * Zsmalloc has very low fragmentation so maximizes space efficiency, while
 * zbud allows pairs (and potentially, in the future, more than a pair of)
 * compressed pages to be closely linked so that reclaiming can be done via
 * the kernel's physical-page-oriented "shrinker" interface.
 *
 * [1] For a definition of page-accessible memory (aka PAM), see:
 *     http://marc.info/?l=linux-mm&m=127811271605009
 */
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/math64.h>
#include <linux/crypto.h>
#include <linux/string.h>
#include <linux/idr.h>
#include "tmem.h"
#include "../zsmalloc/zsmalloc.h"

#ifdef CONFIG_CLEANCACHE
#include <linux/cleancache.h>
#endif
#ifdef CONFIG_FRONTSWAP
#include <linux/frontswap.h>
#endif
#if 0
/* this is more aggressive but may cause other problems? */
#define ZCACHE_GFP_MASK	(GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
#else
#define ZCACHE_GFP_MASK \
	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
#endif
#define MAX_CLIENTS 16
#define LOCAL_CLIENT ((uint16_t)-1)

MODULE_LICENSE("GPL");
struct zcache_client {
	struct idr tmem_pools;
	struct zs_pool *zspool;
	atomic_t refcount;
};

static struct zcache_client zcache_host;
static struct zcache_client zcache_clients[MAX_CLIENTS];
static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
{
	if (cli == &zcache_host)
		return LOCAL_CLIENT;
	return cli - &zcache_clients[0];
}
static struct zcache_client *get_zcache_client(uint16_t cli_id)
{
	if (cli_id == LOCAL_CLIENT)
		return &zcache_host;

	if ((unsigned int)cli_id < MAX_CLIENTS)
		return &zcache_clients[cli_id];

	return NULL;
}
static inline bool is_local_client(struct zcache_client *cli)
{
	return cli == &zcache_host;
}
/* crypto API for zcache */
#define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME
static char zcache_comp_name[ZCACHE_COMP_NAME_SZ];
static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms;
enum comp_op {
	ZCACHE_COMPOP_COMPRESS,
	ZCACHE_COMPOP_DECOMPRESS
};
static inline int zcache_comp_op(enum comp_op op,
				const u8 *src, unsigned int slen,
				u8 *dst, unsigned int *dlen)
{
	struct crypto_comp *tfm;
	int ret = -1;

	BUG_ON(!zcache_comp_pcpu_tfms);
	tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
	BUG_ON(!tfm);
	switch (op) {
	case ZCACHE_COMPOP_COMPRESS:
		ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
		break;
	case ZCACHE_COMPOP_DECOMPRESS:
		ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
		break;
	default:
		break;
	}
	put_cpu();
	return ret;
}
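
/*
 * Illustrative call pattern only (not part of the driver proper): the
 * caller hands a source buffer and a destination buffer whose length is
 * passed by reference, e.g.
 *
 *	unsigned int dlen = PAGE_SIZE;
 *	ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, src, srclen,
 *				dst, &dlen);
 *
 * where dlen is both the destination capacity on input and the number of
 * bytes produced on output; see zcache_compress(), zbud_decompress() and
 * zv_decompress() below for the real callers.
 */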
/*
 * Compression buddies ("zbud") provides for packing two (or, possibly
 * in the future, more) compressed ephemeral pages into a single "raw"
 * (physical) page and tracking them with data structures so that
 * the raw pages can be easily reclaimed.
 *
 * A zbud page ("zbpg") is an aligned page containing a list_head,
 * a lock, and two "zbud headers".  The remainder of the physical
 * page is divided up into aligned 64-byte "chunks" which contain
 * the compressed data for zero, one, or two zbuds.  Each zbpg
 * resides on: (1) an "unused list" if it has no zbuds; (2) a
 * "buddied" list if it is fully populated with two zbuds; or
 * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
 * the one unbuddied zbud uses.  The data inside a zbpg cannot be
 * read or written unless the zbpg's lock is held.
 */
#define ZBH_SENTINEL  0x43214321
#define ZBPG_SENTINEL  0xdeadbeef

#define ZBUD_MAX_BUDS 2
struct zbud_hdr {
	uint16_t client_id;
	uint16_t pool_id;
	struct tmem_oid oid;
	uint32_t index;
	uint16_t size; /* compressed size in bytes, zero means unused */
	DECL_SENTINEL
};

struct zbud_page {
	struct list_head bud_list;
	spinlock_t lock;
	struct zbud_hdr buddy[ZBUD_MAX_BUDS];
	DECL_SENTINEL
	/* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
};
#define CHUNK_SHIFT	6
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define CHUNK_MASK	(~(CHUNK_SIZE-1))
#define NCHUNKS		(((PAGE_SIZE - sizeof(struct zbud_page)) & \
				CHUNK_MASK) >> CHUNK_SHIFT)
#define MAX_CHUNK	(NCHUNKS-1)
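
/*
 * Worked example (illustrative only, assuming PAGE_SIZE == 4096): each
 * chunk is 64 bytes, so after subtracting sizeof(struct zbud_page) and
 * rounding down to a chunk boundary, NCHUNKS comes out a little under 64
 * (63 if the header fits within one chunk), and MAX_CHUNK is NCHUNKS-1.
 */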
static struct {
	struct list_head list;
	unsigned count;
} zbud_unbuddied[NCHUNKS];
	/* list N contains pages with N chunks USED and NCHUNKS-N unused */
	/* element 0 is never used but optimizing that isn't worth it */
static unsigned long zbud_cumul_chunk_counts[NCHUNKS];

struct list_head zbud_buddied_list;
static unsigned long zcache_zbud_buddied_count;

/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_budlists_spinlock);

static LIST_HEAD(zbpg_unused_list);
static unsigned long zcache_zbpg_unused_list_count;

/* protects the unused page list */
static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);

static atomic_t zcache_zbud_curr_raw_pages;
static atomic_t zcache_zbud_curr_zpages;
static unsigned long zcache_zbud_curr_zbytes;
static unsigned long zcache_zbud_cumul_zpages;
static unsigned long zcache_zbud_cumul_zbytes;
static unsigned long zcache_compress_poor;
static unsigned long zcache_mean_compress_poor;
/* forward references */
static void *zcache_get_free_page(void);
static void zcache_free_page(void *p);
/*
 * zbud helper functions
 */

static inline unsigned zbud_max_buddy_size(void)
{
	return MAX_CHUNK << CHUNK_SHIFT;
}

static inline unsigned zbud_size_to_chunks(unsigned size)
{
	BUG_ON(size == 0 || size > zbud_max_buddy_size());
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
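
/*
 * Example of the rounding above (illustrative): a 1000-byte compressed
 * page occupies zbud_size_to_chunks(1000) == (1000 + 63) >> 6 == 16
 * chunks, i.e. 1024 bytes of the raw page, and zbud_max_buddy_size()
 * is MAX_CHUNK * 64 bytes.
 */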
static inline int zbud_budnum(struct zbud_hdr *zh)
{
	unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
	struct zbud_page *zbpg = NULL;
	unsigned budnum = -1U;
	int i;

	for (i = 0; i < ZBUD_MAX_BUDS; i++)
		if (offset == offsetof(typeof(*zbpg), buddy[i])) {
			budnum = i;
			break;
		}
	BUG_ON(budnum == -1U);
	return budnum;
}
static char *zbud_data(struct zbud_hdr *zh, unsigned size)
{
	struct zbud_page *zbpg;
	char *p;
	unsigned budnum;

	ASSERT_SENTINEL(zh, ZBH);
	budnum = zbud_budnum(zh);
	BUG_ON(size == 0 || size > zbud_max_buddy_size());
	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	ASSERT_SPINLOCK(&zbpg->lock);
	p = (char *)zbpg;
	if (budnum == 0)
		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
							CHUNK_MASK);
	else if (budnum == 1)
		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
	return p;
}
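
/*
 * Layout note (illustrative): buddy 0's data starts at the first chunk
 * boundary after the struct zbud_page header, while buddy 1's data is
 * packed against the end of the page (PAGE_SIZE minus its chunk-rounded
 * size), so the two compressed pages grow toward each other and any
 * free chunks stay contiguous in the middle of the raw page.
 */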
/*
 * zbud raw page management
 */

static struct zbud_page *zbud_alloc_raw_page(void)
{
	struct zbud_page *zbpg = NULL;
	struct zbud_hdr *zh0, *zh1;
	bool recycled = false;

	/* if any pages on the zbpg list, use one */
	spin_lock(&zbpg_unused_list_spinlock);
	if (!list_empty(&zbpg_unused_list)) {
		zbpg = list_first_entry(&zbpg_unused_list,
				struct zbud_page, bud_list);
		list_del_init(&zbpg->bud_list);
		zcache_zbpg_unused_list_count--;
		recycled = true;
	}
	spin_unlock(&zbpg_unused_list_spinlock);
	if (zbpg == NULL)
		/* none on zbpg list, try to get a kernel page */
		zbpg = zcache_get_free_page();
	if (likely(zbpg != NULL)) {
		INIT_LIST_HEAD(&zbpg->bud_list);
		zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
		spin_lock_init(&zbpg->lock);
		if (recycled) {
			ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
			SET_SENTINEL(zbpg, ZBPG);
			BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
			BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
		} else {
			atomic_inc(&zcache_zbud_curr_raw_pages);
			INIT_LIST_HEAD(&zbpg->bud_list);
			SET_SENTINEL(zbpg, ZBPG);
			zh0->size = 0; zh1->size = 0;
			tmem_oid_set_invalid(&zh0->oid);
			tmem_oid_set_invalid(&zh1->oid);
		}
	}
	return zbpg;
}
static void zbud_free_raw_page(struct zbud_page *zbpg)
{
	struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];

	ASSERT_SENTINEL(zbpg, ZBPG);
	BUG_ON(!list_empty(&zbpg->bud_list));
	ASSERT_SPINLOCK(&zbpg->lock);
	BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
	BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
	INVERT_SENTINEL(zbpg, ZBPG);
	spin_unlock(&zbpg->lock);
	spin_lock(&zbpg_unused_list_spinlock);
	list_add(&zbpg->bud_list, &zbpg_unused_list);
	zcache_zbpg_unused_list_count++;
	spin_unlock(&zbpg_unused_list_spinlock);
}
/*
 * core zbud handling routines
 */

static unsigned zbud_free(struct zbud_hdr *zh)
{
	unsigned size;

	ASSERT_SENTINEL(zh, ZBH);
	BUG_ON(!tmem_oid_valid(&zh->oid));
	size = zh->size;
	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
	zh->size = 0;
	tmem_oid_set_invalid(&zh->oid);
	INVERT_SENTINEL(zh, ZBH);
	zcache_zbud_curr_zbytes -= size;
	atomic_dec(&zcache_zbud_curr_zpages);
	return size;
}
static void zbud_free_and_delist(struct zbud_hdr *zh)
{
	unsigned chunks;
	struct zbud_hdr *zh_other;
	unsigned budnum = zbud_budnum(zh), size;
	struct zbud_page *zbpg =
		container_of(zh, struct zbud_page, buddy[budnum]);

	spin_lock(&zbud_budlists_spinlock);
	spin_lock(&zbpg->lock);
	if (list_empty(&zbpg->bud_list)) {
		/* ignore zombie page... see zbud_evict_pages() */
		spin_unlock(&zbpg->lock);
		spin_unlock(&zbud_budlists_spinlock);
		return;
	}
	size = zbud_free(zh);
	ASSERT_SPINLOCK(&zbpg->lock);
	zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
	if (zh_other->size == 0) { /* was unbuddied: unlist and free */
		chunks = zbud_size_to_chunks(size);
		BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
		list_del_init(&zbpg->bud_list);
		zbud_unbuddied[chunks].count--;
		spin_unlock(&zbud_budlists_spinlock);
		zbud_free_raw_page(zbpg);
	} else { /* was buddied: move remaining buddy to unbuddied list */
		chunks = zbud_size_to_chunks(zh_other->size);
		list_del_init(&zbpg->bud_list);
		zcache_zbud_buddied_count--;
		list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
		zbud_unbuddied[chunks].count++;
		spin_unlock(&zbud_budlists_spinlock);
		spin_unlock(&zbpg->lock);
	}
}
static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
					struct tmem_oid *oid,
					uint32_t index, struct page *page,
					void *cdata, unsigned size)
{
	struct zbud_hdr *zh0, *zh1, *zh = NULL;
	struct zbud_page *zbpg = NULL, *ztmp;
	unsigned nchunks;
	char *to;
	int i, found_good_buddy = 0;

	nchunks = zbud_size_to_chunks(size);
	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
		spin_lock(&zbud_budlists_spinlock);
		if (!list_empty(&zbud_unbuddied[i].list)) {
			list_for_each_entry_safe(zbpg, ztmp,
				    &zbud_unbuddied[i].list, bud_list) {
				if (spin_trylock(&zbpg->lock)) {
					found_good_buddy = i;
					goto found_unbuddied;
				}
			}
		}
		spin_unlock(&zbud_budlists_spinlock);
	}
	/* didn't find a good buddy, try allocating a new page */
	zbpg = zbud_alloc_raw_page();
	if (unlikely(zbpg == NULL))
		goto out;
	/* ok, have a page, now compress the data before taking locks */
	spin_lock(&zbud_budlists_spinlock);
	spin_lock(&zbpg->lock);
	list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
	zbud_unbuddied[nchunks].count++;
	zh = &zbpg->buddy[0];
	goto init_zh;

found_unbuddied:
	ASSERT_SPINLOCK(&zbpg->lock);
	zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
	BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
	if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
		ASSERT_SENTINEL(zh0, ZBH);
		zh = zh1;
	} else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
		ASSERT_SENTINEL(zh1, ZBH);
		zh = zh0;
	} else
		BUG();
	list_del_init(&zbpg->bud_list);
	zbud_unbuddied[found_good_buddy].count--;
	list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
	zcache_zbud_buddied_count++;

init_zh:
	SET_SENTINEL(zh, ZBH);
	zh->size = size;
	zh->index = index;
	zh->oid = *oid;
	zh->pool_id = pool_id;
	zh->client_id = client_id;
	to = zbud_data(zh, size);
	memcpy(to, cdata, size);
	spin_unlock(&zbpg->lock);
	spin_unlock(&zbud_budlists_spinlock);

	zbud_cumul_chunk_counts[nchunks]++;
	atomic_inc(&zcache_zbud_curr_zpages);
	zcache_zbud_cumul_zpages++;
	zcache_zbud_curr_zbytes += size;
	zcache_zbud_cumul_zbytes += size;
out:
	return zh;
}
static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
{
	struct zbud_page *zbpg;
	unsigned budnum = zbud_budnum(zh);
	unsigned int out_len = PAGE_SIZE;
	char *to_va, *from_va;
	unsigned size;
	int ret = 0;

	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	spin_lock(&zbpg->lock);
	if (list_empty(&zbpg->bud_list)) {
		/* ignore zombie page... see zbud_evict_pages() */
		ret = -EINVAL;
		goto out;
	}
	ASSERT_SENTINEL(zh, ZBH);
	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
	to_va = kmap_atomic(page);
	size = zh->size;
	from_va = zbud_data(zh, size);
	ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size,
				to_va, &out_len);
	BUG_ON(ret);
	BUG_ON(out_len != PAGE_SIZE);
	kunmap_atomic(to_va);
out:
	spin_unlock(&zbpg->lock);
	return ret;
}
/*
 * The following routines handle shrinking of ephemeral pages by evicting
 * pages "least valuable" first.
 */

static unsigned long zcache_evicted_raw_pages;
static unsigned long zcache_evicted_buddied_pages;
static unsigned long zcache_evicted_unbuddied_pages;

static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
						uint16_t poolid);
static void zcache_put_pool(struct tmem_pool *pool);
/*
 * Flush and free all zbuds in a zbpg, then free the pageframe
 */
static void zbud_evict_zbpg(struct zbud_page *zbpg)
{
	struct zbud_hdr *zh;
	int i, j;
	uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
	uint32_t index[ZBUD_MAX_BUDS];
	struct tmem_oid oid[ZBUD_MAX_BUDS];
	struct tmem_pool *pool;

	ASSERT_SPINLOCK(&zbpg->lock);
	BUG_ON(!list_empty(&zbpg->bud_list));
	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
		zh = &zbpg->buddy[i];
		if (zh->size) {
			client_id[j] = zh->client_id;
			pool_id[j] = zh->pool_id;
			oid[j] = zh->oid;
			index[j] = zh->index;
			j++;
			zbud_free(zh);
		}
	}
	spin_unlock(&zbpg->lock);
	for (i = 0; i < j; i++) {
		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
		if (pool != NULL) {
			tmem_flush_page(pool, &oid[i], index[i]);
			zcache_put_pool(pool);
		}
	}
	ASSERT_SENTINEL(zbpg, ZBPG);
	spin_lock(&zbpg->lock);
	zbud_free_raw_page(zbpg);
}
/*
 * Free nr pages.  This code is funky because we want to hold the locks
 * protecting various lists for as short a time as possible, and in some
 * circumstances the list may change asynchronously when the list lock is
 * not held.  In some cases we also trylock not only to avoid waiting on a
 * page in use by another cpu, but also to avoid potential deadlock due to
 * lock inversion.
 */
static void zbud_evict_pages(int nr)
{
	struct zbud_page *zbpg;
	int i;

	/* first try freeing any pages on unused list */
retry_unused_list:
	spin_lock_bh(&zbpg_unused_list_spinlock);
	if (!list_empty(&zbpg_unused_list)) {
		/* can't walk list here, since it may change when unlocked */
		zbpg = list_first_entry(&zbpg_unused_list,
				struct zbud_page, bud_list);
		list_del_init(&zbpg->bud_list);
		zcache_zbpg_unused_list_count--;
		atomic_dec(&zcache_zbud_curr_raw_pages);
		spin_unlock_bh(&zbpg_unused_list_spinlock);
		zcache_free_page(zbpg);
		zcache_evicted_raw_pages++;
		if (--nr <= 0)
			goto out;
		goto retry_unused_list;
	}
	spin_unlock_bh(&zbpg_unused_list_spinlock);

	/* now try freeing unbuddied pages, starting with least space avail */
	for (i = 0; i < MAX_CHUNK; i++) {
retry_unbud_list_i:
		spin_lock_bh(&zbud_budlists_spinlock);
		if (list_empty(&zbud_unbuddied[i].list)) {
			spin_unlock_bh(&zbud_budlists_spinlock);
			continue;
		}
		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
			if (unlikely(!spin_trylock(&zbpg->lock)))
				continue;
			list_del_init(&zbpg->bud_list);
			zbud_unbuddied[i].count--;
			spin_unlock(&zbud_budlists_spinlock);
			zcache_evicted_unbuddied_pages++;
			/* want budlists unlocked when doing zbpg eviction */
			zbud_evict_zbpg(zbpg);
			local_bh_enable();
			if (--nr <= 0)
				goto out;
			goto retry_unbud_list_i;
		}
		spin_unlock_bh(&zbud_budlists_spinlock);
	}

	/* as a last resort, free buddied pages */
retry_bud_list:
	spin_lock_bh(&zbud_budlists_spinlock);
	if (list_empty(&zbud_buddied_list)) {
		spin_unlock_bh(&zbud_budlists_spinlock);
		goto out;
	}
	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
		if (unlikely(!spin_trylock(&zbpg->lock)))
			continue;
		list_del_init(&zbpg->bud_list);
		zcache_zbud_buddied_count--;
		spin_unlock(&zbud_budlists_spinlock);
		zcache_evicted_buddied_pages++;
		/* want budlists unlocked when doing zbpg eviction */
		zbud_evict_zbpg(zbpg);
		local_bh_enable();
		if (--nr <= 0)
			goto out;
		goto retry_bud_list;
	}
	spin_unlock_bh(&zbud_budlists_spinlock);
out:
	return;
}
static void __init zbud_init(void)
{
	int i;

	INIT_LIST_HEAD(&zbud_buddied_list);
	for (i = 0; i < NCHUNKS; i++)
		INIT_LIST_HEAD(&zbud_unbuddied[i].list);
}
/*
 * These sysfs routines show a nice distribution of how many zbpg's are
 * currently (and have ever been placed) in each unbuddied list.  It's fun
 * to watch but can probably go away before final merge.
 */
static int zbud_show_unbuddied_list_counts(char *buf)
{
	int i;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++)
		p += sprintf(p, "%u ", zbud_unbuddied[i].count);
	return p - buf;
}
static int zbud_show_cumul_chunk_counts(char *buf)
{
	unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
	unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
	unsigned long total_chunks_lte_42 = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
		chunks += zbud_cumul_chunk_counts[i];
		total_chunks += zbud_cumul_chunk_counts[i];
		sum_total_chunks += i * zbud_cumul_chunk_counts[i];
		if (i == 21)
			total_chunks_lte_21 = total_chunks;
		if (i == 32)
			total_chunks_lte_32 = total_chunks;
		if (i == 42)
			total_chunks_lte_42 = total_chunks;
	}
	p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
		total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}
/*
 * This "zv" PAM implementation combines the slab-based zsmalloc
 * with the crypto compression API to maximize the amount of data that can
 * be packed into a physical page.
 *
 * Zv represents a PAM page with the index and object (plus a "size" value
 * necessary for decompression) immediately preceding the compressed data.
 */

#define ZVH_SENTINEL  0x43214321

struct zv_hdr {
	uint32_t pool_id;
	struct tmem_oid oid;
	uint32_t index;
	size_t size;
	DECL_SENTINEL
};
/* rudimentary policy limits */
/* total number of persistent pages may not exceed this percentage */
static unsigned int zv_page_count_policy_percent = 75;
/*
 * byte count defining poor compression; pages with greater zsize will be
 * rejected
 */
static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
/*
 * byte count defining poor *mean* compression; pages with greater zsize
 * will be rejected until sufficient better-compressed pages are accepted
 * driving the mean below this threshold
 */
static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
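
/*
 * For example (illustrative, 4K pages): zv_max_zsize defaults to
 * (4096/8)*7 == 3584 bytes and zv_max_mean_zsize to (4096/8)*5 == 2560
 * bytes, i.e. a persistent page is rejected outright if it compresses to
 * more than 7/8 of a page, and rejected conditionally above 5/8 of a page
 * when the mean compressed size is also above that threshold.
 */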
static atomic_t zv_curr_dist_counts[NCHUNKS];
static atomic_t zv_cumul_dist_counts[NCHUNKS];
static unsigned long zv_create(struct zs_pool *pool, uint32_t pool_id,
				struct tmem_oid *oid, uint32_t index,
				void *cdata, unsigned clen)
{
	struct zv_hdr *zv;
	u32 size = clen + sizeof(struct zv_hdr);
	int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
	unsigned long handle = 0;

	BUG_ON(!irqs_disabled());
	BUG_ON(chunks >= NCHUNKS);
	handle = zs_malloc(pool, size);
	if (!handle)
		goto out;
	atomic_inc(&zv_curr_dist_counts[chunks]);
	atomic_inc(&zv_cumul_dist_counts[chunks]);
	zv = zs_map_object(pool, handle, ZS_MM_WO);
	zv->index = index;
	zv->oid = *oid;
	zv->pool_id = pool_id;
	zv->size = clen;
	SET_SENTINEL(zv, ZVH);
	memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
	zs_unmap_object(pool, handle);
out:
	return handle;
}
static void zv_free(struct zs_pool *pool, unsigned long handle)
{
	unsigned long flags;
	struct zv_hdr *zv;
	uint16_t size;
	int chunks;

	zv = zs_map_object(pool, handle, ZS_MM_RW);
	ASSERT_SENTINEL(zv, ZVH);
	size = zv->size + sizeof(struct zv_hdr);
	INVERT_SENTINEL(zv, ZVH);
	zs_unmap_object(pool, handle);

	chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
	BUG_ON(chunks >= NCHUNKS);
	atomic_dec(&zv_curr_dist_counts[chunks]);

	local_irq_save(flags);
	zs_free(pool, handle);
	local_irq_restore(flags);
}
static void zv_decompress(struct page *page, unsigned long handle)
{
	unsigned int clen = PAGE_SIZE;
	char *to_va;
	int ret;
	struct zv_hdr *zv;

	zv = zs_map_object(zcache_host.zspool, handle, ZS_MM_RO);
	BUG_ON(zv->size == 0);
	ASSERT_SENTINEL(zv, ZVH);
	to_va = kmap_atomic(page);
	ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, (char *)zv + sizeof(*zv),
				zv->size, to_va, &clen);
	kunmap_atomic(to_va);
	zs_unmap_object(zcache_host.zspool, handle);
	BUG_ON(ret);
	BUG_ON(clen != PAGE_SIZE);
}
#ifdef CONFIG_SYSFS
/*
 * show a distribution of compression stats for zv pages.
 */

static int zv_curr_dist_counts_show(char *buf)
{
	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		n = atomic_read(&zv_curr_dist_counts[i]);
		p += sprintf(p, "%lu ", n);
		chunks += n;
		sum_total_chunks += i * n;
	}
	p += sprintf(p, "mean:%lu\n",
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}

static int zv_cumul_dist_counts_show(char *buf)
{
	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		n = atomic_read(&zv_cumul_dist_counts[i]);
		p += sprintf(p, "%lu ", n);
		chunks += n;
		sum_total_chunks += i * n;
	}
	p += sprintf(p, "mean:%lu\n",
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}
/*
 * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
 * pages that don't compress to less than this value (including metadata
 * overhead) to be rejected.  We don't allow the value to get too close
 * to PAGE_SIZE.
 */
static ssize_t zv_max_zsize_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sprintf(buf, "%u\n", zv_max_zsize);
}

static ssize_t zv_max_zsize_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
		return -EINVAL;

	zv_max_zsize = val;
	return count;
}
/*
 * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
 * pages that don't compress to less than this value (including metadata
 * overhead) to be rejected UNLESS the mean compression is also smaller
 * than this value.  In other words, we are load-balancing-by-zsize the
 * accepted pages.  Again, we don't allow the value to get too close
 * to PAGE_SIZE.
 */
static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sprintf(buf, "%u\n", zv_max_mean_zsize);
}

static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
		return -EINVAL;

	zv_max_mean_zsize = val;
	return count;
}
/*
 * setting zv_page_count_policy_percent via sysfs sets an upper bound of
 * persistent (e.g. swap) pages that will be retained according to:
 *     (zv_page_count_policy_percent * totalram_pages) / 100
 * when that limit is reached, further puts will be rejected (until
 * some pages have been flushed).  Note that, due to compression,
 * this number may exceed 100; it defaults to 75 and we set an
 * arbitrary limit of 150.  A poor choice will almost certainly result
 * in OOM's, so this value should only be changed prudently.
 */
zv_page_count_policy_percent_show(struct kobject
*kobj
,
882 struct kobj_attribute
*attr
,
885 return sprintf(buf
, "%u\n", zv_page_count_policy_percent
);
888 static ssize_t
zv_page_count_policy_percent_store(struct kobject
*kobj
,
889 struct kobj_attribute
*attr
,
890 const char *buf
, size_t count
)
895 if (!capable(CAP_SYS_ADMIN
))
898 err
= kstrtoul(buf
, 10, &val
);
899 if (err
|| (val
== 0) || (val
> 150))
901 zv_page_count_policy_percent
= val
;
static struct kobj_attribute zcache_zv_max_zsize_attr = {
	.attr = { .name = "zv_max_zsize", .mode = 0644 },
	.show = zv_max_zsize_show,
	.store = zv_max_zsize_store,
};

static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
	.attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
	.show = zv_max_mean_zsize_show,
	.store = zv_max_mean_zsize_store,
};

static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
	.attr = { .name = "zv_page_count_policy_percent",
		  .mode = 0644 },
	.show = zv_page_count_policy_percent_show,
	.store = zv_page_count_policy_percent_store,
};
#endif /* CONFIG_SYSFS */
/*
 * zcache core code starts here
 */

/* useful stats not collected by cleancache or frontswap */
static unsigned long zcache_flush_total;
static unsigned long zcache_flush_found;
static unsigned long zcache_flobj_total;
static unsigned long zcache_flobj_found;
static unsigned long zcache_failed_eph_puts;
static unsigned long zcache_failed_pers_puts;
/*
 * Tmem operations assume the poolid implies the invoking client.
 * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
 * RAMster has each client numbered by cluster node, and a KVM version
 * of zcache would have one client per guest and each client might
 * have a poolid==N.
 */
static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli = NULL;

	cli = get_zcache_client(cli_id);
	if (cli == NULL)
		goto out;

	atomic_inc(&cli->refcount);
	pool = idr_find(&cli->tmem_pools, poolid);
	if (pool != NULL)
		atomic_inc(&pool->refcount);
out:
	return pool;
}

static void zcache_put_pool(struct tmem_pool *pool)
{
	struct zcache_client *cli = NULL;

	if (pool == NULL)
		BUG();
	cli = pool->client;
	atomic_dec(&pool->refcount);
	atomic_dec(&cli->refcount);
}
int zcache_new_client(uint16_t cli_id)
{
	struct zcache_client *cli;
	int ret = -1;

	cli = get_zcache_client(cli_id);
	if (cli == NULL)
		goto out;
#ifdef CONFIG_FRONTSWAP
	cli->zspool = zs_create_pool("zcache", ZCACHE_GFP_MASK);
	if (cli->zspool == NULL)
		goto out;
	idr_init(&cli->tmem_pools);
#endif
	ret = 0;
out:
	return ret;
}
/* counters for debugging */
static unsigned long zcache_failed_get_free_pages;
static unsigned long zcache_failed_alloc;
static unsigned long zcache_put_to_flush;

/*
 * for now, used named slabs so can easily track usage; later can
 * either just use kmalloc, or perhaps add a slab-like allocator
 * to more carefully manage total memory utilization
 */
static struct kmem_cache *zcache_objnode_cache;
static struct kmem_cache *zcache_obj_cache;
static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_obj_count_max;
static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_objnode_count_max;
/*
 * to avoid memory allocation recursion (e.g. due to direct reclaim), we
 * preload all necessary data structures so the hostops callbacks never
 * actually do a malloc
 */
struct zcache_preload {
	void *page;
	struct tmem_obj *obj;
	int nr;
	struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
};
static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
static int zcache_do_preload(struct tmem_pool *pool)
{
	struct zcache_preload *kp;
	struct tmem_objnode *objnode;
	struct tmem_obj *obj;
	void *page;
	int ret = -ENOMEM;

	if (unlikely(zcache_objnode_cache == NULL))
		goto out;
	if (unlikely(zcache_obj_cache == NULL))
		goto out;

	/* IRQ has already been disabled. */
	kp = &__get_cpu_var(zcache_preloads);
	while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
		objnode = kmem_cache_alloc(zcache_objnode_cache,
				ZCACHE_GFP_MASK);
		if (unlikely(objnode == NULL)) {
			zcache_failed_alloc++;
			goto out;
		}

		kp->objnodes[kp->nr++] = objnode;
	}

	if (!kp->obj) {
		obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
		if (unlikely(obj == NULL)) {
			zcache_failed_alloc++;
			goto out;
		}
		kp->obj = obj;
	}

	if (!kp->page) {
		page = (void *)__get_free_page(ZCACHE_GFP_MASK);
		if (unlikely(page == NULL)) {
			zcache_failed_get_free_pages++;
			goto out;
		}
		kp->page = page;
	}

	ret = 0;
out:
	return ret;
}
static void *zcache_get_free_page(void)
{
	struct zcache_preload *kp;
	void *page;

	kp = &__get_cpu_var(zcache_preloads);
	page = kp->page;
	BUG_ON(page == NULL);
	kp->page = NULL;
	return page;
}

static void zcache_free_page(void *p)
{
	free_page((unsigned long)p);
}
/*
 * zcache implementation for tmem host ops
 */

static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
{
	struct tmem_objnode *objnode = NULL;
	unsigned long count;
	struct zcache_preload *kp;

	kp = &__get_cpu_var(zcache_preloads);
	if (kp->nr <= 0)
		goto out;
	objnode = kp->objnodes[kp->nr - 1];
	BUG_ON(objnode == NULL);
	kp->objnodes[kp->nr - 1] = NULL;
	kp->nr--;
	count = atomic_inc_return(&zcache_curr_objnode_count);
	if (count > zcache_curr_objnode_count_max)
		zcache_curr_objnode_count_max = count;
out:
	return objnode;
}

static void zcache_objnode_free(struct tmem_objnode *objnode,
				struct tmem_pool *pool)
{
	atomic_dec(&zcache_curr_objnode_count);
	BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
	kmem_cache_free(zcache_objnode_cache, objnode);
}
static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
{
	struct tmem_obj *obj = NULL;
	unsigned long count;
	struct zcache_preload *kp;

	kp = &__get_cpu_var(zcache_preloads);
	obj = kp->obj;
	BUG_ON(obj == NULL);
	kp->obj = NULL;
	count = atomic_inc_return(&zcache_curr_obj_count);
	if (count > zcache_curr_obj_count_max)
		zcache_curr_obj_count_max = count;
	return obj;
}

static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
{
	atomic_dec(&zcache_curr_obj_count);
	BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
	kmem_cache_free(zcache_obj_cache, obj);
}
static struct tmem_hostops zcache_hostops = {
	.obj_alloc = zcache_obj_alloc,
	.obj_free = zcache_obj_free,
	.objnode_alloc = zcache_objnode_alloc,
	.objnode_free = zcache_objnode_free,
};
/*
 * zcache implementations for PAM page descriptor ops
 */

static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_eph_pampd_count_max;
static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_pers_pampd_count_max;

/* forward reference */
static int zcache_compress(struct page *from, void **out_va, unsigned *out_len);
static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
				struct tmem_pool *pool, struct tmem_oid *oid,
				uint32_t index)
{
	void *pampd = NULL, *cdata;
	unsigned clen;
	int ret;
	unsigned long count;
	struct page *page = (struct page *)(data);
	struct zcache_client *cli = pool->client;
	uint16_t client_id = get_client_id_from_client(cli);
	unsigned long zv_mean_zsize;
	unsigned long curr_pers_pampd_count;
	u64 total_zsize;

	if (eph) {
		ret = zcache_compress(page, &cdata, &clen);
		if (ret == 0)
			goto out;
		if (clen == 0 || clen > zbud_max_buddy_size()) {
			zcache_compress_poor++;
			goto out;
		}
		pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
						index, page, cdata, clen);
		if (pampd != NULL) {
			count = atomic_inc_return(&zcache_curr_eph_pampd_count);
			if (count > zcache_curr_eph_pampd_count_max)
				zcache_curr_eph_pampd_count_max = count;
		}
	} else {
		curr_pers_pampd_count =
			atomic_read(&zcache_curr_pers_pampd_count);
		if (curr_pers_pampd_count >
		    (zv_page_count_policy_percent * totalram_pages) / 100)
			goto out;
		ret = zcache_compress(page, &cdata, &clen);
		if (ret == 0)
			goto out;
		/* reject if compression is too poor */
		if (clen > zv_max_zsize) {
			zcache_compress_poor++;
			goto out;
		}
		/* reject if mean compression is too poor */
		if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
			total_zsize = zs_get_total_size_bytes(cli->zspool);
			zv_mean_zsize = div_u64(total_zsize,
						curr_pers_pampd_count);
			if (zv_mean_zsize > zv_max_mean_zsize) {
				zcache_mean_compress_poor++;
				goto out;
			}
		}
		pampd = (void *)zv_create(cli->zspool, pool->pool_id,
						oid, index, cdata, clen);
		if (pampd == NULL)
			goto out;
		count = atomic_inc_return(&zcache_curr_pers_pampd_count);
		if (count > zcache_curr_pers_pampd_count_max)
			zcache_curr_pers_pampd_count_max = count;
	}
out:
	return pampd;
}
/*
 * fill the pageframe corresponding to the struct page with the data
 * from the passed pampd
 */
static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
					void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oid, uint32_t index)
{
	int ret = 0;

	BUG_ON(is_ephemeral(pool));
	zv_decompress((struct page *)(data), (unsigned long)pampd);
	return ret;
}

/*
 * fill the pageframe corresponding to the struct page with the data
 * from the passed pampd
 */
static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
					void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oid, uint32_t index)
{
	int ret = 0;

	BUG_ON(!is_ephemeral(pool));
	if (zbud_decompress((struct page *)(data), pampd) < 0)
		ret = -EINVAL;
	zbud_free_and_delist((struct zbud_hdr *)pampd);
	atomic_dec(&zcache_curr_eph_pampd_count);
	return ret;
}
/*
 * free the pampd and remove it from any zcache lists
 * pampd must no longer be pointed to from any tmem data structures!
 */
static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index)
{
	struct zcache_client *cli = pool->client;

	if (is_ephemeral(pool)) {
		zbud_free_and_delist((struct zbud_hdr *)pampd);
		atomic_dec(&zcache_curr_eph_pampd_count);
		BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
	} else {
		zv_free(cli->zspool, (unsigned long)pampd);
		atomic_dec(&zcache_curr_pers_pampd_count);
		BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
	}
}
static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)
{
}

static void zcache_pampd_new_obj(struct tmem_obj *obj)
{
}

static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)
{
	return -1;
}

static bool zcache_pampd_is_remote(void *pampd)
{
	return 0;
}
static struct tmem_pamops zcache_pamops = {
	.create = zcache_pampd_create,
	.get_data = zcache_pampd_get_data,
	.get_data_and_free = zcache_pampd_get_data_and_free,
	.free = zcache_pampd_free,
	.free_obj = zcache_pampd_free_obj,
	.new_obj = zcache_pampd_new_obj,
	.replace_in_obj = zcache_pampd_replace_in_obj,
	.is_remote = zcache_pampd_is_remote,
};
/*
 * zcache compression/decompression and related per-cpu stuff
 */

static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
#define ZCACHE_DSTMEM_ORDER 1

static int zcache_compress(struct page *from, void **out_va, unsigned *out_len)
{
	int ret = 0;
	unsigned char *dmem = __get_cpu_var(zcache_dstmem);
	char *from_va;

	BUG_ON(!irqs_disabled());
	if (unlikely(dmem == NULL))
		goto out;  /* no buffer or no compressor so can't compress */
	*out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER;
	from_va = kmap_atomic(from);
	ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem,
				out_len);
	BUG_ON(ret);
	*out_va = dmem;
	kunmap_atomic(from_va);
	ret = 1;
out:
	return ret;
}
static int zcache_comp_cpu_up(int cpu)
{
	struct crypto_comp *tfm;

	tfm = crypto_alloc_comp(zcache_comp_name, 0, 0);
	if (IS_ERR(tfm))
		return NOTIFY_BAD;
	*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
	return NOTIFY_OK;
}

static void zcache_comp_cpu_down(int cpu)
{
	struct crypto_comp *tfm;

	tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
	crypto_free_comp(tfm);
	*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
}
static int zcache_cpu_notifier(struct notifier_block *nb,
				unsigned long action, void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct zcache_preload *kp;

	switch (action) {
	case CPU_UP_PREPARE:
		ret = zcache_comp_cpu_up(cpu);
		if (ret != NOTIFY_OK) {
			pr_err("zcache: can't allocate compressor transform\n");
			return ret;
		}
		per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
			GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		zcache_comp_cpu_down(cpu);
		free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
			ZCACHE_DSTMEM_ORDER);
		per_cpu(zcache_dstmem, cpu) = NULL;
		kp = &per_cpu(zcache_preloads, cpu);
		while (kp->nr) {
			kmem_cache_free(zcache_objnode_cache,
					kp->objnodes[kp->nr - 1]);
			kp->objnodes[kp->nr - 1] = NULL;
			kp->nr--;
		}
		if (kp->obj) {
			kmem_cache_free(zcache_obj_cache, kp->obj);
			kp->obj = NULL;
		}
		if (kp->page) {
			free_page((unsigned long)kp->page);
			kp->page = NULL;
		}
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block zcache_cpu_notifier_block = {
	.notifier_call = zcache_cpu_notifier
};
#ifdef CONFIG_SYSFS
#define ZCACHE_SYSFS_RO(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", zcache_##_name); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
	    return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
	    return _func(buf); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}
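
/*
 * For reference (illustrative only), ZCACHE_SYSFS_RO(flush_total) expands
 * to roughly:
 *
 *	static ssize_t zcache_flush_total_show(struct kobject *kobj,
 *				struct kobj_attribute *attr, char *buf)
 *	{
 *		return sprintf(buf, "%lu\n", zcache_flush_total);
 *	}
 *	static struct kobj_attribute zcache_flush_total_attr = {
 *		.attr = { .name = "flush_total", .mode = 0444 },
 *		.show = zcache_flush_total_show,
 *	}
 *
 * i.e. one read-only sysfs file per counter, exported via the attribute
 * group registered in zcache_init().
 */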
ZCACHE_SYSFS_RO(curr_obj_count_max);
ZCACHE_SYSFS_RO(curr_objnode_count_max);
ZCACHE_SYSFS_RO(flush_total);
ZCACHE_SYSFS_RO(flush_found);
ZCACHE_SYSFS_RO(flobj_total);
ZCACHE_SYSFS_RO(flobj_found);
ZCACHE_SYSFS_RO(failed_eph_puts);
ZCACHE_SYSFS_RO(failed_pers_puts);
ZCACHE_SYSFS_RO(zbud_curr_zbytes);
ZCACHE_SYSFS_RO(zbud_cumul_zpages);
ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
ZCACHE_SYSFS_RO(zbud_buddied_count);
ZCACHE_SYSFS_RO(zbpg_unused_list_count);
ZCACHE_SYSFS_RO(evicted_raw_pages);
ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
ZCACHE_SYSFS_RO(evicted_buddied_pages);
ZCACHE_SYSFS_RO(failed_get_free_pages);
ZCACHE_SYSFS_RO(failed_alloc);
ZCACHE_SYSFS_RO(put_to_flush);
ZCACHE_SYSFS_RO(compress_poor);
ZCACHE_SYSFS_RO(mean_compress_poor);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
			zbud_show_unbuddied_list_counts);
ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
			zbud_show_cumul_chunk_counts);
ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
			zv_curr_dist_counts_show);
ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
			zv_cumul_dist_counts_show);
static struct attribute *zcache_attrs[] = {
	&zcache_curr_obj_count_attr.attr,
	&zcache_curr_obj_count_max_attr.attr,
	&zcache_curr_objnode_count_attr.attr,
	&zcache_curr_objnode_count_max_attr.attr,
	&zcache_flush_total_attr.attr,
	&zcache_flobj_total_attr.attr,
	&zcache_flush_found_attr.attr,
	&zcache_flobj_found_attr.attr,
	&zcache_failed_eph_puts_attr.attr,
	&zcache_failed_pers_puts_attr.attr,
	&zcache_compress_poor_attr.attr,
	&zcache_mean_compress_poor_attr.attr,
	&zcache_zbud_curr_raw_pages_attr.attr,
	&zcache_zbud_curr_zpages_attr.attr,
	&zcache_zbud_curr_zbytes_attr.attr,
	&zcache_zbud_cumul_zpages_attr.attr,
	&zcache_zbud_cumul_zbytes_attr.attr,
	&zcache_zbud_buddied_count_attr.attr,
	&zcache_zbpg_unused_list_count_attr.attr,
	&zcache_evicted_raw_pages_attr.attr,
	&zcache_evicted_unbuddied_pages_attr.attr,
	&zcache_evicted_buddied_pages_attr.attr,
	&zcache_failed_get_free_pages_attr.attr,
	&zcache_failed_alloc_attr.attr,
	&zcache_put_to_flush_attr.attr,
	&zcache_zbud_unbuddied_list_counts_attr.attr,
	&zcache_zbud_cumul_chunk_counts_attr.attr,
	&zcache_zv_curr_dist_counts_attr.attr,
	&zcache_zv_cumul_dist_counts_attr.attr,
	&zcache_zv_max_zsize_attr.attr,
	&zcache_zv_max_mean_zsize_attr.attr,
	&zcache_zv_page_count_policy_percent_attr.attr,
	NULL,
};

static struct attribute_group zcache_attr_group = {
	.attrs = zcache_attrs,
	.name = "zcache",
};

#endif /* CONFIG_SYSFS */
/*
 * When zcache is disabled ("frozen"), pools can be created and destroyed,
 * but all puts (and thus all other operations that require memory allocation)
 * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
 * data consistency requires all puts while frozen to be converted into
 * flushes.
 */
static bool zcache_freeze;
/*
 * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
 */
static int shrink_zcache_memory(struct shrinker *shrink,
				struct shrink_control *sc)
{
	int ret = -1;
	int nr = sc->nr_to_scan;
	gfp_t gfp_mask = sc->gfp_mask;

	if (nr >= 0) {
		if (!(gfp_mask & __GFP_FS))
			/* does this case really need to be skipped? */
			goto out;
		zbud_evict_pages(nr);
	}
	ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
out:
	return ret;
}

static struct shrinker zcache_shrinker = {
	.shrink = shrink_zcache_memory,
	.seeks = DEFAULT_SEEKS,
};
/*
 * zcache shims between cleancache/frontswap ops and tmem
 */

static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
				uint32_t index, struct page *page)
{
	struct tmem_pool *pool;
	int ret = -1;

	BUG_ON(!irqs_disabled());
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (unlikely(pool == NULL))
		goto out;
	if (!zcache_freeze && zcache_do_preload(pool) == 0) {
		/* preload does preempt_disable on success */
		ret = tmem_put(pool, oidp, index, (char *)(page),
				PAGE_SIZE, 0, is_ephemeral(pool));
		if (ret < 0) {
			if (is_ephemeral(pool))
				zcache_failed_eph_puts++;
			else
				zcache_failed_pers_puts++;
		}
	} else {
		zcache_put_to_flush++;
		if (atomic_read(&pool->obj_count) > 0)
			/* the put fails whether the flush succeeds or not */
			(void)tmem_flush_page(pool, oidp, index);
	}

	zcache_put_pool(pool);
out:
	return ret;
}
static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
				uint32_t index, struct page *page)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;
	size_t size = PAGE_SIZE;

	local_irq_save(flags);
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_get(pool, oidp, index, (char *)(page),
					&size, 0, is_ephemeral(pool));
		zcache_put_pool(pool);
	}
	local_irq_restore(flags);
	return ret;
}
static int zcache_flush_page(int cli_id, int pool_id,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	zcache_flush_total++;
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_page(pool, oidp, index);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		zcache_flush_found++;
	local_irq_restore(flags);
	return ret;
}
static int zcache_flush_object(int cli_id, int pool_id,
				struct tmem_oid *oidp)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	zcache_flobj_total++;
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_object(pool, oidp);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		zcache_flobj_found++;
	local_irq_restore(flags);
	return ret;
}
static int zcache_destroy_pool(int cli_id, int pool_id)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli;
	int ret = -1;

	if (pool_id < 0)
		goto out;

	cli = get_zcache_client(cli_id);
	if (cli == NULL)
		goto out;

	atomic_inc(&cli->refcount);
	pool = idr_find(&cli->tmem_pools, pool_id);
	if (pool == NULL)
		goto out;
	idr_remove(&cli->tmem_pools, pool_id);
	/* wait for pool activity on other cpus to quiesce */
	while (atomic_read(&pool->refcount) != 0)
		;
	atomic_dec(&cli->refcount);
	local_bh_disable();
	ret = tmem_destroy_pool(pool);
	local_bh_enable();
	kfree(pool);
	pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
			pool_id, cli_id);
out:
	return ret;
}
static int zcache_new_pool(uint16_t cli_id, uint32_t flags)
{
	int poolid = -1;
	struct tmem_pool *pool;
	struct zcache_client *cli = NULL;
	int r;

	cli = get_zcache_client(cli_id);
	if (cli == NULL)
		goto out;

	atomic_inc(&cli->refcount);
	pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
	if (pool == NULL) {
		pr_info("zcache: pool creation failed: out of memory\n");
		goto out;
	}

	do {
		r = idr_pre_get(&cli->tmem_pools, GFP_ATOMIC);
		if (r != 1) {
			kfree(pool);
			pr_info("zcache: pool creation failed: out of memory\n");
			goto out;
		}
		r = idr_get_new(&cli->tmem_pools, pool, &poolid);
	} while (r == -EAGAIN);
	if (r) {
		pr_info("zcache: pool creation failed: error %d\n", r);
		kfree(pool);
		goto out;
	}

	atomic_set(&pool->refcount, 0);
	pool->client = cli;
	pool->pool_id = poolid;
	tmem_new_pool(pool, flags);
	pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
		flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
		poolid, cli_id);
out:
	if (cli != NULL)
		atomic_dec(&cli->refcount);
	return poolid;
}
/*
 * Two kernel functionalities currently can be layered on top of tmem.
 * These are "cleancache" which is used as a second-chance cache for clean
 * page cache pages; and "frontswap" which is used for swap pages
 * to avoid writes to disk.  A generic "shim" is provided here for each
 * to translate in-kernel semantics to zcache semantics.
 */
#ifdef CONFIG_CLEANCACHE
static void zcache_cleancache_put_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index, struct page *page)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	if (likely(ind == index))
		(void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page);
}

static int zcache_cleancache_get_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index, struct page *page)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;
	int ret = -1;

	if (likely(ind == index))
		ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page);
	return ret;
}

static void zcache_cleancache_flush_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	if (likely(ind == index))
		(void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);
}

static void zcache_cleancache_flush_inode(int pool_id,
					struct cleancache_filekey key)
{
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	(void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
}

static void zcache_cleancache_flush_fs(int pool_id)
{
	if (pool_id >= 0)
		(void)zcache_destroy_pool(LOCAL_CLIENT, pool_id);
}

static int zcache_cleancache_init_fs(size_t pagesize)
{
	BUG_ON(sizeof(struct cleancache_filekey) !=
				sizeof(struct tmem_oid));
	BUG_ON(pagesize != PAGE_SIZE);
	return zcache_new_pool(LOCAL_CLIENT, 0);
}

static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
{
	/* shared pools are unsupported and map to private */
	BUG_ON(sizeof(struct cleancache_filekey) !=
				sizeof(struct tmem_oid));
	BUG_ON(pagesize != PAGE_SIZE);
	return zcache_new_pool(LOCAL_CLIENT, 0);
}

static struct cleancache_ops zcache_cleancache_ops = {
	.put_page = zcache_cleancache_put_page,
	.get_page = zcache_cleancache_get_page,
	.invalidate_page = zcache_cleancache_flush_page,
	.invalidate_inode = zcache_cleancache_flush_inode,
	.invalidate_fs = zcache_cleancache_flush_fs,
	.init_shared_fs = zcache_cleancache_init_shared_fs,
	.init_fs = zcache_cleancache_init_fs
};

struct cleancache_ops zcache_cleancache_register_ops(void)
{
	struct cleancache_ops old_ops =
		cleancache_register_ops(&zcache_cleancache_ops);

	return old_ops;
}
#endif
#ifdef CONFIG_FRONTSWAP
/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
static int zcache_frontswap_poolid = -1;

/*
 * Swizzling increases objects per swaptype, increasing tmem concurrency
 * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
 * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
 * frontswap_load(), but has side-effects. Hence using 8.
 */
#define SWIZ_BITS		8
#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
#define iswiz(_ind)		(_ind >> SWIZ_BITS)
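
/*
 * Worked example (illustrative, SWIZ_BITS == 8): a page at swap offset
 * 0x1234 in swap type 1 maps to oid.oid[0] == _oswiz(1, 0x1234) ==
 * (1 << 8) | 0x34 == 0x134 and to index iswiz(0x1234) == 0x12, so
 * consecutive offsets are spread across 256 tmem objects per swap type.
 */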
static inline struct tmem_oid oswiz(unsigned type, u32 ind)
{
	struct tmem_oid oid = { .oid = { 0 } };
	oid.oid[0] = _oswiz(type, ind);
	return oid;
}

static int zcache_frontswap_store(unsigned type, pgoff_t offset,
				   struct page *page)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);
	int ret = -1;
	unsigned long flags;

	BUG_ON(!PageLocked(page));
	if (likely(ind64 == ind)) {
		local_irq_save(flags);
		ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,
					&oid, iswiz(ind), page);
		local_irq_restore(flags);
	}
	return ret;
}

/* returns 0 if the page was successfully gotten from frontswap, -1 if
 * was not present (should never happen!) */
static int zcache_frontswap_load(unsigned type, pgoff_t offset,
				   struct page *page)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);
	int ret = -1;

	BUG_ON(!PageLocked(page));
	if (likely(ind64 == ind))
		ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,
					&oid, iswiz(ind), page);
	return ret;
}

/* flush a single page from frontswap */
static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);

	if (likely(ind64 == ind))
		(void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,
					&oid, iswiz(ind));
}

/* flush all pages from the passed swaptype */
static void zcache_frontswap_flush_area(unsigned type)
{
	struct tmem_oid oid;
	int ind;

	for (ind = SWIZ_MASK; ind >= 0; ind--) {
		oid = oswiz(type, ind);
		(void)zcache_flush_object(LOCAL_CLIENT,
						zcache_frontswap_poolid, &oid);
	}
}

static void zcache_frontswap_init(unsigned ignored)
{
	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
	if (zcache_frontswap_poolid < 0)
		zcache_frontswap_poolid =
			zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST);
}

static struct frontswap_ops zcache_frontswap_ops = {
	.store = zcache_frontswap_store,
	.load = zcache_frontswap_load,
	.invalidate_page = zcache_frontswap_flush_page,
	.invalidate_area = zcache_frontswap_flush_area,
	.init = zcache_frontswap_init
};

struct frontswap_ops zcache_frontswap_register_ops(void)
{
	struct frontswap_ops old_ops =
		frontswap_register_ops(&zcache_frontswap_ops);

	return old_ops;
}
#endif
/*
 * zcache initialization
 * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
 * NOTHING HAPPENS!
 */

static int zcache_enabled;

static int __init enable_zcache(char *s)
{
	zcache_enabled = 1;
	return 1;
}
__setup("zcache", enable_zcache);

/* allow independent dynamic disabling of cleancache and frontswap */

static int use_cleancache = 1;

static int __init no_cleancache(char *s)
{
	use_cleancache = 0;
	return 1;
}
__setup("nocleancache", no_cleancache);

static int use_frontswap = 1;

static int __init no_frontswap(char *s)
{
	use_frontswap = 0;
	return 1;
}
__setup("nofrontswap", no_frontswap);

static int __init enable_zcache_compressor(char *s)
{
	strncpy(zcache_comp_name, s, ZCACHE_COMP_NAME_SZ);
	zcache_enabled = 1;
	return 1;
}
__setup("zcache=", enable_zcache_compressor);
static int __init zcache_comp_init(void)
{
	int ret = 0;

	/* check crypto algorithm */
	if (*zcache_comp_name != '\0') {
		ret = crypto_has_comp(zcache_comp_name, 0, 0);
		if (!ret)
			pr_info("zcache: %s not supported\n",
					zcache_comp_name);
	}
	if (!ret)
		strcpy(zcache_comp_name, "lzo");
	ret = crypto_has_comp(zcache_comp_name, 0, 0);
	if (!ret) {
		ret = 1;
		goto out;
	}
	pr_info("zcache: using %s compressor\n", zcache_comp_name);

	/* alloc percpu transforms */
	ret = 0;
	zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
	if (!zcache_comp_pcpu_tfms)
		ret = 1;
out:
	return ret;
}
static int __init zcache_init(void)
{
	int ret = 0;

#ifdef CONFIG_SYSFS
	ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
	if (ret) {
		pr_err("zcache: can't create sysfs\n");
		goto out;
	}
#endif /* CONFIG_SYSFS */

	if (zcache_enabled) {
		unsigned int cpu;

		tmem_register_hostops(&zcache_hostops);
		tmem_register_pamops(&zcache_pamops);
		ret = register_cpu_notifier(&zcache_cpu_notifier_block);
		if (ret) {
			pr_err("zcache: can't register cpu notifier\n");
			goto out;
		}
		ret = zcache_comp_init();
		if (ret) {
			pr_err("zcache: compressor initialization failed\n");
			goto out;
		}
		for_each_online_cpu(cpu) {
			void *pcpu = (void *)(long)cpu;
			zcache_cpu_notifier(&zcache_cpu_notifier_block,
				CPU_UP_PREPARE, pcpu);
		}
	}
	zcache_objnode_cache = kmem_cache_create("zcache_objnode",
				sizeof(struct tmem_objnode), 0, 0, NULL);
	zcache_obj_cache = kmem_cache_create("zcache_obj",
				sizeof(struct tmem_obj), 0, 0, NULL);
	ret = zcache_new_client(LOCAL_CLIENT);
	if (ret) {
		pr_err("zcache: can't create client\n");
		goto out;
	}

#ifdef CONFIG_CLEANCACHE
	if (zcache_enabled && use_cleancache) {
		struct cleancache_ops old_ops;

		zbud_init();
		register_shrinker(&zcache_shrinker);
		old_ops = zcache_cleancache_register_ops();
		pr_info("zcache: cleancache enabled using kernel "
			"transcendent memory and compression buddies\n");
		if (old_ops.init_fs != NULL)
			pr_warning("zcache: cleancache_ops overridden");
	}
#endif
#ifdef CONFIG_FRONTSWAP
	if (zcache_enabled && use_frontswap) {
		struct frontswap_ops old_ops;

		old_ops = zcache_frontswap_register_ops();
		pr_info("zcache: frontswap enabled using kernel "
			"transcendent memory and zsmalloc\n");
		if (old_ops.init != NULL)
			pr_warning("zcache: frontswap_ops overridden");
	}
#endif
out:
	return ret;
}

module_init(zcache_init)