4 * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
5 * Copyright (c) 2010,2011, Nitin Gupta
7 * Zcache provides an in-kernel "host implementation" for transcendent memory
8 * and, thus indirectly, for cleancache and frontswap. Zcache includes two
9 * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
10 * 1) "compression buddies" ("zbud") is used for ephemeral pages
11 * 2) xvmalloc is used for persistent pages.
12 * Xvmalloc (based on the TLSF allocator) has very low fragmentation
13 * so maximizes space efficiency, while zbud allows pairs (and potentially,
14 * in the future, more than a pair of) compressed pages to be closely linked
15 * so that reclaiming can be done via the kernel's physical-page-oriented
16 * "shrinker" interface.
18 * [1] For a definition of page-accessible memory (aka PAM), see:
19 * http://marc.info/?l=linux-mm&m=127811271605009
22 #include <linux/module.h>
23 #include <linux/cpu.h>
24 #include <linux/highmem.h>
25 #include <linux/list.h>
26 #include <linux/lzo.h>
27 #include <linux/slab.h>
28 #include <linux/spinlock.h>
29 #include <linux/types.h>
30 #include <linux/atomic.h>
31 #include <linux/math64.h>
34 #include "../zram/xvmalloc.h" /* if built in drivers/staging */
36 #if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
37 #error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
39 #ifdef CONFIG_CLEANCACHE
40 #include <linux/cleancache.h>
42 #ifdef CONFIG_FRONTSWAP
43 #include <linux/frontswap.h>
47 /* this is more aggressive but may cause other problems? */
48 #define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
50 #define ZCACHE_GFP_MASK \
51 (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
54 #define MAX_POOLS_PER_CLIENT 16
56 #define MAX_CLIENTS 16
57 #define LOCAL_CLIENT ((uint16_t)-1)
59 MODULE_LICENSE("GPL");
61 struct zcache_client
{
62 struct tmem_pool
*tmem_pools
[MAX_POOLS_PER_CLIENT
];
63 struct xv_pool
*xvpool
;
68 static struct zcache_client zcache_host
;
69 static struct zcache_client zcache_clients
[MAX_CLIENTS
];
71 static inline uint16_t get_client_id_from_client(struct zcache_client
*cli
)
74 if (cli
== &zcache_host
)
76 return cli
- &zcache_clients
[0];
79 static inline bool is_local_client(struct zcache_client
*cli
)
81 return cli
== &zcache_host
;
85 * Compression buddies ("zbud") provides for packing two (or, possibly
86 * in the future, more) compressed ephemeral pages into a single "raw"
87 * (physical) page and tracking them with data structures so that
88 * the raw pages can be easily reclaimed.
90 * A zbud page ("zbpg") is an aligned page containing a list_head,
91 * a lock, and two "zbud headers". The remainder of the physical
92 * page is divided up into aligned 64-byte "chunks" which contain
93 * the compressed data for zero, one, or two zbuds. Each zbpg
94 * resides on: (1) an "unused list" if it has no zbuds; (2) a
95 * "buddied" list if it is fully populated with two zbuds; or
96 * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
97 * the one unbuddied zbud uses. The data inside a zbpg cannot be
98 * read or written unless the zbpg's lock is held.
101 #define ZBH_SENTINEL 0x43214321
102 #define ZBPG_SENTINEL 0xdeadbeef
104 #define ZBUD_MAX_BUDS 2
111 uint16_t size
; /* compressed size in bytes, zero means unused */
116 struct list_head bud_list
;
118 struct zbud_hdr buddy
[ZBUD_MAX_BUDS
];
120 /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
123 #define CHUNK_SHIFT 6
124 #define CHUNK_SIZE (1 << CHUNK_SHIFT)
125 #define CHUNK_MASK (~(CHUNK_SIZE-1))
126 #define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \
127 CHUNK_MASK) >> CHUNK_SHIFT)
128 #define MAX_CHUNK (NCHUNKS-1)
131 struct list_head list
;
133 } zbud_unbuddied
[NCHUNKS
];
134 /* list N contains pages with N chunks USED and NCHUNKS-N unused */
135 /* element 0 is never used but optimizing that isn't worth it */
136 static unsigned long zbud_cumul_chunk_counts
[NCHUNKS
];
138 struct list_head zbud_buddied_list
;
139 static unsigned long zcache_zbud_buddied_count
;
141 /* protects the buddied list and all unbuddied lists */
142 static DEFINE_SPINLOCK(zbud_budlists_spinlock
);
144 static LIST_HEAD(zbpg_unused_list
);
145 static unsigned long zcache_zbpg_unused_list_count
;
147 /* protects the unused page list */
148 static DEFINE_SPINLOCK(zbpg_unused_list_spinlock
);
150 static atomic_t zcache_zbud_curr_raw_pages
;
151 static atomic_t zcache_zbud_curr_zpages
;
152 static unsigned long zcache_zbud_curr_zbytes
;
153 static unsigned long zcache_zbud_cumul_zpages
;
154 static unsigned long zcache_zbud_cumul_zbytes
;
155 static unsigned long zcache_compress_poor
;
156 static unsigned long zcache_mean_compress_poor
;
158 /* forward references */
159 static void *zcache_get_free_page(void);
160 static void zcache_free_page(void *p
);
163 * zbud helper functions
166 static inline unsigned zbud_max_buddy_size(void)
168 return MAX_CHUNK
<< CHUNK_SHIFT
;
171 static inline unsigned zbud_size_to_chunks(unsigned size
)
173 BUG_ON(size
== 0 || size
> zbud_max_buddy_size());
174 return (size
+ CHUNK_SIZE
- 1) >> CHUNK_SHIFT
;
177 static inline int zbud_budnum(struct zbud_hdr
*zh
)
179 unsigned offset
= (unsigned long)zh
& (PAGE_SIZE
- 1);
180 struct zbud_page
*zbpg
= NULL
;
181 unsigned budnum
= -1U;
184 for (i
= 0; i
< ZBUD_MAX_BUDS
; i
++)
185 if (offset
== offsetof(typeof(*zbpg
), buddy
[i
])) {
189 BUG_ON(budnum
== -1U);
193 static char *zbud_data(struct zbud_hdr
*zh
, unsigned size
)
195 struct zbud_page
*zbpg
;
199 ASSERT_SENTINEL(zh
, ZBH
);
200 budnum
= zbud_budnum(zh
);
201 BUG_ON(size
== 0 || size
> zbud_max_buddy_size());
202 zbpg
= container_of(zh
, struct zbud_page
, buddy
[budnum
]);
203 ASSERT_SPINLOCK(&zbpg
->lock
);
206 p
+= ((sizeof(struct zbud_page
) + CHUNK_SIZE
- 1) &
208 else if (budnum
== 1)
209 p
+= PAGE_SIZE
- ((size
+ CHUNK_SIZE
- 1) & CHUNK_MASK
);
214 * zbud raw page management
217 static struct zbud_page
*zbud_alloc_raw_page(void)
219 struct zbud_page
*zbpg
= NULL
;
220 struct zbud_hdr
*zh0
, *zh1
;
223 /* if any pages on the zbpg list, use one */
224 spin_lock(&zbpg_unused_list_spinlock
);
225 if (!list_empty(&zbpg_unused_list
)) {
226 zbpg
= list_first_entry(&zbpg_unused_list
,
227 struct zbud_page
, bud_list
);
228 list_del_init(&zbpg
->bud_list
);
229 zcache_zbpg_unused_list_count
--;
232 spin_unlock(&zbpg_unused_list_spinlock
);
234 /* none on zbpg list, try to get a kernel page */
235 zbpg
= zcache_get_free_page();
236 if (likely(zbpg
!= NULL
)) {
237 INIT_LIST_HEAD(&zbpg
->bud_list
);
238 zh0
= &zbpg
->buddy
[0]; zh1
= &zbpg
->buddy
[1];
239 spin_lock_init(&zbpg
->lock
);
241 ASSERT_INVERTED_SENTINEL(zbpg
, ZBPG
);
242 SET_SENTINEL(zbpg
, ZBPG
);
243 BUG_ON(zh0
->size
!= 0 || tmem_oid_valid(&zh0
->oid
));
244 BUG_ON(zh1
->size
!= 0 || tmem_oid_valid(&zh1
->oid
));
246 atomic_inc(&zcache_zbud_curr_raw_pages
);
247 INIT_LIST_HEAD(&zbpg
->bud_list
);
248 SET_SENTINEL(zbpg
, ZBPG
);
249 zh0
->size
= 0; zh1
->size
= 0;
250 tmem_oid_set_invalid(&zh0
->oid
);
251 tmem_oid_set_invalid(&zh1
->oid
);
257 static void zbud_free_raw_page(struct zbud_page
*zbpg
)
259 struct zbud_hdr
*zh0
= &zbpg
->buddy
[0], *zh1
= &zbpg
->buddy
[1];
261 ASSERT_SENTINEL(zbpg
, ZBPG
);
262 BUG_ON(!list_empty(&zbpg
->bud_list
));
263 ASSERT_SPINLOCK(&zbpg
->lock
);
264 BUG_ON(zh0
->size
!= 0 || tmem_oid_valid(&zh0
->oid
));
265 BUG_ON(zh1
->size
!= 0 || tmem_oid_valid(&zh1
->oid
));
266 INVERT_SENTINEL(zbpg
, ZBPG
);
267 spin_unlock(&zbpg
->lock
);
268 spin_lock(&zbpg_unused_list_spinlock
);
269 list_add(&zbpg
->bud_list
, &zbpg_unused_list
);
270 zcache_zbpg_unused_list_count
++;
271 spin_unlock(&zbpg_unused_list_spinlock
);
275 * core zbud handling routines
278 static unsigned zbud_free(struct zbud_hdr
*zh
)
282 ASSERT_SENTINEL(zh
, ZBH
);
283 BUG_ON(!tmem_oid_valid(&zh
->oid
));
285 BUG_ON(zh
->size
== 0 || zh
->size
> zbud_max_buddy_size());
287 tmem_oid_set_invalid(&zh
->oid
);
288 INVERT_SENTINEL(zh
, ZBH
);
289 zcache_zbud_curr_zbytes
-= size
;
290 atomic_dec(&zcache_zbud_curr_zpages
);
294 static void zbud_free_and_delist(struct zbud_hdr
*zh
)
297 struct zbud_hdr
*zh_other
;
298 unsigned budnum
= zbud_budnum(zh
), size
;
299 struct zbud_page
*zbpg
=
300 container_of(zh
, struct zbud_page
, buddy
[budnum
]);
302 spin_lock(&zbpg
->lock
);
303 if (list_empty(&zbpg
->bud_list
)) {
304 /* ignore zombie page... see zbud_evict_pages() */
305 spin_unlock(&zbpg
->lock
);
308 size
= zbud_free(zh
);
309 ASSERT_SPINLOCK(&zbpg
->lock
);
310 zh_other
= &zbpg
->buddy
[(budnum
== 0) ? 1 : 0];
311 if (zh_other
->size
== 0) { /* was unbuddied: unlist and free */
312 chunks
= zbud_size_to_chunks(size
) ;
313 spin_lock(&zbud_budlists_spinlock
);
314 BUG_ON(list_empty(&zbud_unbuddied
[chunks
].list
));
315 list_del_init(&zbpg
->bud_list
);
316 zbud_unbuddied
[chunks
].count
--;
317 spin_unlock(&zbud_budlists_spinlock
);
318 zbud_free_raw_page(zbpg
);
319 } else { /* was buddied: move remaining buddy to unbuddied list */
320 chunks
= zbud_size_to_chunks(zh_other
->size
) ;
321 spin_lock(&zbud_budlists_spinlock
);
322 list_del_init(&zbpg
->bud_list
);
323 zcache_zbud_buddied_count
--;
324 list_add_tail(&zbpg
->bud_list
, &zbud_unbuddied
[chunks
].list
);
325 zbud_unbuddied
[chunks
].count
++;
326 spin_unlock(&zbud_budlists_spinlock
);
327 spin_unlock(&zbpg
->lock
);
331 static struct zbud_hdr
*zbud_create(uint16_t client_id
, uint16_t pool_id
,
332 struct tmem_oid
*oid
,
333 uint32_t index
, struct page
*page
,
334 void *cdata
, unsigned size
)
336 struct zbud_hdr
*zh0
, *zh1
, *zh
= NULL
;
337 struct zbud_page
*zbpg
= NULL
, *ztmp
;
340 int i
, found_good_buddy
= 0;
342 nchunks
= zbud_size_to_chunks(size
) ;
343 for (i
= MAX_CHUNK
- nchunks
+ 1; i
> 0; i
--) {
344 spin_lock(&zbud_budlists_spinlock
);
345 if (!list_empty(&zbud_unbuddied
[i
].list
)) {
346 list_for_each_entry_safe(zbpg
, ztmp
,
347 &zbud_unbuddied
[i
].list
, bud_list
) {
348 if (spin_trylock(&zbpg
->lock
)) {
349 found_good_buddy
= i
;
350 goto found_unbuddied
;
354 spin_unlock(&zbud_budlists_spinlock
);
356 /* didn't find a good buddy, try allocating a new page */
357 zbpg
= zbud_alloc_raw_page();
358 if (unlikely(zbpg
== NULL
))
360 /* ok, have a page; cdata is already compressed, so take the locks now */
361 spin_lock(&zbpg
->lock
);
362 spin_lock(&zbud_budlists_spinlock
);
363 list_add_tail(&zbpg
->bud_list
, &zbud_unbuddied
[nchunks
].list
);
364 zbud_unbuddied
[nchunks
].count
++;
365 zh
= &zbpg
->buddy
[0];
369 ASSERT_SPINLOCK(&zbpg
->lock
);
370 zh0
= &zbpg
->buddy
[0]; zh1
= &zbpg
->buddy
[1];
371 BUG_ON(!((zh0
->size
== 0) ^ (zh1
->size
== 0)));
372 if (zh0
->size
!= 0) { /* buddy0 in use, buddy1 is vacant */
373 ASSERT_SENTINEL(zh0
, ZBH
);
375 } else if (zh1
->size
!= 0) { /* buddy1 in use, buddy0 is vacant */
376 ASSERT_SENTINEL(zh1
, ZBH
);
380 list_del_init(&zbpg
->bud_list
);
381 zbud_unbuddied
[found_good_buddy
].count
--;
382 list_add_tail(&zbpg
->bud_list
, &zbud_buddied_list
);
383 zcache_zbud_buddied_count
++;
386 SET_SENTINEL(zh
, ZBH
);
390 zh
->pool_id
= pool_id
;
391 zh
->client_id
= client_id
;
392 /* can wait to copy the data until the list locks are dropped */
393 spin_unlock(&zbud_budlists_spinlock
);
395 to
= zbud_data(zh
, size
);
396 memcpy(to
, cdata
, size
);
397 spin_unlock(&zbpg
->lock
);
398 zbud_cumul_chunk_counts
[nchunks
]++;
399 atomic_inc(&zcache_zbud_curr_zpages
);
400 zcache_zbud_cumul_zpages
++;
401 zcache_zbud_curr_zbytes
+= size
;
402 zcache_zbud_cumul_zbytes
+= size
;
407 static int zbud_decompress(struct page
*page
, struct zbud_hdr
*zh
)
409 struct zbud_page
*zbpg
;
410 unsigned budnum
= zbud_budnum(zh
);
411 size_t out_len
= PAGE_SIZE
;
412 char *to_va
, *from_va
;
416 zbpg
= container_of(zh
, struct zbud_page
, buddy
[budnum
]);
417 spin_lock(&zbpg
->lock
);
418 if (list_empty(&zbpg
->bud_list
)) {
419 /* ignore zombie page... see zbud_evict_pages() */
423 ASSERT_SENTINEL(zh
, ZBH
);
424 BUG_ON(zh
->size
== 0 || zh
->size
> zbud_max_buddy_size());
425 to_va
= kmap_atomic(page
, KM_USER0
);
427 from_va
= zbud_data(zh
, size
);
428 ret
= lzo1x_decompress_safe(from_va
, size
, to_va
, &out_len
);
429 BUG_ON(ret
!= LZO_E_OK
);
430 BUG_ON(out_len
!= PAGE_SIZE
);
431 kunmap_atomic(to_va
, KM_USER0
);
433 spin_unlock(&zbpg
->lock
);
438 * The following routines handle shrinking of ephemeral pages by evicting
439 * pages "least valuable" first.
442 static unsigned long zcache_evicted_raw_pages
;
443 static unsigned long zcache_evicted_buddied_pages
;
444 static unsigned long zcache_evicted_unbuddied_pages
;
446 static struct tmem_pool
*zcache_get_pool_by_id(uint16_t cli_id
,
448 static void zcache_put_pool(struct tmem_pool
*pool
);
451 * Flush and free all zbuds in a zbpg, then free the pageframe
453 static void zbud_evict_zbpg(struct zbud_page
*zbpg
)
457 uint32_t pool_id
[ZBUD_MAX_BUDS
], client_id
[ZBUD_MAX_BUDS
];
458 uint32_t index
[ZBUD_MAX_BUDS
];
459 struct tmem_oid oid
[ZBUD_MAX_BUDS
];
460 struct tmem_pool
*pool
;
462 ASSERT_SPINLOCK(&zbpg
->lock
);
463 BUG_ON(!list_empty(&zbpg
->bud_list
));
464 for (i
= 0, j
= 0; i
< ZBUD_MAX_BUDS
; i
++) {
465 zh
= &zbpg
->buddy
[i
];
467 client_id
[j
] = zh
->client_id
;
468 pool_id
[j
] = zh
->pool_id
;
470 index
[j
] = zh
->index
;
475 spin_unlock(&zbpg
->lock
);
476 for (i
= 0; i
< j
; i
++) {
477 pool
= zcache_get_pool_by_id(client_id
[i
], pool_id
[i
]);
479 tmem_flush_page(pool
, &oid
[i
], index
[i
]);
480 zcache_put_pool(pool
);
483 ASSERT_SENTINEL(zbpg
, ZBPG
);
484 spin_lock(&zbpg
->lock
);
485 zbud_free_raw_page(zbpg
);
489 * Free nr pages. This code is funky because we want to hold the locks
490 * protecting various lists for as short a time as possible, and in some
491 * circumstances the list may change asynchronously when the list lock is
492 * not held. In some cases we also trylock not only to avoid waiting on a
493 * page in use by another cpu, but also to avoid potential deadlock due to
496 static void zbud_evict_pages(int nr
)
498 struct zbud_page
*zbpg
;
501 /* first try freeing any pages on unused list */
503 spin_lock_bh(&zbpg_unused_list_spinlock
);
504 if (!list_empty(&zbpg_unused_list
)) {
505 /* can't walk list here, since it may change when unlocked */
506 zbpg
= list_first_entry(&zbpg_unused_list
,
507 struct zbud_page
, bud_list
);
508 list_del_init(&zbpg
->bud_list
);
509 zcache_zbpg_unused_list_count
--;
510 atomic_dec(&zcache_zbud_curr_raw_pages
);
511 spin_unlock_bh(&zbpg_unused_list_spinlock
);
512 zcache_free_page(zbpg
);
513 zcache_evicted_raw_pages
++;
516 goto retry_unused_list
;
518 spin_unlock_bh(&zbpg_unused_list_spinlock
);
520 /* now try freeing unbuddied pages, starting with least space avail */
521 for (i
= 0; i
< MAX_CHUNK
; i
++) {
523 spin_lock_bh(&zbud_budlists_spinlock
);
524 if (list_empty(&zbud_unbuddied
[i
].list
)) {
525 spin_unlock_bh(&zbud_budlists_spinlock
);
528 list_for_each_entry(zbpg
, &zbud_unbuddied
[i
].list
, bud_list
) {
529 if (unlikely(!spin_trylock(&zbpg
->lock
)))
531 list_del_init(&zbpg
->bud_list
);
532 zbud_unbuddied
[i
].count
--;
533 spin_unlock(&zbud_budlists_spinlock
);
534 zcache_evicted_unbuddied_pages
++;
535 /* want budlists unlocked when doing zbpg eviction */
536 zbud_evict_zbpg(zbpg
);
540 goto retry_unbud_list_i
;
542 spin_unlock_bh(&zbud_budlists_spinlock
);
545 /* as a last resort, free buddied pages */
547 spin_lock_bh(&zbud_budlists_spinlock
);
548 if (list_empty(&zbud_buddied_list
)) {
549 spin_unlock_bh(&zbud_budlists_spinlock
);
552 list_for_each_entry(zbpg
, &zbud_buddied_list
, bud_list
) {
553 if (unlikely(!spin_trylock(&zbpg
->lock
)))
555 list_del_init(&zbpg
->bud_list
);
556 zcache_zbud_buddied_count
--;
557 spin_unlock(&zbud_budlists_spinlock
);
558 zcache_evicted_buddied_pages
++;
559 /* want budlists unlocked when doing zbpg eviction */
560 zbud_evict_zbpg(zbpg
);
566 spin_unlock_bh(&zbud_budlists_spinlock
);
571 static void zbud_init(void)
575 INIT_LIST_HEAD(&zbud_buddied_list
);
576 zcache_zbud_buddied_count
= 0;
577 for (i
= 0; i
< NCHUNKS
; i
++) {
578 INIT_LIST_HEAD(&zbud_unbuddied
[i
].list
);
579 zbud_unbuddied
[i
].count
= 0;
585 * These sysfs routines show a nice distribution of how many zbpg's are
586 * currently (and have ever been) placed in each unbuddied list. It's fun
587 * to watch but can probably go away before final merge.
589 static int zbud_show_unbuddied_list_counts(char *buf
)
594 for (i
= 0; i
< NCHUNKS
; i
++)
595 p
+= sprintf(p
, "%u ", zbud_unbuddied
[i
].count
);
599 static int zbud_show_cumul_chunk_counts(char *buf
)
601 unsigned long i
, chunks
= 0, total_chunks
= 0, sum_total_chunks
= 0;
602 unsigned long total_chunks_lte_21
= 0, total_chunks_lte_32
= 0;
603 unsigned long total_chunks_lte_42
= 0;
606 for (i
= 0; i
< NCHUNKS
; i
++) {
607 p
+= sprintf(p
, "%lu ", zbud_cumul_chunk_counts
[i
]);
608 chunks
+= zbud_cumul_chunk_counts
[i
];
609 total_chunks
+= zbud_cumul_chunk_counts
[i
];
610 sum_total_chunks
+= i
* zbud_cumul_chunk_counts
[i
];
612 total_chunks_lte_21
= total_chunks
;
614 total_chunks_lte_32
= total_chunks
;
616 total_chunks_lte_42
= total_chunks
;
618 p
+= sprintf(p
, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
619 total_chunks_lte_21
, total_chunks_lte_32
, total_chunks_lte_42
,
620 chunks
== 0 ? 0 : sum_total_chunks
/ chunks
);
626 * This "zv" PAM implementation combines the TLSF-based xvMalloc
627 * with lzo1x compression to maximize the amount of data that can
628 * be packed into a physical page.
630 * Zv represents a PAM page with the index and object (plus a "size" value
631 * necessary for decompression) immediately preceding the compressed data.
634 #define ZVH_SENTINEL 0x43214321
643 /* rudimentary policy limits */
644 /* total number of persistent pages may not exceed this percentage */
645 static unsigned int zv_page_count_policy_percent
= 75;
647 * byte count defining poor compression; pages with greater zsize will be
650 static unsigned int zv_max_zsize
= (PAGE_SIZE
/ 8) * 7;
652 * byte count defining poor *mean* compression; pages with greater zsize
653 * will be rejected until sufficient better-compressed pages are accepted
654 * driving the mean below this threshold
656 static unsigned int zv_max_mean_zsize
= (PAGE_SIZE
/ 8) * 5;
658 static unsigned long zv_curr_dist_counts
[NCHUNKS
];
659 static unsigned long zv_cumul_dist_counts
[NCHUNKS
];
661 static struct zv_hdr
*zv_create(struct xv_pool
*xvpool
, uint32_t pool_id
,
662 struct tmem_oid
*oid
, uint32_t index
,
663 void *cdata
, unsigned clen
)
666 struct zv_hdr
*zv
= NULL
;
668 int alloc_size
= clen
+ sizeof(struct zv_hdr
);
669 int chunks
= (alloc_size
+ (CHUNK_SIZE
- 1)) >> CHUNK_SHIFT
;
672 BUG_ON(!irqs_disabled());
673 BUG_ON(chunks
>= NCHUNKS
);
674 ret
= xv_malloc(xvpool
, alloc_size
,
675 &page
, &offset
, ZCACHE_GFP_MASK
);
678 zv_curr_dist_counts
[chunks
]++;
679 zv_cumul_dist_counts
[chunks
]++;
680 zv
= kmap_atomic(page
, KM_USER0
) + offset
;
683 zv
->pool_id
= pool_id
;
684 SET_SENTINEL(zv
, ZVH
);
685 memcpy((char *)zv
+ sizeof(struct zv_hdr
), cdata
, clen
);
686 kunmap_atomic(zv
, KM_USER0
);
691 static void zv_free(struct xv_pool
*xvpool
, struct zv_hdr
*zv
)
696 uint16_t size
= xv_get_object_size(zv
);
697 int chunks
= (size
+ (CHUNK_SIZE
- 1)) >> CHUNK_SHIFT
;
699 ASSERT_SENTINEL(zv
, ZVH
);
700 BUG_ON(chunks
>= NCHUNKS
);
701 zv_curr_dist_counts
[chunks
]--;
704 INVERT_SENTINEL(zv
, ZVH
);
705 page
= virt_to_page(zv
);
706 offset
= (unsigned long)zv
& ~PAGE_MASK
;
707 local_irq_save(flags
);
708 xv_free(xvpool
, page
, offset
);
709 local_irq_restore(flags
);
712 static void zv_decompress(struct page
*page
, struct zv_hdr
*zv
)
714 size_t clen
= PAGE_SIZE
;
719 ASSERT_SENTINEL(zv
, ZVH
);
720 size
= xv_get_object_size(zv
) - sizeof(*zv
);
722 to_va
= kmap_atomic(page
, KM_USER0
);
723 ret
= lzo1x_decompress_safe((char *)zv
+ sizeof(*zv
),
725 kunmap_atomic(to_va
, KM_USER0
);
726 BUG_ON(ret
!= LZO_E_OK
);
727 BUG_ON(clen
!= PAGE_SIZE
);
732 * show a distribution of compression stats for zv pages.
735 static int zv_curr_dist_counts_show(char *buf
)
737 unsigned long i
, n
, chunks
= 0, sum_total_chunks
= 0;
740 for (i
= 0; i
< NCHUNKS
; i
++) {
741 n
= zv_curr_dist_counts
[i
];
742 p
+= sprintf(p
, "%lu ", n
);
744 sum_total_chunks
+= i
* n
;
746 p
+= sprintf(p
, "mean:%lu\n",
747 chunks
== 0 ? 0 : sum_total_chunks
/ chunks
);
751 static int zv_cumul_dist_counts_show(char *buf
)
753 unsigned long i
, n
, chunks
= 0, sum_total_chunks
= 0;
756 for (i
= 0; i
< NCHUNKS
; i
++) {
757 n
= zv_cumul_dist_counts
[i
];
758 p
+= sprintf(p
, "%lu ", n
);
760 sum_total_chunks
+= i
* n
;
762 p
+= sprintf(p
, "mean:%lu\n",
763 chunks
== 0 ? 0 : sum_total_chunks
/ chunks
);
768 * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
769 * pages that don't compress to less than this value (including metadata
770 * overhead) to be rejected. We don't allow the value to get too close
773 static ssize_t
zv_max_zsize_show(struct kobject
*kobj
,
774 struct kobj_attribute
*attr
,
777 return sprintf(buf
, "%u\n", zv_max_zsize
);
780 static ssize_t
zv_max_zsize_store(struct kobject
*kobj
,
781 struct kobj_attribute
*attr
,
782 const char *buf
, size_t count
)
787 if (!capable(CAP_SYS_ADMIN
))
790 err
= strict_strtoul(buf
, 10, &val
);
791 if (err
|| (val
== 0) || (val
> (PAGE_SIZE
/ 8) * 7))
798 * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
799 * pages that don't compress to less than this value (including metadata
800 * overhead) to be rejected UNLESS the mean compression is also smaller
801 * than this value. In other words, we are load-balancing-by-zsize the
802 * accepted pages. Again, we don't allow the value to get too close
805 static ssize_t
zv_max_mean_zsize_show(struct kobject
*kobj
,
806 struct kobj_attribute
*attr
,
809 return sprintf(buf
, "%u\n", zv_max_mean_zsize
);
812 static ssize_t
zv_max_mean_zsize_store(struct kobject
*kobj
,
813 struct kobj_attribute
*attr
,
814 const char *buf
, size_t count
)
819 if (!capable(CAP_SYS_ADMIN
))
822 err
= strict_strtoul(buf
, 10, &val
);
823 if (err
|| (val
== 0) || (val
> (PAGE_SIZE
/ 8) * 7))
825 zv_max_mean_zsize
= val
;
830 * setting zv_page_count_policy_percent via sysfs sets an upper bound of
831 * persistent (e.g. swap) pages that will be retained according to:
832 * (zv_page_count_policy_percent * totalram_pages) / 100)
833 * when that limit is reached, further puts will be rejected (until
834 * some pages have been flushed). Note that, due to compression,
835 * this number may exceed 100; it defaults to 75 and we set an
836 * arbitrary limit of 150. A poor choice will almost certainly result
837 * in OOMs, so this value should only be changed prudently.
839 static ssize_t
zv_page_count_policy_percent_show(struct kobject
*kobj
,
840 struct kobj_attribute
*attr
,
843 return sprintf(buf
, "%u\n", zv_page_count_policy_percent
);
846 static ssize_t
zv_page_count_policy_percent_store(struct kobject
*kobj
,
847 struct kobj_attribute
*attr
,
848 const char *buf
, size_t count
)
853 if (!capable(CAP_SYS_ADMIN
))
856 err
= strict_strtoul(buf
, 10, &val
);
857 if (err
|| (val
== 0) || (val
> 150))
859 zv_page_count_policy_percent
= val
;
863 static struct kobj_attribute zcache_zv_max_zsize_attr
= {
864 .attr
= { .name
= "zv_max_zsize", .mode
= 0644 },
865 .show
= zv_max_zsize_show
,
866 .store
= zv_max_zsize_store
,
869 static struct kobj_attribute zcache_zv_max_mean_zsize_attr
= {
870 .attr
= { .name
= "zv_max_mean_zsize", .mode
= 0644 },
871 .show
= zv_max_mean_zsize_show
,
872 .store
= zv_max_mean_zsize_store
,
875 static struct kobj_attribute zcache_zv_page_count_policy_percent_attr
= {
876 .attr
= { .name
= "zv_page_count_policy_percent",
878 .show
= zv_page_count_policy_percent_show
,
879 .store
= zv_page_count_policy_percent_store
,
884 * zcache core code starts here
887 /* useful stats not collected by cleancache or frontswap */
888 static unsigned long zcache_flush_total
;
889 static unsigned long zcache_flush_found
;
890 static unsigned long zcache_flobj_total
;
891 static unsigned long zcache_flobj_found
;
892 static unsigned long zcache_failed_eph_puts
;
893 static unsigned long zcache_failed_pers_puts
;
896 * Tmem operations assume the poolid implies the invoking client.
897 * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
898 * RAMster has each client numbered by cluster node, and a KVM version
899 * of zcache would have one client per guest and each client might
902 static struct tmem_pool
*zcache_get_pool_by_id(uint16_t cli_id
, uint16_t poolid
)
904 struct tmem_pool
*pool
= NULL
;
905 struct zcache_client
*cli
= NULL
;
907 if (cli_id
== LOCAL_CLIENT
)
910 if (cli_id
>= MAX_CLIENTS
)
912 cli
= &zcache_clients
[cli_id
];
915 atomic_inc(&cli
->refcount
);
917 if (poolid
< MAX_POOLS_PER_CLIENT
) {
918 pool
= cli
->tmem_pools
[poolid
];
920 atomic_inc(&pool
->refcount
);
926 static void zcache_put_pool(struct tmem_pool
*pool
)
928 struct zcache_client
*cli
= NULL
;
933 atomic_dec(&pool
->refcount
);
934 atomic_dec(&cli
->refcount
);
937 int zcache_new_client(uint16_t cli_id
)
939 struct zcache_client
*cli
= NULL
;
942 if (cli_id
== LOCAL_CLIENT
)
944 else if ((unsigned int)cli_id
< MAX_CLIENTS
)
945 cli
= &zcache_clients
[cli_id
];
951 #ifdef CONFIG_FRONTSWAP
952 cli
->xvpool
= xv_create_pool();
953 if (cli
->xvpool
== NULL
)
961 /* counters for debugging */
962 static unsigned long zcache_failed_get_free_pages
;
963 static unsigned long zcache_failed_alloc
;
964 static unsigned long zcache_put_to_flush
;
967 * for now, use named slabs so that usage can be tracked easily; later we can
968 * either just use kmalloc, or perhaps add a slab-like allocator
969 * to manage total memory utilization more carefully
971 static struct kmem_cache
*zcache_objnode_cache
;
972 static struct kmem_cache
*zcache_obj_cache
;
973 static atomic_t zcache_curr_obj_count
= ATOMIC_INIT(0);
974 static unsigned long zcache_curr_obj_count_max
;
975 static atomic_t zcache_curr_objnode_count
= ATOMIC_INIT(0);
976 static unsigned long zcache_curr_objnode_count_max
;
979 * to avoid memory allocation recursion (e.g. due to direct reclaim), we
980 * preload all necessary data structures so the hostops callbacks never
981 * actually do a malloc
983 struct zcache_preload
{
985 struct tmem_obj
*obj
;
987 struct tmem_objnode
*objnodes
[OBJNODE_TREE_MAX_PATH
];
989 static DEFINE_PER_CPU(struct zcache_preload
, zcache_preloads
) = { 0, };
991 static int zcache_do_preload(struct tmem_pool
*pool
)
993 struct zcache_preload
*kp
;
994 struct tmem_objnode
*objnode
;
995 struct tmem_obj
*obj
;
999 if (unlikely(zcache_objnode_cache
== NULL
))
1001 if (unlikely(zcache_obj_cache
== NULL
))
1004 kp
= &__get_cpu_var(zcache_preloads
);
1005 while (kp
->nr
< ARRAY_SIZE(kp
->objnodes
)) {
1006 preempt_enable_no_resched();
1007 objnode
= kmem_cache_alloc(zcache_objnode_cache
,
1009 if (unlikely(objnode
== NULL
)) {
1010 zcache_failed_alloc
++;
1014 kp
= &__get_cpu_var(zcache_preloads
);
1015 if (kp
->nr
< ARRAY_SIZE(kp
->objnodes
))
1016 kp
->objnodes
[kp
->nr
++] = objnode
;
1018 kmem_cache_free(zcache_objnode_cache
, objnode
);
1020 preempt_enable_no_resched();
1021 obj
= kmem_cache_alloc(zcache_obj_cache
, ZCACHE_GFP_MASK
);
1022 if (unlikely(obj
== NULL
)) {
1023 zcache_failed_alloc
++;
1026 page
= (void *)__get_free_page(ZCACHE_GFP_MASK
);
1027 if (unlikely(page
== NULL
)) {
1028 zcache_failed_get_free_pages
++;
1029 kmem_cache_free(zcache_obj_cache
, obj
);
1033 kp
= &__get_cpu_var(zcache_preloads
);
1034 if (kp
->obj
== NULL
)
1037 kmem_cache_free(zcache_obj_cache
, obj
);
1038 if (kp
->page
== NULL
)
1041 free_page((unsigned long)page
);
1047 static void *zcache_get_free_page(void)
1049 struct zcache_preload
*kp
;
1052 kp
= &__get_cpu_var(zcache_preloads
);
1054 BUG_ON(page
== NULL
);
/* Hand the page at address @p back to the page allocator via free_page(). */
static void zcache_free_page(void *p)
{
	free_page((unsigned long)p);
}
1065 * zcache implementation for tmem host ops
1068 static struct tmem_objnode
*zcache_objnode_alloc(struct tmem_pool
*pool
)
1070 struct tmem_objnode
*objnode
= NULL
;
1071 unsigned long count
;
1072 struct zcache_preload
*kp
;
1074 kp
= &__get_cpu_var(zcache_preloads
);
1077 objnode
= kp
->objnodes
[kp
->nr
- 1];
1078 BUG_ON(objnode
== NULL
);
1079 kp
->objnodes
[kp
->nr
- 1] = NULL
;
1081 count
= atomic_inc_return(&zcache_curr_objnode_count
);
1082 if (count
> zcache_curr_objnode_count_max
)
1083 zcache_curr_objnode_count_max
= count
;
1088 static void zcache_objnode_free(struct tmem_objnode
*objnode
,
1089 struct tmem_pool
*pool
)
1091 atomic_dec(&zcache_curr_objnode_count
);
1092 BUG_ON(atomic_read(&zcache_curr_objnode_count
) < 0);
1093 kmem_cache_free(zcache_objnode_cache
, objnode
);
1096 static struct tmem_obj
*zcache_obj_alloc(struct tmem_pool
*pool
)
1098 struct tmem_obj
*obj
= NULL
;
1099 unsigned long count
;
1100 struct zcache_preload
*kp
;
1102 kp
= &__get_cpu_var(zcache_preloads
);
1104 BUG_ON(obj
== NULL
);
1106 count
= atomic_inc_return(&zcache_curr_obj_count
);
1107 if (count
> zcache_curr_obj_count_max
)
1108 zcache_curr_obj_count_max
= count
;
1112 static void zcache_obj_free(struct tmem_obj
*obj
, struct tmem_pool
*pool
)
1114 atomic_dec(&zcache_curr_obj_count
);
1115 BUG_ON(atomic_read(&zcache_curr_obj_count
) < 0);
1116 kmem_cache_free(zcache_obj_cache
, obj
);
1119 static struct tmem_hostops zcache_hostops
= {
1120 .obj_alloc
= zcache_obj_alloc
,
1121 .obj_free
= zcache_obj_free
,
1122 .objnode_alloc
= zcache_objnode_alloc
,
1123 .objnode_free
= zcache_objnode_free
,
1127 * zcache implementations for PAM page descriptor ops
1130 static atomic_t zcache_curr_eph_pampd_count
= ATOMIC_INIT(0);
1131 static unsigned long zcache_curr_eph_pampd_count_max
;
1132 static atomic_t zcache_curr_pers_pampd_count
= ATOMIC_INIT(0);
1133 static unsigned long zcache_curr_pers_pampd_count_max
;
1135 /* forward reference */
1136 static int zcache_compress(struct page
*from
, void **out_va
, size_t *out_len
);
1138 static void *zcache_pampd_create(char *data
, size_t size
, bool raw
, int eph
,
1139 struct tmem_pool
*pool
, struct tmem_oid
*oid
,
1142 void *pampd
= NULL
, *cdata
;
1145 unsigned long count
;
1146 struct page
*page
= (struct page
*)(data
);
1147 struct zcache_client
*cli
= pool
->client
;
1148 uint16_t client_id
= get_client_id_from_client(cli
);
1149 unsigned long zv_mean_zsize
;
1150 unsigned long curr_pers_pampd_count
;
1154 ret
= zcache_compress(page
, &cdata
, &clen
);
1157 if (clen
== 0 || clen
> zbud_max_buddy_size()) {
1158 zcache_compress_poor
++;
1161 pampd
= (void *)zbud_create(client_id
, pool
->pool_id
, oid
,
1162 index
, page
, cdata
, clen
);
1163 if (pampd
!= NULL
) {
1164 count
= atomic_inc_return(&zcache_curr_eph_pampd_count
);
1165 if (count
> zcache_curr_eph_pampd_count_max
)
1166 zcache_curr_eph_pampd_count_max
= count
;
1169 curr_pers_pampd_count
=
1170 atomic_read(&zcache_curr_pers_pampd_count
);
1171 if (curr_pers_pampd_count
>
1172 (zv_page_count_policy_percent
* totalram_pages
) / 100)
1174 ret
= zcache_compress(page
, &cdata
, &clen
);
1177 /* reject if compression is too poor */
1178 if (clen
> zv_max_zsize
) {
1179 zcache_compress_poor
++;
1182 /* reject if mean compression is too poor */
1183 if ((clen
> zv_max_mean_zsize
) && (curr_pers_pampd_count
> 0)) {
1184 total_zsize
= xv_get_total_size_bytes(cli
->xvpool
);
1185 zv_mean_zsize
= div_u64(total_zsize
,
1186 curr_pers_pampd_count
);
1187 if (zv_mean_zsize
> zv_max_mean_zsize
) {
1188 zcache_mean_compress_poor
++;
1192 pampd
= (void *)zv_create(cli
->xvpool
, pool
->pool_id
,
1193 oid
, index
, cdata
, clen
);
1196 count
= atomic_inc_return(&zcache_curr_pers_pampd_count
);
1197 if (count
> zcache_curr_pers_pampd_count_max
)
1198 zcache_curr_pers_pampd_count_max
= count
;
1205 * fill the pageframe corresponding to the struct page with the data
1206 * from the passed pampd
1208 static int zcache_pampd_get_data(char *data
, size_t *bufsize
, bool raw
,
1209 void *pampd
, struct tmem_pool
*pool
,
1210 struct tmem_oid
*oid
, uint32_t index
)
1214 BUG_ON(is_ephemeral(pool
));
1215 zv_decompress((struct page
*)(data
), pampd
);
1220 * fill the pageframe corresponding to the struct page with the data
1221 * from the passed pampd
1223 static int zcache_pampd_get_data_and_free(char *data
, size_t *bufsize
, bool raw
,
1224 void *pampd
, struct tmem_pool
*pool
,
1225 struct tmem_oid
*oid
, uint32_t index
)
1229 BUG_ON(!is_ephemeral(pool
));
1230 zbud_decompress(virt_to_page(data
), pampd
);
1231 zbud_free_and_delist((struct zbud_hdr
*)pampd
);
1232 atomic_dec(&zcache_curr_eph_pampd_count
);
1237 * free the pampd and remove it from any zcache lists
1238 * pampd must no longer be pointed to from any tmem data structures!
1240 static void zcache_pampd_free(void *pampd
, struct tmem_pool
*pool
,
1241 struct tmem_oid
*oid
, uint32_t index
)
1243 struct zcache_client
*cli
= pool
->client
;
1245 if (is_ephemeral(pool
)) {
1246 zbud_free_and_delist((struct zbud_hdr
*)pampd
);
1247 atomic_dec(&zcache_curr_eph_pampd_count
);
1248 BUG_ON(atomic_read(&zcache_curr_eph_pampd_count
) < 0);
1250 zv_free(cli
->xvpool
, (struct zv_hdr
*)pampd
);
1251 atomic_dec(&zcache_curr_pers_pampd_count
);
1252 BUG_ON(atomic_read(&zcache_curr_pers_pampd_count
) < 0);
/* tmem "free_obj" hook; zcache has nothing to do here (deliberate no-op). */
static void zcache_pampd_free_obj(struct tmem_pool *pool,
					struct tmem_obj *obj)
{
}
/* tmem "new_obj" hook; zcache has nothing to do here (deliberate no-op). */
static void zcache_pampd_new_obj(struct tmem_obj *obj)
{
}
/* tmem "replace_in_obj" hook; not supported by zcache, always fails (-1). */
static int zcache_pampd_replace_in_obj(void *pampd,
					struct tmem_obj *obj)
{
	return -1;
}
1269 static bool zcache_pampd_is_remote(void *pampd
)
1274 static struct tmem_pamops zcache_pamops
= {
1275 .create
= zcache_pampd_create
,
1276 .get_data
= zcache_pampd_get_data
,
1277 .get_data_and_free
= zcache_pampd_get_data_and_free
,
1278 .free
= zcache_pampd_free
,
1279 .free_obj
= zcache_pampd_free_obj
,
1280 .new_obj
= zcache_pampd_new_obj
,
1281 .replace_in_obj
= zcache_pampd_replace_in_obj
,
1282 .is_remote
= zcache_pampd_is_remote
,
/*
 * zcache compression/decompression and related per-cpu stuff
 *
 * Each cpu has its own LZO work memory (zcache_workmem) and its own
 * destination buffer of 2^LZO_DSTMEM_PAGE_ORDER pages (zcache_dstmem).
 * Both are set up and torn down by the cpu notifier and are NULL until
 * then; zcache_compress() refuses to compress while either is NULL.
 */
#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
#define LZO_DSTMEM_PAGE_ORDER 1
static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
1294 static int zcache_compress(struct page
*from
, void **out_va
, size_t *out_len
)
1297 unsigned char *dmem
= __get_cpu_var(zcache_dstmem
);
1298 unsigned char *wmem
= __get_cpu_var(zcache_workmem
);
1301 BUG_ON(!irqs_disabled());
1302 if (unlikely(dmem
== NULL
|| wmem
== NULL
))
1303 goto out
; /* no buffer, so can't compress */
1304 from_va
= kmap_atomic(from
, KM_USER0
);
1306 ret
= lzo1x_1_compress(from_va
, PAGE_SIZE
, dmem
, out_len
, wmem
);
1307 BUG_ON(ret
!= LZO_E_OK
);
1309 kunmap_atomic(from_va
, KM_USER0
);
1316 static int zcache_cpu_notifier(struct notifier_block
*nb
,
1317 unsigned long action
, void *pcpu
)
1319 int cpu
= (long)pcpu
;
1320 struct zcache_preload
*kp
;
1323 case CPU_UP_PREPARE
:
1324 per_cpu(zcache_dstmem
, cpu
) = (void *)__get_free_pages(
1325 GFP_KERNEL
| __GFP_REPEAT
,
1326 LZO_DSTMEM_PAGE_ORDER
),
1327 per_cpu(zcache_workmem
, cpu
) =
1328 kzalloc(LZO1X_MEM_COMPRESS
,
1329 GFP_KERNEL
| __GFP_REPEAT
);
1332 case CPU_UP_CANCELED
:
1333 free_pages((unsigned long)per_cpu(zcache_dstmem
, cpu
),
1334 LZO_DSTMEM_PAGE_ORDER
);
1335 per_cpu(zcache_dstmem
, cpu
) = NULL
;
1336 kfree(per_cpu(zcache_workmem
, cpu
));
1337 per_cpu(zcache_workmem
, cpu
) = NULL
;
1338 kp
= &per_cpu(zcache_preloads
, cpu
);
1340 kmem_cache_free(zcache_objnode_cache
,
1341 kp
->objnodes
[kp
->nr
- 1]);
1342 kp
->objnodes
[kp
->nr
- 1] = NULL
;
1346 kmem_cache_free(zcache_obj_cache
, kp
->obj
);
1350 free_page((unsigned long)kp
->page
);
1360 static struct notifier_block zcache_cpu_notifier_block
= {
1361 .notifier_call
= zcache_cpu_notifier
/*
 * Helpers that generate a read-only (0444) sysfs attribute named _name,
 * together with its show routine zcache_<_name>_show() and its
 * kobj_attribute zcache_<_name>_attr.
 */

/* value comes from the unsigned long variable zcache_<_name> */
#define ZCACHE_SYSFS_RO(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", zcache_##_name); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

/* value comes from the atomic_t variable zcache_<_name> */
#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

/* output is produced entirely by _func(buf) */
#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return _func(buf); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}
/* plain unsigned long counters */
ZCACHE_SYSFS_RO(curr_obj_count_max);
ZCACHE_SYSFS_RO(curr_objnode_count_max);
ZCACHE_SYSFS_RO(flush_total);
ZCACHE_SYSFS_RO(flush_found);
ZCACHE_SYSFS_RO(flobj_total);
ZCACHE_SYSFS_RO(flobj_found);
ZCACHE_SYSFS_RO(failed_eph_puts);
ZCACHE_SYSFS_RO(failed_pers_puts);
ZCACHE_SYSFS_RO(zbud_curr_zbytes);
ZCACHE_SYSFS_RO(zbud_cumul_zpages);
ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
ZCACHE_SYSFS_RO(zbud_buddied_count);
ZCACHE_SYSFS_RO(zbpg_unused_list_count);
ZCACHE_SYSFS_RO(evicted_raw_pages);
ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
ZCACHE_SYSFS_RO(evicted_buddied_pages);
ZCACHE_SYSFS_RO(failed_get_free_pages);
ZCACHE_SYSFS_RO(failed_alloc);
ZCACHE_SYSFS_RO(put_to_flush);
ZCACHE_SYSFS_RO(compress_poor);
ZCACHE_SYSFS_RO(mean_compress_poor);
/* atomic_t counters */
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
/* attributes formatted by a dedicated show helper */
ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
			zbud_show_unbuddied_list_counts);
ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
			zbud_show_cumul_chunk_counts);
ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
			zv_curr_dist_counts_show);
ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
			zv_cumul_dist_counts_show);
1432 static struct attribute
*zcache_attrs
[] = {
1433 &zcache_curr_obj_count_attr
.attr
,
1434 &zcache_curr_obj_count_max_attr
.attr
,
1435 &zcache_curr_objnode_count_attr
.attr
,
1436 &zcache_curr_objnode_count_max_attr
.attr
,
1437 &zcache_flush_total_attr
.attr
,
1438 &zcache_flobj_total_attr
.attr
,
1439 &zcache_flush_found_attr
.attr
,
1440 &zcache_flobj_found_attr
.attr
,
1441 &zcache_failed_eph_puts_attr
.attr
,
1442 &zcache_failed_pers_puts_attr
.attr
,
1443 &zcache_compress_poor_attr
.attr
,
1444 &zcache_mean_compress_poor_attr
.attr
,
1445 &zcache_zbud_curr_raw_pages_attr
.attr
,
1446 &zcache_zbud_curr_zpages_attr
.attr
,
1447 &zcache_zbud_curr_zbytes_attr
.attr
,
1448 &zcache_zbud_cumul_zpages_attr
.attr
,
1449 &zcache_zbud_cumul_zbytes_attr
.attr
,
1450 &zcache_zbud_buddied_count_attr
.attr
,
1451 &zcache_zbpg_unused_list_count_attr
.attr
,
1452 &zcache_evicted_raw_pages_attr
.attr
,
1453 &zcache_evicted_unbuddied_pages_attr
.attr
,
1454 &zcache_evicted_buddied_pages_attr
.attr
,
1455 &zcache_failed_get_free_pages_attr
.attr
,
1456 &zcache_failed_alloc_attr
.attr
,
1457 &zcache_put_to_flush_attr
.attr
,
1458 &zcache_zbud_unbuddied_list_counts_attr
.attr
,
1459 &zcache_zbud_cumul_chunk_counts_attr
.attr
,
1460 &zcache_zv_curr_dist_counts_attr
.attr
,
1461 &zcache_zv_cumul_dist_counts_attr
.attr
,
1462 &zcache_zv_max_zsize_attr
.attr
,
1463 &zcache_zv_max_mean_zsize_attr
.attr
,
1464 &zcache_zv_page_count_policy_percent_attr
.attr
,
1468 static struct attribute_group zcache_attr_group
= {
1469 .attrs
= zcache_attrs
,
1473 #endif /* CONFIG_SYSFS */
/*
 * When zcache is disabled ("frozen"), pools can be created and destroyed,
 * but all puts (and thus all other operations that require memory allocation)
 * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
 * data consistency requires all puts while frozen to be converted into
 * flushes, so that a stale copy of the page is never left behind
 * (see the frozen path of zcache_put_page()).
 */
static bool zcache_freeze;
1484 * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
1486 static int shrink_zcache_memory(struct shrinker
*shrink
,
1487 struct shrink_control
*sc
)
1490 int nr
= sc
->nr_to_scan
;
1491 gfp_t gfp_mask
= sc
->gfp_mask
;
1494 if (!(gfp_mask
& __GFP_FS
))
1495 /* does this case really need to be skipped? */
1497 zbud_evict_pages(nr
);
1499 ret
= (int)atomic_read(&zcache_zbud_curr_raw_pages
);
1504 static struct shrinker zcache_shrinker
= {
1505 .shrink
= shrink_zcache_memory
,
1506 .seeks
= DEFAULT_SEEKS
,
1510 * zcache shims between cleancache/frontswap ops and tmem
1513 static int zcache_put_page(int cli_id
, int pool_id
, struct tmem_oid
*oidp
,
1514 uint32_t index
, struct page
*page
)
1516 struct tmem_pool
*pool
;
1519 BUG_ON(!irqs_disabled());
1520 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1521 if (unlikely(pool
== NULL
))
1523 if (!zcache_freeze
&& zcache_do_preload(pool
) == 0) {
1524 /* preload does preempt_disable on success */
1525 ret
= tmem_put(pool
, oidp
, index
, (char *)(page
),
1526 PAGE_SIZE
, 0, is_ephemeral(pool
));
1528 if (is_ephemeral(pool
))
1529 zcache_failed_eph_puts
++;
1531 zcache_failed_pers_puts
++;
1533 zcache_put_pool(pool
);
1534 preempt_enable_no_resched();
1536 zcache_put_to_flush
++;
1537 if (atomic_read(&pool
->obj_count
) > 0)
1538 /* the put fails whether the flush succeeds or not */
1539 (void)tmem_flush_page(pool
, oidp
, index
);
1540 zcache_put_pool(pool
);
1546 static int zcache_get_page(int cli_id
, int pool_id
, struct tmem_oid
*oidp
,
1547 uint32_t index
, struct page
*page
)
1549 struct tmem_pool
*pool
;
1551 unsigned long flags
;
1552 size_t size
= PAGE_SIZE
;
1554 local_irq_save(flags
);
1555 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1556 if (likely(pool
!= NULL
)) {
1557 if (atomic_read(&pool
->obj_count
) > 0)
1558 ret
= tmem_get(pool
, oidp
, index
, (char *)(page
),
1559 &size
, 0, is_ephemeral(pool
));
1560 zcache_put_pool(pool
);
1562 local_irq_restore(flags
);
1566 static int zcache_flush_page(int cli_id
, int pool_id
,
1567 struct tmem_oid
*oidp
, uint32_t index
)
1569 struct tmem_pool
*pool
;
1571 unsigned long flags
;
1573 local_irq_save(flags
);
1574 zcache_flush_total
++;
1575 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1576 if (likely(pool
!= NULL
)) {
1577 if (atomic_read(&pool
->obj_count
) > 0)
1578 ret
= tmem_flush_page(pool
, oidp
, index
);
1579 zcache_put_pool(pool
);
1582 zcache_flush_found
++;
1583 local_irq_restore(flags
);
1587 static int zcache_flush_object(int cli_id
, int pool_id
,
1588 struct tmem_oid
*oidp
)
1590 struct tmem_pool
*pool
;
1592 unsigned long flags
;
1594 local_irq_save(flags
);
1595 zcache_flobj_total
++;
1596 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1597 if (likely(pool
!= NULL
)) {
1598 if (atomic_read(&pool
->obj_count
) > 0)
1599 ret
= tmem_flush_object(pool
, oidp
);
1600 zcache_put_pool(pool
);
1603 zcache_flobj_found
++;
1604 local_irq_restore(flags
);
1608 static int zcache_destroy_pool(int cli_id
, int pool_id
)
1610 struct tmem_pool
*pool
= NULL
;
1611 struct zcache_client
*cli
= NULL
;
1616 if (cli_id
== LOCAL_CLIENT
)
1618 else if ((unsigned int)cli_id
< MAX_CLIENTS
)
1619 cli
= &zcache_clients
[cli_id
];
1622 atomic_inc(&cli
->refcount
);
1623 pool
= cli
->tmem_pools
[pool_id
];
1626 cli
->tmem_pools
[pool_id
] = NULL
;
1627 /* wait for pool activity on other cpus to quiesce */
1628 while (atomic_read(&pool
->refcount
) != 0)
1630 atomic_dec(&cli
->refcount
);
1632 ret
= tmem_destroy_pool(pool
);
1635 pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
1641 static int zcache_new_pool(uint16_t cli_id
, uint32_t flags
)
1644 struct tmem_pool
*pool
;
1645 struct zcache_client
*cli
= NULL
;
1647 if (cli_id
== LOCAL_CLIENT
)
1649 else if ((unsigned int)cli_id
< MAX_CLIENTS
)
1650 cli
= &zcache_clients
[cli_id
];
1653 atomic_inc(&cli
->refcount
);
1654 pool
= kmalloc(sizeof(struct tmem_pool
), GFP_ATOMIC
);
1656 pr_info("zcache: pool creation failed: out of memory\n");
1660 for (poolid
= 0; poolid
< MAX_POOLS_PER_CLIENT
; poolid
++)
1661 if (cli
->tmem_pools
[poolid
] == NULL
)
1663 if (poolid
>= MAX_POOLS_PER_CLIENT
) {
1664 pr_info("zcache: pool creation failed: max exceeded\n");
1669 atomic_set(&pool
->refcount
, 0);
1671 pool
->pool_id
= poolid
;
1672 tmem_new_pool(pool
, flags
);
1673 cli
->tmem_pools
[poolid
] = pool
;
1674 pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
1675 flags
& TMEM_POOL_PERSIST
? "persistent" : "ephemeral",
1679 atomic_dec(&cli
->refcount
);
1684 * Two kernel functionalities currently can be layered on top of tmem.
1685 * These are "cleancache" which is used as a second-chance cache for clean
1686 * page cache pages; and "frontswap" which is used for swap pages
1687 * to avoid writes to disk. A generic "shim" is provided here for each
1688 * to translate in-kernel semantics to zcache semantics.
1691 #ifdef CONFIG_CLEANCACHE
1692 static void zcache_cleancache_put_page(int pool_id
,
1693 struct cleancache_filekey key
,
1694 pgoff_t index
, struct page
*page
)
1696 u32 ind
= (u32
) index
;
1697 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1699 if (likely(ind
== index
))
1700 (void)zcache_put_page(LOCAL_CLIENT
, pool_id
, &oid
, index
, page
);
1703 static int zcache_cleancache_get_page(int pool_id
,
1704 struct cleancache_filekey key
,
1705 pgoff_t index
, struct page
*page
)
1707 u32 ind
= (u32
) index
;
1708 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1711 if (likely(ind
== index
))
1712 ret
= zcache_get_page(LOCAL_CLIENT
, pool_id
, &oid
, index
, page
);
1716 static void zcache_cleancache_flush_page(int pool_id
,
1717 struct cleancache_filekey key
,
1720 u32 ind
= (u32
) index
;
1721 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1723 if (likely(ind
== index
))
1724 (void)zcache_flush_page(LOCAL_CLIENT
, pool_id
, &oid
, ind
);
1727 static void zcache_cleancache_flush_inode(int pool_id
,
1728 struct cleancache_filekey key
)
1730 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1732 (void)zcache_flush_object(LOCAL_CLIENT
, pool_id
, &oid
);
1735 static void zcache_cleancache_flush_fs(int pool_id
)
1738 (void)zcache_destroy_pool(LOCAL_CLIENT
, pool_id
);
1741 static int zcache_cleancache_init_fs(size_t pagesize
)
1743 BUG_ON(sizeof(struct cleancache_filekey
) !=
1744 sizeof(struct tmem_oid
));
1745 BUG_ON(pagesize
!= PAGE_SIZE
);
1746 return zcache_new_pool(LOCAL_CLIENT
, 0);
1749 static int zcache_cleancache_init_shared_fs(char *uuid
, size_t pagesize
)
1751 /* shared pools are unsupported and map to private */
1752 BUG_ON(sizeof(struct cleancache_filekey
) !=
1753 sizeof(struct tmem_oid
));
1754 BUG_ON(pagesize
!= PAGE_SIZE
);
1755 return zcache_new_pool(LOCAL_CLIENT
, 0);
1758 static struct cleancache_ops zcache_cleancache_ops
= {
1759 .put_page
= zcache_cleancache_put_page
,
1760 .get_page
= zcache_cleancache_get_page
,
1761 .flush_page
= zcache_cleancache_flush_page
,
1762 .flush_inode
= zcache_cleancache_flush_inode
,
1763 .flush_fs
= zcache_cleancache_flush_fs
,
1764 .init_shared_fs
= zcache_cleancache_init_shared_fs
,
1765 .init_fs
= zcache_cleancache_init_fs
1768 struct cleancache_ops
zcache_cleancache_register_ops(void)
1770 struct cleancache_ops old_ops
=
1771 cleancache_register_ops(&zcache_cleancache_ops
);
1777 #ifdef CONFIG_FRONTSWAP
/*
 * a single tmem poolid is used for all frontswap "types" (swapfiles);
 * -1 until zcache_frontswap_init() has created the pool
 */
static int zcache_frontswap_poolid = -1;
/*
 * Swizzling increases objects per swaptype, increasing tmem concurrency
 * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
 * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
 * frontswap_get_page()
 *
 * _oswiz() packs the swap type above the low SWIZ_BITS of the offset to
 * form the object id; iswiz() yields the remaining high offset bits, used
 * as the index within that object.  Arguments are fully parenthesized so
 * that expression arguments (e.g. "a | b") expand with the intended
 * precedence.
 */
#define SWIZ_BITS		27
#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind)	(((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
#define iswiz(_ind)		((_ind) >> SWIZ_BITS)
1792 static inline struct tmem_oid
oswiz(unsigned type
, u32 ind
)
1794 struct tmem_oid oid
= { .oid
= { 0 } };
1795 oid
.oid
[0] = _oswiz(type
, ind
);
1799 static int zcache_frontswap_put_page(unsigned type
, pgoff_t offset
,
1802 u64 ind64
= (u64
)offset
;
1803 u32 ind
= (u32
)offset
;
1804 struct tmem_oid oid
= oswiz(type
, ind
);
1806 unsigned long flags
;
1808 BUG_ON(!PageLocked(page
));
1809 if (likely(ind64
== ind
)) {
1810 local_irq_save(flags
);
1811 ret
= zcache_put_page(LOCAL_CLIENT
, zcache_frontswap_poolid
,
1812 &oid
, iswiz(ind
), page
);
1813 local_irq_restore(flags
);
1818 /* returns 0 if the page was successfully gotten from frontswap, -1 if
1819 * was not present (should never happen!) */
1820 static int zcache_frontswap_get_page(unsigned type
, pgoff_t offset
,
1823 u64 ind64
= (u64
)offset
;
1824 u32 ind
= (u32
)offset
;
1825 struct tmem_oid oid
= oswiz(type
, ind
);
1828 BUG_ON(!PageLocked(page
));
1829 if (likely(ind64
== ind
))
1830 ret
= zcache_get_page(LOCAL_CLIENT
, zcache_frontswap_poolid
,
1831 &oid
, iswiz(ind
), page
);
1835 /* flush a single page from frontswap */
1836 static void zcache_frontswap_flush_page(unsigned type
, pgoff_t offset
)
1838 u64 ind64
= (u64
)offset
;
1839 u32 ind
= (u32
)offset
;
1840 struct tmem_oid oid
= oswiz(type
, ind
);
1842 if (likely(ind64
== ind
))
1843 (void)zcache_flush_page(LOCAL_CLIENT
, zcache_frontswap_poolid
,
1847 /* flush all pages from the passed swaptype */
1848 static void zcache_frontswap_flush_area(unsigned type
)
1850 struct tmem_oid oid
;
1853 for (ind
= SWIZ_MASK
; ind
>= 0; ind
--) {
1854 oid
= oswiz(type
, ind
);
1855 (void)zcache_flush_object(LOCAL_CLIENT
,
1856 zcache_frontswap_poolid
, &oid
);
1860 static void zcache_frontswap_init(unsigned ignored
)
1862 /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1863 if (zcache_frontswap_poolid
< 0)
1864 zcache_frontswap_poolid
=
1865 zcache_new_pool(LOCAL_CLIENT
, TMEM_POOL_PERSIST
);
1868 static struct frontswap_ops zcache_frontswap_ops
= {
1869 .put_page
= zcache_frontswap_put_page
,
1870 .get_page
= zcache_frontswap_get_page
,
1871 .flush_page
= zcache_frontswap_flush_page
,
1872 .flush_area
= zcache_frontswap_flush_area
,
1873 .init
= zcache_frontswap_init
1876 struct frontswap_ops
zcache_frontswap_register_ops(void)
1878 struct frontswap_ops old_ops
=
1879 frontswap_register_ops(&zcache_frontswap_ops
);
1886 * zcache initialization
1887 * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
1891 static int zcache_enabled
;
1893 static int __init
enable_zcache(char *s
)
1898 __setup("zcache", enable_zcache
);
1900 /* allow independent dynamic disabling of cleancache and frontswap */
1902 static int use_cleancache
= 1;
1904 static int __init
no_cleancache(char *s
)
1910 __setup("nocleancache", no_cleancache
);
1912 static int use_frontswap
= 1;
1914 static int __init
no_frontswap(char *s
)
1920 __setup("nofrontswap", no_frontswap
);
1922 static int __init
zcache_init(void)
1927 ret
= sysfs_create_group(mm_kobj
, &zcache_attr_group
);
1929 pr_err("zcache: can't create sysfs\n");
1932 #endif /* CONFIG_SYSFS */
1933 #if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
1934 if (zcache_enabled
) {
1937 tmem_register_hostops(&zcache_hostops
);
1938 tmem_register_pamops(&zcache_pamops
);
1939 ret
= register_cpu_notifier(&zcache_cpu_notifier_block
);
1941 pr_err("zcache: can't register cpu notifier\n");
1944 for_each_online_cpu(cpu
) {
1945 void *pcpu
= (void *)(long)cpu
;
1946 zcache_cpu_notifier(&zcache_cpu_notifier_block
,
1947 CPU_UP_PREPARE
, pcpu
);
1950 zcache_objnode_cache
= kmem_cache_create("zcache_objnode",
1951 sizeof(struct tmem_objnode
), 0, 0, NULL
);
1952 zcache_obj_cache
= kmem_cache_create("zcache_obj",
1953 sizeof(struct tmem_obj
), 0, 0, NULL
);
1954 ret
= zcache_new_client(LOCAL_CLIENT
);
1956 pr_err("zcache: can't create client\n");
1960 #ifdef CONFIG_CLEANCACHE
1961 if (zcache_enabled
&& use_cleancache
) {
1962 struct cleancache_ops old_ops
;
1965 register_shrinker(&zcache_shrinker
);
1966 old_ops
= zcache_cleancache_register_ops();
1967 pr_info("zcache: cleancache enabled using kernel "
1968 "transcendent memory and compression buddies\n");
1969 if (old_ops
.init_fs
!= NULL
)
1970 pr_warning("zcache: cleancache_ops overridden");
1973 #ifdef CONFIG_FRONTSWAP
1974 if (zcache_enabled
&& use_frontswap
) {
1975 struct frontswap_ops old_ops
;
1977 old_ops
= zcache_frontswap_register_ops();
1978 pr_info("zcache: frontswap enabled using kernel "
1979 "transcendent memory and xvmalloc\n");
1980 if (old_ops
.init
!= NULL
)
1981 pr_warning("zcache: frontswap_ops overridden");
1988 module_init(zcache_init
)