4 * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
5 * Copyright (c) 2010,2011, Nitin Gupta
7 * Zcache provides an in-kernel "host implementation" for transcendent memory
8 * and, thus indirectly, for cleancache and frontswap. Zcache includes two
9 * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
10 * 1) "compression buddies" ("zbud") is used for ephemeral pages
11 * 2) xvmalloc is used for persistent pages.
12 * Xvmalloc (based on the TLSF allocator) has very low fragmentation
13 * so maximizes space efficiency, while zbud allows pairs (and potentially,
14 * in the future, more than a pair of) compressed pages to be closely linked
15 * so that reclaiming can be done via the kernel's physical-page-oriented
16 * "shrinker" interface.
18 * [1] For a definition of page-accessible memory (aka PAM), see:
19 * http://marc.info/?l=linux-mm&m=127811271605009
22 #include <linux/module.h>
23 #include <linux/cpu.h>
24 #include <linux/highmem.h>
25 #include <linux/list.h>
26 #include <linux/lzo.h>
27 #include <linux/slab.h>
28 #include <linux/spinlock.h>
29 #include <linux/types.h>
30 #include <linux/atomic.h>
31 #include <linux/math64.h>
34 #include "../zram/xvmalloc.h" /* if built in drivers/staging */
36 #if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
37 #error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
39 #ifdef CONFIG_CLEANCACHE
40 #include <linux/cleancache.h>
42 #ifdef CONFIG_FRONTSWAP
43 #include <linux/frontswap.h>
47 /* this is more aggressive but may cause other problems? */
48 #define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
50 #define ZCACHE_GFP_MASK \
51 (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
54 #define MAX_POOLS_PER_CLIENT 16
56 #define MAX_CLIENTS 16
57 #define LOCAL_CLIENT ((uint16_t)-1)
59 MODULE_LICENSE("GPL");
61 struct zcache_client
{
62 struct tmem_pool
*tmem_pools
[MAX_POOLS_PER_CLIENT
];
63 struct xv_pool
*xvpool
;
68 static struct zcache_client zcache_host
;
69 static struct zcache_client zcache_clients
[MAX_CLIENTS
];
/*
 * Translate a client descriptor pointer into its numeric client id.
 * For a descriptor that lives inside zcache_clients[] the id is its array
 * index, computed by pointer subtraction from the first element.
 * NOTE(review): the statement executed when cli == &zcache_host is not
 * visible in this excerpt -- presumably it returns LOCAL_CLIENT; confirm.
 */
71 static inline uint16_t get_client_id_from_client(struct zcache_client
*cli
)
74 if (cli
== &zcache_host
)
76 return cli
- &zcache_clients
[0];
79 static inline bool is_local_client(struct zcache_client
*cli
)
81 return cli
== &zcache_host
;
85 * Compression buddies ("zbud") provides for packing two (or, possibly
86 * in the future, more) compressed ephemeral pages into a single "raw"
87 * (physical) page and tracking them with data structures so that
88 * the raw pages can be easily reclaimed.
90 * A zbud page ("zbpg") is an aligned page containing a list_head,
91 * a lock, and two "zbud headers". The remainder of the physical
92 * page is divided up into aligned 64-byte "chunks" which contain
93 * the compressed data for zero, one, or two zbuds. Each zbpg
94 * resides on: (1) an "unused list" if it has no zbuds; (2) a
95 * "buddied" list if it is fully populated with two zbuds; or
96 * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
97 * the one unbuddied zbud uses. The data inside a zbpg cannot be
98 * read or written unless the zbpg's lock is held.
101 #define ZBH_SENTINEL 0x43214321
102 #define ZBPG_SENTINEL 0xdeadbeef
104 #define ZBUD_MAX_BUDS 2
111 uint16_t size
; /* compressed size in bytes, zero means unused */
116 struct list_head bud_list
;
118 struct zbud_hdr buddy
[ZBUD_MAX_BUDS
];
120 /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
123 #define CHUNK_SHIFT 6
124 #define CHUNK_SIZE (1 << CHUNK_SHIFT)
125 #define CHUNK_MASK (~(CHUNK_SIZE-1))
126 #define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \
127 CHUNK_MASK) >> CHUNK_SHIFT)
128 #define MAX_CHUNK (NCHUNKS-1)
131 struct list_head list
;
133 } zbud_unbuddied
[NCHUNKS
];
134 /* list N contains pages with N chunks USED and NCHUNKS-N unused */
135 /* element 0 is never used but optimizing that isn't worth it */
136 static unsigned long zbud_cumul_chunk_counts
[NCHUNKS
];
138 struct list_head zbud_buddied_list
;
139 static unsigned long zcache_zbud_buddied_count
;
141 /* protects the buddied list and all unbuddied lists */
142 static DEFINE_SPINLOCK(zbud_budlists_spinlock
);
144 static LIST_HEAD(zbpg_unused_list
);
145 static unsigned long zcache_zbpg_unused_list_count
;
147 /* protects the unused page list */
148 static DEFINE_SPINLOCK(zbpg_unused_list_spinlock
);
150 static atomic_t zcache_zbud_curr_raw_pages
;
151 static atomic_t zcache_zbud_curr_zpages
;
152 static unsigned long zcache_zbud_curr_zbytes
;
153 static unsigned long zcache_zbud_cumul_zpages
;
154 static unsigned long zcache_zbud_cumul_zbytes
;
155 static unsigned long zcache_compress_poor
;
156 static unsigned long zcache_mean_compress_poor
;
158 /* forward references */
159 static void *zcache_get_free_page(void);
160 static void zcache_free_page(void *p
);
163 * zbud helper functions
166 static inline unsigned zbud_max_buddy_size(void)
168 return MAX_CHUNK
<< CHUNK_SHIFT
;
171 static inline unsigned zbud_size_to_chunks(unsigned size
)
173 BUG_ON(size
== 0 || size
> zbud_max_buddy_size());
174 return (size
+ CHUNK_SIZE
- 1) >> CHUNK_SHIFT
;
177 static inline int zbud_budnum(struct zbud_hdr
*zh
)
179 unsigned offset
= (unsigned long)zh
& (PAGE_SIZE
- 1);
180 struct zbud_page
*zbpg
= NULL
;
181 unsigned budnum
= -1U;
184 for (i
= 0; i
< ZBUD_MAX_BUDS
; i
++)
185 if (offset
== offsetof(typeof(*zbpg
), buddy
[i
])) {
189 BUG_ON(budnum
== -1U);
193 static char *zbud_data(struct zbud_hdr
*zh
, unsigned size
)
195 struct zbud_page
*zbpg
;
199 ASSERT_SENTINEL(zh
, ZBH
);
200 budnum
= zbud_budnum(zh
);
201 BUG_ON(size
== 0 || size
> zbud_max_buddy_size());
202 zbpg
= container_of(zh
, struct zbud_page
, buddy
[budnum
]);
203 ASSERT_SPINLOCK(&zbpg
->lock
);
206 p
+= ((sizeof(struct zbud_page
) + CHUNK_SIZE
- 1) &
208 else if (budnum
== 1)
209 p
+= PAGE_SIZE
- ((size
+ CHUNK_SIZE
- 1) & CHUNK_MASK
);
214 * zbud raw page management
217 static struct zbud_page
*zbud_alloc_raw_page(void)
219 struct zbud_page
*zbpg
= NULL
;
220 struct zbud_hdr
*zh0
, *zh1
;
223 /* if any pages on the zbpg list, use one */
224 spin_lock(&zbpg_unused_list_spinlock
);
225 if (!list_empty(&zbpg_unused_list
)) {
226 zbpg
= list_first_entry(&zbpg_unused_list
,
227 struct zbud_page
, bud_list
);
228 list_del_init(&zbpg
->bud_list
);
229 zcache_zbpg_unused_list_count
--;
232 spin_unlock(&zbpg_unused_list_spinlock
);
234 /* none on zbpg list, try to get a kernel page */
235 zbpg
= zcache_get_free_page();
236 if (likely(zbpg
!= NULL
)) {
237 INIT_LIST_HEAD(&zbpg
->bud_list
);
238 zh0
= &zbpg
->buddy
[0]; zh1
= &zbpg
->buddy
[1];
239 spin_lock_init(&zbpg
->lock
);
241 ASSERT_INVERTED_SENTINEL(zbpg
, ZBPG
);
242 SET_SENTINEL(zbpg
, ZBPG
);
243 BUG_ON(zh0
->size
!= 0 || tmem_oid_valid(&zh0
->oid
));
244 BUG_ON(zh1
->size
!= 0 || tmem_oid_valid(&zh1
->oid
));
246 atomic_inc(&zcache_zbud_curr_raw_pages
);
247 INIT_LIST_HEAD(&zbpg
->bud_list
);
248 SET_SENTINEL(zbpg
, ZBPG
);
249 zh0
->size
= 0; zh1
->size
= 0;
250 tmem_oid_set_invalid(&zh0
->oid
);
251 tmem_oid_set_invalid(&zh1
->oid
);
/*
 * Retire a zbud page that no longer holds any zbuds and park it on
 * zbpg_unused_list.
 *
 * Entered with zbpg->lock held (asserted below) and with the page already
 * removed from every bud list.  Both buddy headers must be unused (size 0,
 * invalid oid).  The page lock is released here, before the unused-list
 * lock is taken, so the two locks are never held together in this function.
 */
257 static void zbud_free_raw_page(struct zbud_page
*zbpg
)
259 struct zbud_hdr
*zh0
= &zbpg
->buddy
[0], *zh1
= &zbpg
->buddy
[1];
261 ASSERT_SENTINEL(zbpg
, ZBPG
);
/* the page must not be linked on any list when it is freed */
262 BUG_ON(!list_empty(&zbpg
->bud_list
));
263 ASSERT_SPINLOCK(&zbpg
->lock
);
/* neither buddy slot may still describe live data */
264 BUG_ON(zh0
->size
!= 0 || tmem_oid_valid(&zh0
->oid
));
265 BUG_ON(zh1
->size
!= 0 || tmem_oid_valid(&zh1
->oid
));
/* flip the page sentinel while the page sits on the unused list */
266 INVERT_SENTINEL(zbpg
, ZBPG
);
/* drop the page lock that the caller acquired */
267 spin_unlock(&zbpg
->lock
);
/* the unused list and its counter are updated under their own lock */
268 spin_lock(&zbpg_unused_list_spinlock
);
269 list_add(&zbpg
->bud_list
, &zbpg_unused_list
);
270 zcache_zbpg_unused_list_count
++;
271 spin_unlock(&zbpg_unused_list_spinlock
);
275 * core zbud handling routines
278 static unsigned zbud_free(struct zbud_hdr
*zh
)
282 ASSERT_SENTINEL(zh
, ZBH
);
283 BUG_ON(!tmem_oid_valid(&zh
->oid
));
285 BUG_ON(zh
->size
== 0 || zh
->size
> zbud_max_buddy_size());
287 tmem_oid_set_invalid(&zh
->oid
);
288 INVERT_SENTINEL(zh
, ZBH
);
289 zcache_zbud_curr_zbytes
-= size
;
290 atomic_dec(&zcache_zbud_curr_zpages
);
294 static void zbud_free_and_delist(struct zbud_hdr
*zh
)
297 struct zbud_hdr
*zh_other
;
298 unsigned budnum
= zbud_budnum(zh
), size
;
299 struct zbud_page
*zbpg
=
300 container_of(zh
, struct zbud_page
, buddy
[budnum
]);
302 spin_lock(&zbpg
->lock
);
303 if (list_empty(&zbpg
->bud_list
)) {
304 /* ignore zombie page... see zbud_evict_pages() */
305 spin_unlock(&zbpg
->lock
);
308 size
= zbud_free(zh
);
309 ASSERT_SPINLOCK(&zbpg
->lock
);
310 zh_other
= &zbpg
->buddy
[(budnum
== 0) ? 1 : 0];
311 if (zh_other
->size
== 0) { /* was unbuddied: unlist and free */
312 chunks
= zbud_size_to_chunks(size
) ;
313 spin_lock(&zbud_budlists_spinlock
);
314 BUG_ON(list_empty(&zbud_unbuddied
[chunks
].list
));
315 list_del_init(&zbpg
->bud_list
);
316 zbud_unbuddied
[chunks
].count
--;
317 spin_unlock(&zbud_budlists_spinlock
);
318 zbud_free_raw_page(zbpg
);
319 } else { /* was buddied: move remaining buddy to unbuddied list */
320 chunks
= zbud_size_to_chunks(zh_other
->size
) ;
321 spin_lock(&zbud_budlists_spinlock
);
322 list_del_init(&zbpg
->bud_list
);
323 zcache_zbud_buddied_count
--;
324 list_add_tail(&zbpg
->bud_list
, &zbud_unbuddied
[chunks
].list
);
325 zbud_unbuddied
[chunks
].count
++;
326 spin_unlock(&zbud_budlists_spinlock
);
327 spin_unlock(&zbpg
->lock
);
331 static struct zbud_hdr
*zbud_create(uint16_t client_id
, uint16_t pool_id
,
332 struct tmem_oid
*oid
,
333 uint32_t index
, struct page
*page
,
334 void *cdata
, unsigned size
)
336 struct zbud_hdr
*zh0
, *zh1
, *zh
= NULL
;
337 struct zbud_page
*zbpg
= NULL
, *ztmp
;
340 int i
, found_good_buddy
= 0;
342 nchunks
= zbud_size_to_chunks(size
) ;
343 for (i
= MAX_CHUNK
- nchunks
+ 1; i
> 0; i
--) {
344 spin_lock(&zbud_budlists_spinlock
);
345 if (!list_empty(&zbud_unbuddied
[i
].list
)) {
346 list_for_each_entry_safe(zbpg
, ztmp
,
347 &zbud_unbuddied
[i
].list
, bud_list
) {
348 if (spin_trylock(&zbpg
->lock
)) {
349 found_good_buddy
= i
;
350 goto found_unbuddied
;
354 spin_unlock(&zbud_budlists_spinlock
);
356 /* didn't find a good buddy, try allocating a new page */
357 zbpg
= zbud_alloc_raw_page();
358 if (unlikely(zbpg
== NULL
))
360 /* ok, have a page, now compress the data before taking locks */
361 spin_lock(&zbpg
->lock
);
362 spin_lock(&zbud_budlists_spinlock
);
363 list_add_tail(&zbpg
->bud_list
, &zbud_unbuddied
[nchunks
].list
);
364 zbud_unbuddied
[nchunks
].count
++;
365 zh
= &zbpg
->buddy
[0];
369 ASSERT_SPINLOCK(&zbpg
->lock
);
370 zh0
= &zbpg
->buddy
[0]; zh1
= &zbpg
->buddy
[1];
371 BUG_ON(!((zh0
->size
== 0) ^ (zh1
->size
== 0)));
372 if (zh0
->size
!= 0) { /* buddy0 in use, buddy1 is vacant */
373 ASSERT_SENTINEL(zh0
, ZBH
);
375 } else if (zh1
->size
!= 0) { /* buddy1 in use, buddy0 is vacant */
376 ASSERT_SENTINEL(zh1
, ZBH
);
380 list_del_init(&zbpg
->bud_list
);
381 zbud_unbuddied
[found_good_buddy
].count
--;
382 list_add_tail(&zbpg
->bud_list
, &zbud_buddied_list
);
383 zcache_zbud_buddied_count
++;
386 SET_SENTINEL(zh
, ZBH
);
390 zh
->pool_id
= pool_id
;
391 zh
->client_id
= client_id
;
392 /* can wait to copy the data until the list locks are dropped */
393 spin_unlock(&zbud_budlists_spinlock
);
395 to
= zbud_data(zh
, size
);
396 memcpy(to
, cdata
, size
);
397 spin_unlock(&zbpg
->lock
);
398 zbud_cumul_chunk_counts
[nchunks
]++;
399 atomic_inc(&zcache_zbud_curr_zpages
);
400 zcache_zbud_cumul_zpages
++;
401 zcache_zbud_curr_zbytes
+= size
;
402 zcache_zbud_cumul_zbytes
+= size
;
407 static int zbud_decompress(struct page
*page
, struct zbud_hdr
*zh
)
409 struct zbud_page
*zbpg
;
410 unsigned budnum
= zbud_budnum(zh
);
411 size_t out_len
= PAGE_SIZE
;
412 char *to_va
, *from_va
;
416 zbpg
= container_of(zh
, struct zbud_page
, buddy
[budnum
]);
417 spin_lock(&zbpg
->lock
);
418 if (list_empty(&zbpg
->bud_list
)) {
419 /* ignore zombie page... see zbud_evict_pages() */
423 ASSERT_SENTINEL(zh
, ZBH
);
424 BUG_ON(zh
->size
== 0 || zh
->size
> zbud_max_buddy_size());
425 to_va
= kmap_atomic(page
, KM_USER0
);
427 from_va
= zbud_data(zh
, size
);
428 ret
= lzo1x_decompress_safe(from_va
, size
, to_va
, &out_len
);
429 BUG_ON(ret
!= LZO_E_OK
);
430 BUG_ON(out_len
!= PAGE_SIZE
);
431 kunmap_atomic(to_va
, KM_USER0
);
433 spin_unlock(&zbpg
->lock
);
438 * The following routines handle shrinking of ephemeral pages by evicting
439 * pages "least valuable" first.
442 static unsigned long zcache_evicted_raw_pages
;
443 static unsigned long zcache_evicted_buddied_pages
;
444 static unsigned long zcache_evicted_unbuddied_pages
;
446 static struct tmem_pool
*zcache_get_pool_by_id(uint16_t cli_id
,
448 static void zcache_put_pool(struct tmem_pool
*pool
);
451 * Flush and free all zbuds in a zbpg, then free the pageframe
453 static void zbud_evict_zbpg(struct zbud_page
*zbpg
)
457 uint32_t pool_id
[ZBUD_MAX_BUDS
], client_id
[ZBUD_MAX_BUDS
];
458 uint32_t index
[ZBUD_MAX_BUDS
];
459 struct tmem_oid oid
[ZBUD_MAX_BUDS
];
460 struct tmem_pool
*pool
;
462 ASSERT_SPINLOCK(&zbpg
->lock
);
463 BUG_ON(!list_empty(&zbpg
->bud_list
));
464 for (i
= 0, j
= 0; i
< ZBUD_MAX_BUDS
; i
++) {
465 zh
= &zbpg
->buddy
[i
];
467 client_id
[j
] = zh
->client_id
;
468 pool_id
[j
] = zh
->pool_id
;
470 index
[j
] = zh
->index
;
475 spin_unlock(&zbpg
->lock
);
476 for (i
= 0; i
< j
; i
++) {
477 pool
= zcache_get_pool_by_id(client_id
[i
], pool_id
[i
]);
479 tmem_flush_page(pool
, &oid
[i
], index
[i
]);
480 zcache_put_pool(pool
);
483 ASSERT_SENTINEL(zbpg
, ZBPG
);
484 spin_lock(&zbpg
->lock
);
485 zbud_free_raw_page(zbpg
);
489 * Free nr pages. This code is funky because we want to hold the locks
490 * protecting various lists for as short a time as possible, and in some
491 * circumstances the list may change asynchronously when the list lock is
492 * not held. In some cases we also trylock not only to avoid waiting on a
493 * page in use by another cpu, but also to avoid potential deadlock due to
496 static void zbud_evict_pages(int nr
)
498 struct zbud_page
*zbpg
;
501 /* first try freeing any pages on unused list */
503 spin_lock_bh(&zbpg_unused_list_spinlock
);
504 if (!list_empty(&zbpg_unused_list
)) {
505 /* can't walk list here, since it may change when unlocked */
506 zbpg
= list_first_entry(&zbpg_unused_list
,
507 struct zbud_page
, bud_list
);
508 list_del_init(&zbpg
->bud_list
);
509 zcache_zbpg_unused_list_count
--;
510 atomic_dec(&zcache_zbud_curr_raw_pages
);
511 spin_unlock_bh(&zbpg_unused_list_spinlock
);
512 zcache_free_page(zbpg
);
513 zcache_evicted_raw_pages
++;
516 goto retry_unused_list
;
518 spin_unlock_bh(&zbpg_unused_list_spinlock
);
520 /* now try freeing unbuddied pages, starting with least space avail */
521 for (i
= 0; i
< MAX_CHUNK
; i
++) {
523 spin_lock_bh(&zbud_budlists_spinlock
);
524 if (list_empty(&zbud_unbuddied
[i
].list
)) {
525 spin_unlock_bh(&zbud_budlists_spinlock
);
528 list_for_each_entry(zbpg
, &zbud_unbuddied
[i
].list
, bud_list
) {
529 if (unlikely(!spin_trylock(&zbpg
->lock
)))
531 list_del_init(&zbpg
->bud_list
);
532 zbud_unbuddied
[i
].count
--;
533 spin_unlock(&zbud_budlists_spinlock
);
534 zcache_evicted_unbuddied_pages
++;
535 /* want budlists unlocked when doing zbpg eviction */
536 zbud_evict_zbpg(zbpg
);
540 goto retry_unbud_list_i
;
542 spin_unlock_bh(&zbud_budlists_spinlock
);
545 /* as a last resort, free buddied pages */
547 spin_lock_bh(&zbud_budlists_spinlock
);
548 if (list_empty(&zbud_buddied_list
)) {
549 spin_unlock_bh(&zbud_budlists_spinlock
);
552 list_for_each_entry(zbpg
, &zbud_buddied_list
, bud_list
) {
553 if (unlikely(!spin_trylock(&zbpg
->lock
)))
555 list_del_init(&zbpg
->bud_list
);
556 zcache_zbud_buddied_count
--;
557 spin_unlock(&zbud_budlists_spinlock
);
558 zcache_evicted_buddied_pages
++;
559 /* want budlists unlocked when doing zbpg eviction */
560 zbud_evict_zbpg(zbpg
);
566 spin_unlock_bh(&zbud_budlists_spinlock
);
571 static void zbud_init(void)
575 INIT_LIST_HEAD(&zbud_buddied_list
);
576 zcache_zbud_buddied_count
= 0;
577 for (i
= 0; i
< NCHUNKS
; i
++) {
578 INIT_LIST_HEAD(&zbud_unbuddied
[i
].list
);
579 zbud_unbuddied
[i
].count
= 0;
585 * These sysfs routines show a nice distribution of how many zbpg's are
586 * currently (and have ever been placed) in each unbuddied list. It's fun
587 * to watch but can probably go away before final merge.
589 static int zbud_show_unbuddied_list_counts(char *buf
)
594 for (i
= 0; i
< NCHUNKS
; i
++)
595 p
+= sprintf(p
, "%u ", zbud_unbuddied
[i
].count
);
599 static int zbud_show_cumul_chunk_counts(char *buf
)
601 unsigned long i
, chunks
= 0, total_chunks
= 0, sum_total_chunks
= 0;
602 unsigned long total_chunks_lte_21
= 0, total_chunks_lte_32
= 0;
603 unsigned long total_chunks_lte_42
= 0;
606 for (i
= 0; i
< NCHUNKS
; i
++) {
607 p
+= sprintf(p
, "%lu ", zbud_cumul_chunk_counts
[i
]);
608 chunks
+= zbud_cumul_chunk_counts
[i
];
609 total_chunks
+= zbud_cumul_chunk_counts
[i
];
610 sum_total_chunks
+= i
* zbud_cumul_chunk_counts
[i
];
612 total_chunks_lte_21
= total_chunks
;
614 total_chunks_lte_32
= total_chunks
;
616 total_chunks_lte_42
= total_chunks
;
618 p
+= sprintf(p
, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
619 total_chunks_lte_21
, total_chunks_lte_32
, total_chunks_lte_42
,
620 chunks
== 0 ? 0 : sum_total_chunks
/ chunks
);
626 * This "zv" PAM implementation combines the TLSF-based xvMalloc
627 * with lzo1x compression to maximize the amount of data that can
628 * be packed into a physical page.
630 * Zv represents a PAM page with the index and object (plus a "size" value
631 * necessary for decompression) immediately preceding the compressed data.
634 #define ZVH_SENTINEL 0x43214321
643 /* rudimentary policy limits */
644 /* total number of persistent pages may not exceed this percentage */
645 static unsigned int zv_page_count_policy_percent
= 75;
647 * byte count defining poor compression; pages with greater zsize will be
650 static unsigned int zv_max_zsize
= (PAGE_SIZE
/ 8) * 7;
652 * byte count defining poor *mean* compression; pages with greater zsize
653 * will be rejected until sufficient better-compressed pages are accepted
654 * driving the mean below this threshold
656 static unsigned int zv_max_mean_zsize
= (PAGE_SIZE
/ 8) * 5;
658 static unsigned long zv_curr_dist_counts
[NCHUNKS
];
659 static unsigned long zv_cumul_dist_counts
[NCHUNKS
];
661 static struct zv_hdr
*zv_create(struct xv_pool
*xvpool
, uint32_t pool_id
,
662 struct tmem_oid
*oid
, uint32_t index
,
663 void *cdata
, unsigned clen
)
666 struct zv_hdr
*zv
= NULL
;
668 int alloc_size
= clen
+ sizeof(struct zv_hdr
);
669 int chunks
= (alloc_size
+ (CHUNK_SIZE
- 1)) >> CHUNK_SHIFT
;
672 BUG_ON(!irqs_disabled());
673 BUG_ON(chunks
>= NCHUNKS
);
674 ret
= xv_malloc(xvpool
, alloc_size
,
675 &page
, &offset
, ZCACHE_GFP_MASK
);
678 zv_curr_dist_counts
[chunks
]++;
679 zv_cumul_dist_counts
[chunks
]++;
680 zv
= kmap_atomic(page
, KM_USER0
) + offset
;
683 zv
->pool_id
= pool_id
;
684 SET_SENTINEL(zv
, ZVH
);
685 memcpy((char *)zv
+ sizeof(struct zv_hdr
), cdata
, clen
);
686 kunmap_atomic(zv
, KM_USER0
);
691 static void zv_free(struct xv_pool
*xvpool
, struct zv_hdr
*zv
)
696 uint16_t size
= xv_get_object_size(zv
);
697 int chunks
= (size
+ (CHUNK_SIZE
- 1)) >> CHUNK_SHIFT
;
699 ASSERT_SENTINEL(zv
, ZVH
);
700 BUG_ON(chunks
>= NCHUNKS
);
701 zv_curr_dist_counts
[chunks
]--;
704 INVERT_SENTINEL(zv
, ZVH
);
705 page
= virt_to_page(zv
);
706 offset
= (unsigned long)zv
& ~PAGE_MASK
;
707 local_irq_save(flags
);
708 xv_free(xvpool
, page
, offset
);
709 local_irq_restore(flags
);
712 static void zv_decompress(struct page
*page
, struct zv_hdr
*zv
)
714 size_t clen
= PAGE_SIZE
;
719 ASSERT_SENTINEL(zv
, ZVH
);
720 size
= xv_get_object_size(zv
) - sizeof(*zv
);
722 to_va
= kmap_atomic(page
, KM_USER0
);
723 ret
= lzo1x_decompress_safe((char *)zv
+ sizeof(*zv
),
725 kunmap_atomic(to_va
, KM_USER0
);
726 BUG_ON(ret
!= LZO_E_OK
);
727 BUG_ON(clen
!= PAGE_SIZE
);
732 * show a distribution of compression stats for zv pages.
735 static int zv_curr_dist_counts_show(char *buf
)
737 unsigned long i
, n
, chunks
= 0, sum_total_chunks
= 0;
740 for (i
= 0; i
< NCHUNKS
; i
++) {
741 n
= zv_curr_dist_counts
[i
];
742 p
+= sprintf(p
, "%lu ", n
);
744 sum_total_chunks
+= i
* n
;
746 p
+= sprintf(p
, "mean:%lu\n",
747 chunks
== 0 ? 0 : sum_total_chunks
/ chunks
);
751 static int zv_cumul_dist_counts_show(char *buf
)
753 unsigned long i
, n
, chunks
= 0, sum_total_chunks
= 0;
756 for (i
= 0; i
< NCHUNKS
; i
++) {
757 n
= zv_cumul_dist_counts
[i
];
758 p
+= sprintf(p
, "%lu ", n
);
760 sum_total_chunks
+= i
* n
;
762 p
+= sprintf(p
, "mean:%lu\n",
763 chunks
== 0 ? 0 : sum_total_chunks
/ chunks
);
768 * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
769 * pages that don't compress to less than this value (including metadata
770 * overhead) to be rejected. We don't allow the value to get too close
773 static ssize_t
zv_max_zsize_show(struct kobject
*kobj
,
774 struct kobj_attribute
*attr
,
777 return sprintf(buf
, "%u\n", zv_max_zsize
);
780 static ssize_t
zv_max_zsize_store(struct kobject
*kobj
,
781 struct kobj_attribute
*attr
,
782 const char *buf
, size_t count
)
787 if (!capable(CAP_SYS_ADMIN
))
790 err
= strict_strtoul(buf
, 10, &val
);
791 if (err
|| (val
== 0) || (val
> (PAGE_SIZE
/ 8) * 7))
798 * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
799 * pages that don't compress to less than this value (including metadata
800 * overhead) to be rejected UNLESS the mean compression is also smaller
801 * than this value. In other words, we are load-balancing-by-zsize the
802 * accepted pages. Again, we don't allow the value to get too close
805 static ssize_t
zv_max_mean_zsize_show(struct kobject
*kobj
,
806 struct kobj_attribute
*attr
,
809 return sprintf(buf
, "%u\n", zv_max_mean_zsize
);
812 static ssize_t
zv_max_mean_zsize_store(struct kobject
*kobj
,
813 struct kobj_attribute
*attr
,
814 const char *buf
, size_t count
)
819 if (!capable(CAP_SYS_ADMIN
))
822 err
= strict_strtoul(buf
, 10, &val
);
823 if (err
|| (val
== 0) || (val
> (PAGE_SIZE
/ 8) * 7))
825 zv_max_mean_zsize
= val
;
830 * setting zv_page_count_policy_percent via sysfs sets an upper bound of
831 * persistent (e.g. swap) pages that will be retained according to:
832 * (zv_page_count_policy_percent * totalram_pages) / 100)
833 * when that limit is reached, further puts will be rejected (until
834 * some pages have been flushed). Note that, due to compression,
835 * this number may exceed 100; it defaults to 75 and we set an
836 * arbitrary limit of 150. A poor choice will almost certainly result
837 * in OOM's, so this value should only be changed prudently.
839 static ssize_t
zv_page_count_policy_percent_show(struct kobject
*kobj
,
840 struct kobj_attribute
*attr
,
843 return sprintf(buf
, "%u\n", zv_page_count_policy_percent
);
846 static ssize_t
zv_page_count_policy_percent_store(struct kobject
*kobj
,
847 struct kobj_attribute
*attr
,
848 const char *buf
, size_t count
)
853 if (!capable(CAP_SYS_ADMIN
))
856 err
= strict_strtoul(buf
, 10, &val
);
857 if (err
|| (val
== 0) || (val
> 150))
859 zv_page_count_policy_percent
= val
;
863 static struct kobj_attribute zcache_zv_max_zsize_attr
= {
864 .attr
= { .name
= "zv_max_zsize", .mode
= 0644 },
865 .show
= zv_max_zsize_show
,
866 .store
= zv_max_zsize_store
,
869 static struct kobj_attribute zcache_zv_max_mean_zsize_attr
= {
870 .attr
= { .name
= "zv_max_mean_zsize", .mode
= 0644 },
871 .show
= zv_max_mean_zsize_show
,
872 .store
= zv_max_mean_zsize_store
,
875 static struct kobj_attribute zcache_zv_page_count_policy_percent_attr
= {
876 .attr
= { .name
= "zv_page_count_policy_percent",
878 .show
= zv_page_count_policy_percent_show
,
879 .store
= zv_page_count_policy_percent_store
,
884 * zcache core code starts here
887 /* useful stats not collected by cleancache or frontswap */
888 static unsigned long zcache_flush_total
;
889 static unsigned long zcache_flush_found
;
890 static unsigned long zcache_flobj_total
;
891 static unsigned long zcache_flobj_found
;
892 static unsigned long zcache_failed_eph_puts
;
893 static unsigned long zcache_failed_pers_puts
;
896 * Tmem operations assume the poolid implies the invoking client.
897 * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
898 * RAMster has each client numbered by cluster node, and a KVM version
899 * of zcache would have one client per guest and each client might
902 static struct tmem_pool
*zcache_get_pool_by_id(uint16_t cli_id
, uint16_t poolid
)
904 struct tmem_pool
*pool
= NULL
;
905 struct zcache_client
*cli
= NULL
;
907 if (cli_id
== LOCAL_CLIENT
)
910 if (cli_id
>= MAX_CLIENTS
)
912 cli
= &zcache_clients
[cli_id
];
915 atomic_inc(&cli
->refcount
);
917 if (poolid
< MAX_POOLS_PER_CLIENT
) {
918 pool
= cli
->tmem_pools
[poolid
];
920 atomic_inc(&pool
->refcount
);
926 static void zcache_put_pool(struct tmem_pool
*pool
)
928 struct zcache_client
*cli
= NULL
;
933 atomic_dec(&pool
->refcount
);
934 atomic_dec(&cli
->refcount
);
937 int zcache_new_client(uint16_t cli_id
)
939 struct zcache_client
*cli
= NULL
;
942 if (cli_id
== LOCAL_CLIENT
)
944 else if ((unsigned int)cli_id
< MAX_CLIENTS
)
945 cli
= &zcache_clients
[cli_id
];
951 #ifdef CONFIG_FRONTSWAP
952 cli
->xvpool
= xv_create_pool();
953 if (cli
->xvpool
== NULL
)
961 /* counters for debugging */
962 static unsigned long zcache_failed_get_free_pages
;
963 static unsigned long zcache_failed_alloc
;
964 static unsigned long zcache_put_to_flush
;
965 static unsigned long zcache_aborted_preload
;
966 static unsigned long zcache_aborted_shrink
;
969 * Ensure that memory allocation requests in zcache don't result
970 * in direct reclaim requests via the shrinker, which would cause
971 * an infinite loop. Maybe a GFP flag would be better?
973 static DEFINE_SPINLOCK(zcache_direct_reclaim_lock
);
976 * for now, used named slabs so can easily track usage; later can
977 * either just use kmalloc, or perhaps add a slab-like allocator
978 * to more carefully manage total memory utilization
980 static struct kmem_cache
*zcache_objnode_cache
;
981 static struct kmem_cache
*zcache_obj_cache
;
982 static atomic_t zcache_curr_obj_count
= ATOMIC_INIT(0);
983 static unsigned long zcache_curr_obj_count_max
;
984 static atomic_t zcache_curr_objnode_count
= ATOMIC_INIT(0);
985 static unsigned long zcache_curr_objnode_count_max
;
988 * to avoid memory allocation recursion (e.g. due to direct reclaim), we
989 * preload all necessary data structures so the hostops callbacks never
990 * actually do a malloc
992 struct zcache_preload
{
994 struct tmem_obj
*obj
;
996 struct tmem_objnode
*objnodes
[OBJNODE_TREE_MAX_PATH
];
998 static DEFINE_PER_CPU(struct zcache_preload
, zcache_preloads
) = { 0, };
1000 static int zcache_do_preload(struct tmem_pool
*pool
)
1002 struct zcache_preload
*kp
;
1003 struct tmem_objnode
*objnode
;
1004 struct tmem_obj
*obj
;
1008 if (unlikely(zcache_objnode_cache
== NULL
))
1010 if (unlikely(zcache_obj_cache
== NULL
))
1012 if (!spin_trylock(&zcache_direct_reclaim_lock
)) {
1013 zcache_aborted_preload
++;
1017 kp
= &__get_cpu_var(zcache_preloads
);
1018 while (kp
->nr
< ARRAY_SIZE(kp
->objnodes
)) {
1019 preempt_enable_no_resched();
1020 objnode
= kmem_cache_alloc(zcache_objnode_cache
,
1022 if (unlikely(objnode
== NULL
)) {
1023 zcache_failed_alloc
++;
1027 kp
= &__get_cpu_var(zcache_preloads
);
1028 if (kp
->nr
< ARRAY_SIZE(kp
->objnodes
))
1029 kp
->objnodes
[kp
->nr
++] = objnode
;
1031 kmem_cache_free(zcache_objnode_cache
, objnode
);
1033 preempt_enable_no_resched();
1034 obj
= kmem_cache_alloc(zcache_obj_cache
, ZCACHE_GFP_MASK
);
1035 if (unlikely(obj
== NULL
)) {
1036 zcache_failed_alloc
++;
1039 page
= (void *)__get_free_page(ZCACHE_GFP_MASK
);
1040 if (unlikely(page
== NULL
)) {
1041 zcache_failed_get_free_pages
++;
1042 kmem_cache_free(zcache_obj_cache
, obj
);
1046 kp
= &__get_cpu_var(zcache_preloads
);
1047 if (kp
->obj
== NULL
)
1050 kmem_cache_free(zcache_obj_cache
, obj
);
1051 if (kp
->page
== NULL
)
1054 free_page((unsigned long)page
);
1057 spin_unlock(&zcache_direct_reclaim_lock
);
1062 static void *zcache_get_free_page(void)
1064 struct zcache_preload
*kp
;
1067 kp
= &__get_cpu_var(zcache_preloads
);
1069 BUG_ON(page
== NULL
);
/*
 * Hand the single page whose kernel virtual address is @p back to the
 * page allocator via free_page().
 */
static void zcache_free_page(void *p)
{
	unsigned long addr = (unsigned long)p;

	free_page(addr);
}
1080 * zcache implementation for tmem host ops
1083 static struct tmem_objnode
*zcache_objnode_alloc(struct tmem_pool
*pool
)
1085 struct tmem_objnode
*objnode
= NULL
;
1086 unsigned long count
;
1087 struct zcache_preload
*kp
;
1089 kp
= &__get_cpu_var(zcache_preloads
);
1092 objnode
= kp
->objnodes
[kp
->nr
- 1];
1093 BUG_ON(objnode
== NULL
);
1094 kp
->objnodes
[kp
->nr
- 1] = NULL
;
1096 count
= atomic_inc_return(&zcache_curr_objnode_count
);
1097 if (count
> zcache_curr_objnode_count_max
)
1098 zcache_curr_objnode_count_max
= count
;
1103 static void zcache_objnode_free(struct tmem_objnode
*objnode
,
1104 struct tmem_pool
*pool
)
1106 atomic_dec(&zcache_curr_objnode_count
);
1107 BUG_ON(atomic_read(&zcache_curr_objnode_count
) < 0);
1108 kmem_cache_free(zcache_objnode_cache
, objnode
);
1111 static struct tmem_obj
*zcache_obj_alloc(struct tmem_pool
*pool
)
1113 struct tmem_obj
*obj
= NULL
;
1114 unsigned long count
;
1115 struct zcache_preload
*kp
;
1117 kp
= &__get_cpu_var(zcache_preloads
);
1119 BUG_ON(obj
== NULL
);
1121 count
= atomic_inc_return(&zcache_curr_obj_count
);
1122 if (count
> zcache_curr_obj_count_max
)
1123 zcache_curr_obj_count_max
= count
;
1127 static void zcache_obj_free(struct tmem_obj
*obj
, struct tmem_pool
*pool
)
1129 atomic_dec(&zcache_curr_obj_count
);
1130 BUG_ON(atomic_read(&zcache_curr_obj_count
) < 0);
1131 kmem_cache_free(zcache_obj_cache
, obj
);
1134 static struct tmem_hostops zcache_hostops
= {
1135 .obj_alloc
= zcache_obj_alloc
,
1136 .obj_free
= zcache_obj_free
,
1137 .objnode_alloc
= zcache_objnode_alloc
,
1138 .objnode_free
= zcache_objnode_free
,
1142 * zcache implementations for PAM page descriptor ops
1145 static atomic_t zcache_curr_eph_pampd_count
= ATOMIC_INIT(0);
1146 static unsigned long zcache_curr_eph_pampd_count_max
;
1147 static atomic_t zcache_curr_pers_pampd_count
= ATOMIC_INIT(0);
1148 static unsigned long zcache_curr_pers_pampd_count_max
;
1150 /* forward reference */
1151 static int zcache_compress(struct page
*from
, void **out_va
, size_t *out_len
);
1153 static void *zcache_pampd_create(char *data
, size_t size
, bool raw
, int eph
,
1154 struct tmem_pool
*pool
, struct tmem_oid
*oid
,
1157 void *pampd
= NULL
, *cdata
;
1160 unsigned long count
;
1161 struct page
*page
= virt_to_page(data
);
1162 struct zcache_client
*cli
= pool
->client
;
1163 uint16_t client_id
= get_client_id_from_client(cli
);
1164 unsigned long zv_mean_zsize
;
1165 unsigned long curr_pers_pampd_count
;
1169 ret
= zcache_compress(page
, &cdata
, &clen
);
1172 if (clen
== 0 || clen
> zbud_max_buddy_size()) {
1173 zcache_compress_poor
++;
1176 pampd
= (void *)zbud_create(client_id
, pool
->pool_id
, oid
,
1177 index
, page
, cdata
, clen
);
1178 if (pampd
!= NULL
) {
1179 count
= atomic_inc_return(&zcache_curr_eph_pampd_count
);
1180 if (count
> zcache_curr_eph_pampd_count_max
)
1181 zcache_curr_eph_pampd_count_max
= count
;
1184 curr_pers_pampd_count
=
1185 atomic_read(&zcache_curr_pers_pampd_count
);
1186 if (curr_pers_pampd_count
>
1187 (zv_page_count_policy_percent
* totalram_pages
) / 100)
1189 ret
= zcache_compress(page
, &cdata
, &clen
);
1192 /* reject if compression is too poor */
1193 if (clen
> zv_max_zsize
) {
1194 zcache_compress_poor
++;
1197 /* reject if mean compression is too poor */
1198 if ((clen
> zv_max_mean_zsize
) && (curr_pers_pampd_count
> 0)) {
1199 total_zsize
= xv_get_total_size_bytes(cli
->xvpool
);
1200 zv_mean_zsize
= div_u64(total_zsize
,
1201 curr_pers_pampd_count
);
1202 if (zv_mean_zsize
> zv_max_mean_zsize
) {
1203 zcache_mean_compress_poor
++;
1207 pampd
= (void *)zv_create(cli
->xvpool
, pool
->pool_id
,
1208 oid
, index
, cdata
, clen
);
1211 count
= atomic_inc_return(&zcache_curr_pers_pampd_count
);
1212 if (count
> zcache_curr_pers_pampd_count_max
)
1213 zcache_curr_pers_pampd_count_max
= count
;
1220 * fill the pageframe corresponding to the struct page with the data
1221 * from the passed pampd
1223 static int zcache_pampd_get_data(char *data
, size_t *bufsize
, bool raw
,
1224 void *pampd
, struct tmem_pool
*pool
,
1225 struct tmem_oid
*oid
, uint32_t index
)
1229 BUG_ON(is_ephemeral(pool
));
1230 zv_decompress(virt_to_page(data
), pampd
);
1235 * fill the pageframe corresponding to the struct page with the data
1236 * from the passed pampd
1238 static int zcache_pampd_get_data_and_free(char *data
, size_t *bufsize
, bool raw
,
1239 void *pampd
, struct tmem_pool
*pool
,
1240 struct tmem_oid
*oid
, uint32_t index
)
1244 BUG_ON(!is_ephemeral(pool
));
1245 zbud_decompress(virt_to_page(data
), pampd
);
1246 zbud_free_and_delist((struct zbud_hdr
*)pampd
);
1247 atomic_dec(&zcache_curr_eph_pampd_count
);
1252 * free the pampd and remove it from any zcache lists
1253 * pampd must no longer be pointed to from any tmem data structures!
1255 static void zcache_pampd_free(void *pampd
, struct tmem_pool
*pool
,
1256 struct tmem_oid
*oid
, uint32_t index
)
1258 struct zcache_client
*cli
= pool
->client
;
1260 if (is_ephemeral(pool
)) {
1261 zbud_free_and_delist((struct zbud_hdr
*)pampd
);
1262 atomic_dec(&zcache_curr_eph_pampd_count
);
1263 BUG_ON(atomic_read(&zcache_curr_eph_pampd_count
) < 0);
1265 zv_free(cli
->xvpool
, (struct zv_hdr
*)pampd
);
1266 atomic_dec(&zcache_curr_pers_pampd_count
);
1267 BUG_ON(atomic_read(&zcache_curr_pers_pampd_count
) < 0);
/* pamops hook for object teardown; zcache has nothing to do here */
static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)
{
	/* intentionally empty */
}
/* pamops hook for object creation; zcache has nothing to do here */
static void zcache_pampd_new_obj(struct tmem_obj *obj)
{
	/* intentionally empty */
}
/* pamops hook for in-place pampd replacement; not supported, always -1 */
static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)
{
	int unsupported = -1;

	return unsupported;
}
/* pamops hook: zcache never holds remote pampds, so this is always false */
static bool zcache_pampd_is_remote(void *pampd)
{
	return false;
}
1289 static struct tmem_pamops zcache_pamops
= {
1290 .create
= zcache_pampd_create
,
1291 .get_data
= zcache_pampd_get_data
,
1292 .get_data_and_free
= zcache_pampd_get_data_and_free
,
1293 .free
= zcache_pampd_free
,
1294 .free_obj
= zcache_pampd_free_obj
,
1295 .new_obj
= zcache_pampd_new_obj
,
1296 .replace_in_obj
= zcache_pampd_replace_in_obj
,
1297 .is_remote
= zcache_pampd_is_remote
,
1301 * zcache compression/decompression and related per-cpu stuff
1304 #define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
1305 #define LZO_DSTMEM_PAGE_ORDER 1
1306 static DEFINE_PER_CPU(unsigned char *, zcache_workmem
);
1307 static DEFINE_PER_CPU(unsigned char *, zcache_dstmem
);
1309 static int zcache_compress(struct page
*from
, void **out_va
, size_t *out_len
)
1312 unsigned char *dmem
= __get_cpu_var(zcache_dstmem
);
1313 unsigned char *wmem
= __get_cpu_var(zcache_workmem
);
1316 BUG_ON(!irqs_disabled());
1317 if (unlikely(dmem
== NULL
|| wmem
== NULL
))
1318 goto out
; /* no buffer, so can't compress */
1319 from_va
= kmap_atomic(from
, KM_USER0
);
1321 ret
= lzo1x_1_compress(from_va
, PAGE_SIZE
, dmem
, out_len
, wmem
);
1322 BUG_ON(ret
!= LZO_E_OK
);
1324 kunmap_atomic(from_va
, KM_USER0
);
1331 static int zcache_cpu_notifier(struct notifier_block
*nb
,
1332 unsigned long action
, void *pcpu
)
1334 int cpu
= (long)pcpu
;
1335 struct zcache_preload
*kp
;
1338 case CPU_UP_PREPARE
:
1339 per_cpu(zcache_dstmem
, cpu
) = (void *)__get_free_pages(
1340 GFP_KERNEL
| __GFP_REPEAT
,
1341 LZO_DSTMEM_PAGE_ORDER
),
1342 per_cpu(zcache_workmem
, cpu
) =
1343 kzalloc(LZO1X_MEM_COMPRESS
,
1344 GFP_KERNEL
| __GFP_REPEAT
);
1347 case CPU_UP_CANCELED
:
1348 free_pages((unsigned long)per_cpu(zcache_dstmem
, cpu
),
1349 LZO_DSTMEM_PAGE_ORDER
);
1350 per_cpu(zcache_dstmem
, cpu
) = NULL
;
1351 kfree(per_cpu(zcache_workmem
, cpu
));
1352 per_cpu(zcache_workmem
, cpu
) = NULL
;
1353 kp
= &per_cpu(zcache_preloads
, cpu
);
1355 kmem_cache_free(zcache_objnode_cache
,
1356 kp
->objnodes
[kp
->nr
- 1]);
1357 kp
->objnodes
[kp
->nr
- 1] = NULL
;
1360 kmem_cache_free(zcache_obj_cache
, kp
->obj
);
1361 free_page((unsigned long)kp
->page
);
1369 static struct notifier_block zcache_cpu_notifier_block
= {
1370 .notifier_call
= zcache_cpu_notifier
/*
 * Generators for read-only (mode 0444) sysfs attributes.  Each one emits
 * a zcache_<name>_show() function and a zcache_<name>_attr kobj_attribute
 * whose file name is <name>.
 *
 *   ZCACHE_SYSFS_RO         prints the variable zcache_<name> with "%lu"
 *   ZCACHE_SYSFS_RO_ATOMIC  prints atomic_read(&zcache_<name>) with "%d"
 *   ZCACHE_SYSFS_RO_CUSTOM  delegates the formatting to _func(buf)
 */
#define ZCACHE_SYSFS_RO(_name)						\
	static ssize_t zcache_##_name##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr, char *buf)	\
	{								\
		return sprintf(buf, "%lu\n", zcache_##_name);		\
	}								\
	static struct kobj_attribute zcache_##_name##_attr = {		\
		.attr = { .name = __stringify(_name), .mode = 0444 },	\
		.show = zcache_##_name##_show,				\
	}

#define ZCACHE_SYSFS_RO_ATOMIC(_name)					\
	static ssize_t zcache_##_name##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr, char *buf)	\
	{								\
		return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
	}								\
	static struct kobj_attribute zcache_##_name##_attr = {		\
		.attr = { .name = __stringify(_name), .mode = 0444 },	\
		.show = zcache_##_name##_show,				\
	}

#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func)				\
	static ssize_t zcache_##_name##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr, char *buf)	\
	{								\
		return _func(buf);					\
	}								\
	static struct kobj_attribute zcache_##_name##_attr = {		\
		.attr = { .name = __stringify(_name), .mode = 0444 },	\
		.show = zcache_##_name##_show,				\
	}
/* plain counters, printed with "%lu" */
ZCACHE_SYSFS_RO(curr_obj_count_max);
ZCACHE_SYSFS_RO(curr_objnode_count_max);
ZCACHE_SYSFS_RO(flush_total);
ZCACHE_SYSFS_RO(flush_found);
ZCACHE_SYSFS_RO(flobj_total);
ZCACHE_SYSFS_RO(flobj_found);
ZCACHE_SYSFS_RO(failed_eph_puts);
ZCACHE_SYSFS_RO(failed_pers_puts);
ZCACHE_SYSFS_RO(zbud_curr_zbytes);
ZCACHE_SYSFS_RO(zbud_cumul_zpages);
ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
ZCACHE_SYSFS_RO(zbud_buddied_count);
ZCACHE_SYSFS_RO(zbpg_unused_list_count);
ZCACHE_SYSFS_RO(evicted_raw_pages);
ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
ZCACHE_SYSFS_RO(evicted_buddied_pages);
ZCACHE_SYSFS_RO(failed_get_free_pages);
ZCACHE_SYSFS_RO(failed_alloc);
ZCACHE_SYSFS_RO(put_to_flush);
ZCACHE_SYSFS_RO(aborted_preload);
ZCACHE_SYSFS_RO(aborted_shrink);
ZCACHE_SYSFS_RO(compress_poor);
ZCACHE_SYSFS_RO(mean_compress_poor);

/* atomic counters, printed with "%d" */
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);

/* attributes whose text is produced by a dedicated formatter */
ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
			zbud_show_unbuddied_list_counts);
ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
			zbud_show_cumul_chunk_counts);
ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
			zv_curr_dist_counts_show);
ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
			zv_cumul_dist_counts_show);
1443 static struct attribute
*zcache_attrs
[] = {
1444 &zcache_curr_obj_count_attr
.attr
,
1445 &zcache_curr_obj_count_max_attr
.attr
,
1446 &zcache_curr_objnode_count_attr
.attr
,
1447 &zcache_curr_objnode_count_max_attr
.attr
,
1448 &zcache_flush_total_attr
.attr
,
1449 &zcache_flobj_total_attr
.attr
,
1450 &zcache_flush_found_attr
.attr
,
1451 &zcache_flobj_found_attr
.attr
,
1452 &zcache_failed_eph_puts_attr
.attr
,
1453 &zcache_failed_pers_puts_attr
.attr
,
1454 &zcache_compress_poor_attr
.attr
,
1455 &zcache_mean_compress_poor_attr
.attr
,
1456 &zcache_zbud_curr_raw_pages_attr
.attr
,
1457 &zcache_zbud_curr_zpages_attr
.attr
,
1458 &zcache_zbud_curr_zbytes_attr
.attr
,
1459 &zcache_zbud_cumul_zpages_attr
.attr
,
1460 &zcache_zbud_cumul_zbytes_attr
.attr
,
1461 &zcache_zbud_buddied_count_attr
.attr
,
1462 &zcache_zbpg_unused_list_count_attr
.attr
,
1463 &zcache_evicted_raw_pages_attr
.attr
,
1464 &zcache_evicted_unbuddied_pages_attr
.attr
,
1465 &zcache_evicted_buddied_pages_attr
.attr
,
1466 &zcache_failed_get_free_pages_attr
.attr
,
1467 &zcache_failed_alloc_attr
.attr
,
1468 &zcache_put_to_flush_attr
.attr
,
1469 &zcache_aborted_preload_attr
.attr
,
1470 &zcache_aborted_shrink_attr
.attr
,
1471 &zcache_zbud_unbuddied_list_counts_attr
.attr
,
1472 &zcache_zbud_cumul_chunk_counts_attr
.attr
,
1473 &zcache_zv_curr_dist_counts_attr
.attr
,
1474 &zcache_zv_cumul_dist_counts_attr
.attr
,
1475 &zcache_zv_max_zsize_attr
.attr
,
1476 &zcache_zv_max_mean_zsize_attr
.attr
,
1477 &zcache_zv_page_count_policy_percent_attr
.attr
,
1481 static struct attribute_group zcache_attr_group
= {
1482 .attrs
= zcache_attrs
,
1486 #endif /* CONFIG_SYSFS */
1488 * When zcache is disabled ("frozen"), pools can be created and destroyed,
1489 * but all puts (and thus all other operations that require memory allocation)
1490 * must fail. If zcache is unfrozen, accepts puts, then frozen again,
1491 * data consistency requires all puts while frozen to be converted into
1494 static bool zcache_freeze
;
1497 * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
1499 static int shrink_zcache_memory(struct shrinker
*shrink
,
1500 struct shrink_control
*sc
)
1503 int nr
= sc
->nr_to_scan
;
1504 gfp_t gfp_mask
= sc
->gfp_mask
;
1507 if (!(gfp_mask
& __GFP_FS
))
1508 /* does this case really need to be skipped? */
1510 if (spin_trylock(&zcache_direct_reclaim_lock
)) {
1511 zbud_evict_pages(nr
);
1512 spin_unlock(&zcache_direct_reclaim_lock
);
1514 zcache_aborted_shrink
++;
1516 ret
= (int)atomic_read(&zcache_zbud_curr_raw_pages
);
1521 static struct shrinker zcache_shrinker
= {
1522 .shrink
= shrink_zcache_memory
,
1523 .seeks
= DEFAULT_SEEKS
,
1527 * zcache shims between cleancache/frontswap ops and tmem
1530 static int zcache_put_page(int cli_id
, int pool_id
, struct tmem_oid
*oidp
,
1531 uint32_t index
, struct page
*page
)
1533 struct tmem_pool
*pool
;
1536 BUG_ON(!irqs_disabled());
1537 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1538 if (unlikely(pool
== NULL
))
1540 if (!zcache_freeze
&& zcache_do_preload(pool
) == 0) {
1541 /* preload does preempt_disable on success */
1542 ret
= tmem_put(pool
, oidp
, index
, page_address(page
),
1543 PAGE_SIZE
, 0, is_ephemeral(pool
));
1545 if (is_ephemeral(pool
))
1546 zcache_failed_eph_puts
++;
1548 zcache_failed_pers_puts
++;
1550 zcache_put_pool(pool
);
1551 preempt_enable_no_resched();
1553 zcache_put_to_flush
++;
1554 if (atomic_read(&pool
->obj_count
) > 0)
1555 /* the put fails whether the flush succeeds or not */
1556 (void)tmem_flush_page(pool
, oidp
, index
);
1557 zcache_put_pool(pool
);
1563 static int zcache_get_page(int cli_id
, int pool_id
, struct tmem_oid
*oidp
,
1564 uint32_t index
, struct page
*page
)
1566 struct tmem_pool
*pool
;
1568 unsigned long flags
;
1569 size_t size
= PAGE_SIZE
;
1571 local_irq_save(flags
);
1572 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1573 if (likely(pool
!= NULL
)) {
1574 if (atomic_read(&pool
->obj_count
) > 0)
1575 ret
= tmem_get(pool
, oidp
, index
, page_address(page
),
1576 &size
, 0, is_ephemeral(pool
));
1577 zcache_put_pool(pool
);
1579 local_irq_restore(flags
);
1583 static int zcache_flush_page(int cli_id
, int pool_id
,
1584 struct tmem_oid
*oidp
, uint32_t index
)
1586 struct tmem_pool
*pool
;
1588 unsigned long flags
;
1590 local_irq_save(flags
);
1591 zcache_flush_total
++;
1592 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1593 if (likely(pool
!= NULL
)) {
1594 if (atomic_read(&pool
->obj_count
) > 0)
1595 ret
= tmem_flush_page(pool
, oidp
, index
);
1596 zcache_put_pool(pool
);
1599 zcache_flush_found
++;
1600 local_irq_restore(flags
);
1604 static int zcache_flush_object(int cli_id
, int pool_id
,
1605 struct tmem_oid
*oidp
)
1607 struct tmem_pool
*pool
;
1609 unsigned long flags
;
1611 local_irq_save(flags
);
1612 zcache_flobj_total
++;
1613 pool
= zcache_get_pool_by_id(cli_id
, pool_id
);
1614 if (likely(pool
!= NULL
)) {
1615 if (atomic_read(&pool
->obj_count
) > 0)
1616 ret
= tmem_flush_object(pool
, oidp
);
1617 zcache_put_pool(pool
);
1620 zcache_flobj_found
++;
1621 local_irq_restore(flags
);
1625 static int zcache_destroy_pool(int cli_id
, int pool_id
)
1627 struct tmem_pool
*pool
= NULL
;
1628 struct zcache_client
*cli
= NULL
;
1633 if (cli_id
== LOCAL_CLIENT
)
1635 else if ((unsigned int)cli_id
< MAX_CLIENTS
)
1636 cli
= &zcache_clients
[cli_id
];
1639 atomic_inc(&cli
->refcount
);
1640 pool
= cli
->tmem_pools
[pool_id
];
1643 cli
->tmem_pools
[pool_id
] = NULL
;
1644 /* wait for pool activity on other cpus to quiesce */
1645 while (atomic_read(&pool
->refcount
) != 0)
1647 atomic_dec(&cli
->refcount
);
1649 ret
= tmem_destroy_pool(pool
);
1652 pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
1658 static int zcache_new_pool(uint16_t cli_id
, uint32_t flags
)
1661 struct tmem_pool
*pool
;
1662 struct zcache_client
*cli
= NULL
;
1664 if (cli_id
== LOCAL_CLIENT
)
1666 else if ((unsigned int)cli_id
< MAX_CLIENTS
)
1667 cli
= &zcache_clients
[cli_id
];
1670 atomic_inc(&cli
->refcount
);
1671 pool
= kmalloc(sizeof(struct tmem_pool
), GFP_KERNEL
);
1673 pr_info("zcache: pool creation failed: out of memory\n");
1677 for (poolid
= 0; poolid
< MAX_POOLS_PER_CLIENT
; poolid
++)
1678 if (cli
->tmem_pools
[poolid
] == NULL
)
1680 if (poolid
>= MAX_POOLS_PER_CLIENT
) {
1681 pr_info("zcache: pool creation failed: max exceeded\n");
1686 atomic_set(&pool
->refcount
, 0);
1688 pool
->pool_id
= poolid
;
1689 tmem_new_pool(pool
, flags
);
1690 cli
->tmem_pools
[poolid
] = pool
;
1691 pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
1692 flags
& TMEM_POOL_PERSIST
? "persistent" : "ephemeral",
1696 atomic_dec(&cli
->refcount
);
1701 * Two kernel functionalities currently can be layered on top of tmem.
1702 * These are "cleancache" which is used as a second-chance cache for clean
1703 * page cache pages; and "frontswap" which is used for swap pages
1704 * to avoid writes to disk. A generic "shim" is provided here for each
1705 * to translate in-kernel semantics to zcache semantics.
1708 #ifdef CONFIG_CLEANCACHE
1709 static void zcache_cleancache_put_page(int pool_id
,
1710 struct cleancache_filekey key
,
1711 pgoff_t index
, struct page
*page
)
1713 u32 ind
= (u32
) index
;
1714 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1716 if (likely(ind
== index
))
1717 (void)zcache_put_page(LOCAL_CLIENT
, pool_id
, &oid
, index
, page
);
1720 static int zcache_cleancache_get_page(int pool_id
,
1721 struct cleancache_filekey key
,
1722 pgoff_t index
, struct page
*page
)
1724 u32 ind
= (u32
) index
;
1725 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1728 if (likely(ind
== index
))
1729 ret
= zcache_get_page(LOCAL_CLIENT
, pool_id
, &oid
, index
, page
);
1733 static void zcache_cleancache_flush_page(int pool_id
,
1734 struct cleancache_filekey key
,
1737 u32 ind
= (u32
) index
;
1738 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1740 if (likely(ind
== index
))
1741 (void)zcache_flush_page(LOCAL_CLIENT
, pool_id
, &oid
, ind
);
1744 static void zcache_cleancache_flush_inode(int pool_id
,
1745 struct cleancache_filekey key
)
1747 struct tmem_oid oid
= *(struct tmem_oid
*)&key
;
1749 (void)zcache_flush_object(LOCAL_CLIENT
, pool_id
, &oid
);
1752 static void zcache_cleancache_flush_fs(int pool_id
)
1755 (void)zcache_destroy_pool(LOCAL_CLIENT
, pool_id
);
1758 static int zcache_cleancache_init_fs(size_t pagesize
)
1760 BUG_ON(sizeof(struct cleancache_filekey
) !=
1761 sizeof(struct tmem_oid
));
1762 BUG_ON(pagesize
!= PAGE_SIZE
);
1763 return zcache_new_pool(LOCAL_CLIENT
, 0);
1766 static int zcache_cleancache_init_shared_fs(char *uuid
, size_t pagesize
)
1768 /* shared pools are unsupported and map to private */
1769 BUG_ON(sizeof(struct cleancache_filekey
) !=
1770 sizeof(struct tmem_oid
));
1771 BUG_ON(pagesize
!= PAGE_SIZE
);
1772 return zcache_new_pool(LOCAL_CLIENT
, 0);
1775 static struct cleancache_ops zcache_cleancache_ops
= {
1776 .put_page
= zcache_cleancache_put_page
,
1777 .get_page
= zcache_cleancache_get_page
,
1778 .flush_page
= zcache_cleancache_flush_page
,
1779 .flush_inode
= zcache_cleancache_flush_inode
,
1780 .flush_fs
= zcache_cleancache_flush_fs
,
1781 .init_shared_fs
= zcache_cleancache_init_shared_fs
,
1782 .init_fs
= zcache_cleancache_init_fs
1785 struct cleancache_ops
zcache_cleancache_register_ops(void)
1787 struct cleancache_ops old_ops
=
1788 cleancache_register_ops(&zcache_cleancache_ops
);
1794 #ifdef CONFIG_FRONTSWAP
/*
 * One tmem pool id serves every frontswap "type" (swapfile).  It stays
 * negative until zcache_frontswap_init() has created the pool.
 */
static int zcache_frontswap_poolid = -1;
/*
 * Swizzling increases objects per swaptype, increasing tmem concurrency
 * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
 *
 * _oswiz() builds an object id from the swap type and the low SWIZ_BITS
 * bits of the offset; iswiz() yields the remaining high bits of the
 * offset, which serve as the index inside that object.  Every macro
 * argument is parenthesized so compound expressions expand with the
 * intended precedence.
 */
#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind)	(((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
#define iswiz(_ind)		((_ind) >> SWIZ_BITS)
1807 static inline struct tmem_oid
oswiz(unsigned type
, u32 ind
)
1809 struct tmem_oid oid
= { .oid
= { 0 } };
1810 oid
.oid
[0] = _oswiz(type
, ind
);
1814 static int zcache_frontswap_put_page(unsigned type
, pgoff_t offset
,
1817 u64 ind64
= (u64
)offset
;
1818 u32 ind
= (u32
)offset
;
1819 struct tmem_oid oid
= oswiz(type
, ind
);
1821 unsigned long flags
;
1823 BUG_ON(!PageLocked(page
));
1824 if (likely(ind64
== ind
)) {
1825 local_irq_save(flags
);
1826 ret
= zcache_put_page(LOCAL_CLIENT
, zcache_frontswap_poolid
,
1827 &oid
, iswiz(ind
), page
);
1828 local_irq_restore(flags
);
1833 /* returns 0 if the page was successfully gotten from frontswap, -1 if
1834 * was not present (should never happen!) */
1835 static int zcache_frontswap_get_page(unsigned type
, pgoff_t offset
,
1838 u64 ind64
= (u64
)offset
;
1839 u32 ind
= (u32
)offset
;
1840 struct tmem_oid oid
= oswiz(type
, ind
);
1843 BUG_ON(!PageLocked(page
));
1844 if (likely(ind64
== ind
))
1845 ret
= zcache_get_page(LOCAL_CLIENT
, zcache_frontswap_poolid
,
1846 &oid
, iswiz(ind
), page
);
1850 /* flush a single page from frontswap */
1851 static void zcache_frontswap_flush_page(unsigned type
, pgoff_t offset
)
1853 u64 ind64
= (u64
)offset
;
1854 u32 ind
= (u32
)offset
;
1855 struct tmem_oid oid
= oswiz(type
, ind
);
1857 if (likely(ind64
== ind
))
1858 (void)zcache_flush_page(LOCAL_CLIENT
, zcache_frontswap_poolid
,
1862 /* flush all pages from the passed swaptype */
1863 static void zcache_frontswap_flush_area(unsigned type
)
1865 struct tmem_oid oid
;
1868 for (ind
= SWIZ_MASK
; ind
>= 0; ind
--) {
1869 oid
= oswiz(type
, ind
);
1870 (void)zcache_flush_object(LOCAL_CLIENT
,
1871 zcache_frontswap_poolid
, &oid
);
1875 static void zcache_frontswap_init(unsigned ignored
)
1877 /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1878 if (zcache_frontswap_poolid
< 0)
1879 zcache_frontswap_poolid
=
1880 zcache_new_pool(LOCAL_CLIENT
, TMEM_POOL_PERSIST
);
1883 static struct frontswap_ops zcache_frontswap_ops
= {
1884 .put_page
= zcache_frontswap_put_page
,
1885 .get_page
= zcache_frontswap_get_page
,
1886 .flush_page
= zcache_frontswap_flush_page
,
1887 .flush_area
= zcache_frontswap_flush_area
,
1888 .init
= zcache_frontswap_init
1891 struct frontswap_ops
zcache_frontswap_register_ops(void)
1893 struct frontswap_ops old_ops
=
1894 frontswap_register_ops(&zcache_frontswap_ops
);
1901 * zcache initialization
1902 * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
1906 static int zcache_enabled
;
1908 static int __init
enable_zcache(char *s
)
1913 __setup("zcache", enable_zcache
);
1915 /* allow independent dynamic disabling of cleancache and frontswap */
1917 static int use_cleancache
= 1;
1919 static int __init
no_cleancache(char *s
)
1925 __setup("nocleancache", no_cleancache
);
1927 static int use_frontswap
= 1;
1929 static int __init
no_frontswap(char *s
)
1935 __setup("nofrontswap", no_frontswap
);
1937 static int __init
zcache_init(void)
1942 ret
= sysfs_create_group(mm_kobj
, &zcache_attr_group
);
1944 pr_err("zcache: can't create sysfs\n");
1947 #endif /* CONFIG_SYSFS */
1948 #if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
1949 if (zcache_enabled
) {
1952 tmem_register_hostops(&zcache_hostops
);
1953 tmem_register_pamops(&zcache_pamops
);
1954 ret
= register_cpu_notifier(&zcache_cpu_notifier_block
);
1956 pr_err("zcache: can't register cpu notifier\n");
1959 for_each_online_cpu(cpu
) {
1960 void *pcpu
= (void *)(long)cpu
;
1961 zcache_cpu_notifier(&zcache_cpu_notifier_block
,
1962 CPU_UP_PREPARE
, pcpu
);
1965 zcache_objnode_cache
= kmem_cache_create("zcache_objnode",
1966 sizeof(struct tmem_objnode
), 0, 0, NULL
);
1967 zcache_obj_cache
= kmem_cache_create("zcache_obj",
1968 sizeof(struct tmem_obj
), 0, 0, NULL
);
1969 ret
= zcache_new_client(LOCAL_CLIENT
);
1971 pr_err("zcache: can't create client\n");
1975 #ifdef CONFIG_CLEANCACHE
1976 if (zcache_enabled
&& use_cleancache
) {
1977 struct cleancache_ops old_ops
;
1980 register_shrinker(&zcache_shrinker
);
1981 old_ops
= zcache_cleancache_register_ops();
1982 pr_info("zcache: cleancache enabled using kernel "
1983 "transcendent memory and compression buddies\n");
1984 if (old_ops
.init_fs
!= NULL
)
1985 pr_warning("zcache: cleancache_ops overridden");
1988 #ifdef CONFIG_FRONTSWAP
1989 if (zcache_enabled
&& use_frontswap
) {
1990 struct frontswap_ops old_ops
;
1992 old_ops
= zcache_frontswap_register_ops();
1993 pr_info("zcache: frontswap enabled using kernel "
1994 "transcendent memory and xvmalloc\n");
1995 if (old_ops
.init
!= NULL
)
1996 pr_warning("ktmem: frontswap_ops overridden");
2003 module_init(zcache_init
)