/*
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 * Copyright (c) 2010,2011, Nitin Gupta
 *
 * Zcache provides an in-kernel "host implementation" for transcendent memory
 * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
 * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
 * 1) "compression buddies" ("zbud") is used for ephemeral pages
 * 2) xvmalloc is used for persistent pages.
 * Xvmalloc (based on the TLSF allocator) has very low fragmentation
 * so maximizes space efficiency, while zbud allows pairs (and potentially,
 * in the future, more than a pair of) compressed pages to be closely linked
 * so that reclaiming can be done via the kernel's physical-page-oriented
 * "shrinker" interface.
 *
 * [1] For a definition of page-accessible memory (aka PAM), see:
 *     http://marc.info/?l=linux-mm&m=127811271605009
 *
 * TODO:
 * - handle remotifying of buddied pages (see zbud_remotify_zbpg)
 * - kernel boot params: nocleancache/nofrontswap don't always work?!?
 */
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/lzo.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/math64.h>
#include "cluster/tcp.h"
#include "xvmalloc.h"	/* temporary until change to zsmalloc */

#define RAMSTER_TESTING
#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
#error "ramster is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
#endif
#ifdef CONFIG_CLEANCACHE
#include <linux/cleancache.h>
#endif
#ifdef CONFIG_FRONTSWAP
#include <linux/frontswap.h>
#endif
enum ramster_remotify_op {
	RAMSTER_REMOTIFY_EPH_PUT,
	RAMSTER_REMOTIFY_PERS_PUT,
	RAMSTER_REMOTIFY_FLUSH_PAGE,
	RAMSTER_REMOTIFY_FLUSH_OBJ,
	RAMSTER_INTRANSIT_PERS
};

struct ramster_remotify_hdr {
	enum ramster_remotify_op op;
	struct list_head list;
};
#define ZBH_SENTINEL  0x43214321
#define ZBPG_SENTINEL  0xdeadbeef

#define ZBUD_MAX_BUDS 2

struct zbud_hdr {
	struct ramster_remotify_hdr rem_op;
	uint16_t client_id;
	uint16_t pool_id;
	struct tmem_oid oid;
	uint32_t index;
	uint16_t size;	/* compressed size in bytes, zero means unused */
	DECL_SENTINEL
};
#define ZVH_SENTINEL  0x43214321
static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;

struct zv_hdr {
	struct ramster_remotify_hdr rem_op;
	uint16_t client_id;
	uint16_t pool_id;
	struct tmem_oid oid;
	uint32_t index;
	DECL_SENTINEL
};
struct flushlist_node {
	struct ramster_remotify_hdr rem_op;
	struct tmem_xhandle xh;
};

union remotify_list_node {
	struct ramster_remotify_hdr rem_op;
	struct zbud_hdr zbud;
	struct flushlist_node flist;
};

static LIST_HEAD(zcache_rem_op_list);
static DEFINE_SPINLOCK(zcache_rem_op_list_lock);
#if 0
/* this is more aggressive but may cause other problems? */
#define ZCACHE_GFP_MASK	(GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
#else
#define ZCACHE_GFP_MASK \
	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
#endif

#define MAX_POOLS_PER_CLIENT 16

#define MAX_CLIENTS 16
#define LOCAL_CLIENT ((uint16_t)-1)

MODULE_LICENSE("GPL");
struct zcache_client {
	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
	struct xv_pool *xvpool;
	atomic_t refcount;
};

static struct zcache_client zcache_host;
static struct zcache_client zcache_clients[MAX_CLIENTS];
static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
{
	if (cli == &zcache_host)
		return LOCAL_CLIENT;
	return cli - &zcache_clients[0];
}

static inline bool is_local_client(struct zcache_client *cli)
{
	return cli == &zcache_host;
}
/*
 * Compression buddies ("zbud") provides for packing two (or, possibly
 * in the future, more) compressed ephemeral pages into a single "raw"
 * (physical) page and tracking them with data structures so that
 * the raw pages can be easily reclaimed.
 *
 * A zbud page ("zbpg") is an aligned page containing a list_head,
 * a lock, and two "zbud headers".  The remainder of the physical
 * page is divided up into aligned 64-byte "chunks" which contain
 * the compressed data for zero, one, or two zbuds.  Each zbpg
 * resides on: (1) an "unused list" if it has no zbuds; (2) a
 * "buddied" list if it is fully populated with two zbuds; or
 * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
 * the one unbuddied zbud uses.  The data inside a zbpg cannot be
 * read or written unless the zbpg's lock is held.
 */
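/*
 * Illustrative layout of a zbpg, assuming the usual 4KB page and 64-byte
 * chunks (an example sketch, not taken from a specific configuration):
 *
 *   +-------------------------+-------------------- ~ -------------------+
 *   | struct zbud_page header | chunk 1 | chunk 2 | ...  | chunk NCHUNKS |
 *   +-------------------------+-------------------- ~ -------------------+
 *
 * Buddy 0's compressed data is placed at the first chunk boundary after
 * the header (see zbud_data() below); buddy 1's data is placed against
 * the end of the page, so any free chunks form one contiguous gap in the
 * middle of the page.
 */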
struct zbud_page {
	struct list_head bud_list;
	spinlock_t lock;
	struct zbud_hdr buddy[ZBUD_MAX_BUDS];
	DECL_SENTINEL
	/* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
};
#define CHUNK_SHIFT	6
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define CHUNK_MASK	(~(CHUNK_SIZE-1))
#define NCHUNKS		(((PAGE_SIZE - sizeof(struct zbud_page)) & \
				CHUNK_MASK) >> CHUNK_SHIFT)
#define MAX_CHUNK	(NCHUNKS-1)
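/*
 * Example of the chunk arithmetic above, assuming PAGE_SIZE == 4096 and a
 * struct zbud_page header of no more than 64 bytes: CHUNK_SIZE is 64,
 * CHUNK_MASK is ~63, and NCHUNKS is ((4096 - 64) & ~63) >> 6 == 63, so
 * MAX_CHUNK == 62.  A larger header simply costs one more chunk.
 */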
static struct {
	struct list_head list;
	unsigned count;
} zbud_unbuddied[NCHUNKS];
/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static unsigned long zbud_cumul_chunk_counts[NCHUNKS];

struct list_head zbud_buddied_list;
static unsigned long zcache_zbud_buddied_count;

/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_budlists_spinlock);
static atomic_t zcache_zbud_curr_raw_pages;
static atomic_t zcache_zbud_curr_zpages;
static unsigned long zcache_zbud_curr_zbytes;
static unsigned long zcache_zbud_cumul_zpages;
static unsigned long zcache_zbud_cumul_zbytes;
static unsigned long zcache_compress_poor;
static unsigned long zcache_policy_percent_exceeded;
static unsigned long zcache_mean_compress_poor;
/*
 * - Remote pages are pages with a local pampd but the data is remote
 * - Foreign pages are pages stored locally but belonging to another node
 */
static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
static unsigned long ramster_pers_remotify_enable;
static unsigned long ramster_eph_remotify_enable;
static unsigned long ramster_eph_pages_remoted;
static unsigned long ramster_eph_pages_remote_failed;
static unsigned long ramster_pers_pages_remoted;
static unsigned long ramster_pers_pages_remote_failed;
static unsigned long ramster_pers_pages_remote_nomem;
static unsigned long ramster_remote_objects_flushed;
static unsigned long ramster_remote_object_flushes_failed;
static unsigned long ramster_remote_pages_flushed;
static unsigned long ramster_remote_page_flushes_failed;
static unsigned long ramster_remote_eph_pages_succ_get;
static unsigned long ramster_remote_pers_pages_succ_get;
static unsigned long ramster_remote_eph_pages_unsucc_get;
static unsigned long ramster_remote_pers_pages_unsucc_get;
static atomic_t ramster_curr_flnode_count = ATOMIC_INIT(0);
static unsigned long ramster_curr_flnode_count_max;
static atomic_t ramster_foreign_eph_pampd_count = ATOMIC_INIT(0);
static unsigned long ramster_foreign_eph_pampd_count_max;
static atomic_t ramster_foreign_pers_pampd_count = ATOMIC_INIT(0);
static unsigned long ramster_foreign_pers_pampd_count_max;
/* forward references */
static void *zcache_get_free_page(void);
static void zcache_free_page(void *p);
/*
 * zbud helper functions
 */

static inline unsigned zbud_max_buddy_size(void)
{
	return MAX_CHUNK << CHUNK_SHIFT;
}

static inline unsigned zbud_size_to_chunks(unsigned size)
{
	BUG_ON(size == 0 || size > zbud_max_buddy_size());
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
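/*
 * Example: a page that compressed to 1000 bytes occupies
 * zbud_size_to_chunks(1000) == (1000 + 63) >> 6 == 16 chunks,
 * i.e. 1024 bytes of chunk space in its zbud page.
 */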
static inline int zbud_budnum(struct zbud_hdr *zh)
{
	unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
	struct zbud_page *zbpg = NULL;
	unsigned budnum = -1U;
	int i;

	for (i = 0; i < ZBUD_MAX_BUDS; i++)
		if (offset == offsetof(typeof(*zbpg), buddy[i])) {
			budnum = i;
			break;
		}
	BUG_ON(budnum == -1U);
	return budnum;
}
static char *zbud_data(struct zbud_hdr *zh, unsigned size)
{
	struct zbud_page *zbpg;
	char *p;
	unsigned budnum;

	ASSERT_SENTINEL(zh, ZBH);
	budnum = zbud_budnum(zh);
	BUG_ON(size == 0 || size > zbud_max_buddy_size());
	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	ASSERT_SPINLOCK(&zbpg->lock);
	p = (char *)zbpg;
	if (budnum == 0)
		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
							CHUNK_MASK);
	else if (budnum == 1)
		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
	return p;
}
static void zbud_copy_from_pampd(char *data, size_t *size, struct zbud_hdr *zh)
{
	struct zbud_page *zbpg;
	char *p;
	unsigned budnum;

	ASSERT_SENTINEL(zh, ZBH);
	budnum = zbud_budnum(zh);
	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	spin_lock(&zbpg->lock);
	BUG_ON(zh->size > *size);
	p = (char *)zbpg;
	if (budnum == 0)
		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
							CHUNK_MASK);
	else if (budnum == 1)
		p += PAGE_SIZE - ((zh->size + CHUNK_SIZE - 1) & CHUNK_MASK);
	/* client should be filled in by caller */
	memcpy(data, p, zh->size);
	*size = zh->size;
	spin_unlock(&zbpg->lock);
}
/*
 * zbud raw page management
 */

static struct zbud_page *zbud_alloc_raw_page(void)
{
	struct zbud_page *zbpg = NULL;
	struct zbud_hdr *zh0, *zh1;

	zbpg = zcache_get_free_page();
	if (likely(zbpg != NULL)) {
		INIT_LIST_HEAD(&zbpg->bud_list);
		zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
		spin_lock_init(&zbpg->lock);
		atomic_inc(&zcache_zbud_curr_raw_pages);
		INIT_LIST_HEAD(&zbpg->bud_list);
		SET_SENTINEL(zbpg, ZBPG);
		zh0->size = 0; zh1->size = 0;
		tmem_oid_set_invalid(&zh0->oid);
		tmem_oid_set_invalid(&zh1->oid);
	}
	return zbpg;
}
static void zbud_free_raw_page(struct zbud_page *zbpg)
{
	struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];

	ASSERT_SENTINEL(zbpg, ZBPG);
	BUG_ON(!list_empty(&zbpg->bud_list));
	ASSERT_SPINLOCK(&zbpg->lock);
	BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
	BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
	INVERT_SENTINEL(zbpg, ZBPG);
	spin_unlock(&zbpg->lock);
	atomic_dec(&zcache_zbud_curr_raw_pages);
	zcache_free_page(zbpg);
}
/*
 * core zbud handling routines
 */

static unsigned zbud_free(struct zbud_hdr *zh)
{
	unsigned size = zh->size;

	ASSERT_SENTINEL(zh, ZBH);
	BUG_ON(!tmem_oid_valid(&zh->oid));
	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
	zh->size = 0;
	tmem_oid_set_invalid(&zh->oid);
	INVERT_SENTINEL(zh, ZBH);
	zcache_zbud_curr_zbytes -= size;
	atomic_dec(&zcache_zbud_curr_zpages);
	return size;
}
static void zbud_free_and_delist(struct zbud_hdr *zh)
{
	unsigned chunks;
	struct zbud_hdr *zh_other;
	unsigned budnum = zbud_budnum(zh), size;
	struct zbud_page *zbpg =
		container_of(zh, struct zbud_page, buddy[budnum]);

	/* FIXME, should be BUG_ON, pool destruction path doesn't disable
	 * interrupts tmem_destroy_pool()->tmem_pampd_destroy_all_in_obj()->
	 * tmem_objnode_node_destroy()-> zcache_pampd_free() */
	WARN_ON(!irqs_disabled());
	spin_lock(&zbpg->lock);
	if (list_empty(&zbpg->bud_list)) {
		/* ignore zombie page... see zbud_evict_pages() */
		spin_unlock(&zbpg->lock);
		return;
	}
	size = zbud_free(zh);
	ASSERT_SPINLOCK(&zbpg->lock);
	zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
	if (zh_other->size == 0) { /* was unbuddied: unlist and free */
		chunks = zbud_size_to_chunks(size);
		spin_lock(&zbud_budlists_spinlock);
		BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
		list_del_init(&zbpg->bud_list);
		zbud_unbuddied[chunks].count--;
		spin_unlock(&zbud_budlists_spinlock);
		zbud_free_raw_page(zbpg);
	} else { /* was buddied: move remaining buddy to unbuddied list */
		chunks = zbud_size_to_chunks(zh_other->size);
		spin_lock(&zbud_budlists_spinlock);
		list_del_init(&zbpg->bud_list);
		zcache_zbud_buddied_count--;
		list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
		zbud_unbuddied[chunks].count++;
		spin_unlock(&zbud_budlists_spinlock);
		spin_unlock(&zbpg->lock);
	}
}
static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
					struct tmem_oid *oid,
					uint32_t index, struct page *page,
					void *cdata, unsigned size)
{
	struct zbud_hdr *zh0, *zh1, *zh = NULL;
	struct zbud_page *zbpg = NULL, *ztmp;
	unsigned nchunks;
	char *to;
	int i, found_good_buddy = 0;

	nchunks = zbud_size_to_chunks(size);
	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
		spin_lock(&zbud_budlists_spinlock);
		if (!list_empty(&zbud_unbuddied[i].list)) {
			list_for_each_entry_safe(zbpg, ztmp,
				    &zbud_unbuddied[i].list, bud_list) {
				if (spin_trylock(&zbpg->lock)) {
					found_good_buddy = i;
					goto found_unbuddied;
				}
			}
		}
		spin_unlock(&zbud_budlists_spinlock);
	}
	/* didn't find a good buddy, try allocating a new page */
	zbpg = zbud_alloc_raw_page();
	if (unlikely(zbpg == NULL))
		goto out;
	/* ok, have a page, now compress the data before taking locks */
	spin_lock(&zbud_budlists_spinlock);
	spin_lock(&zbpg->lock);
	list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
	zbud_unbuddied[nchunks].count++;
	zh = &zbpg->buddy[0];
	goto init_zh;

found_unbuddied:
	ASSERT_SPINLOCK(&zbpg->lock);
	zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
	BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
	if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
		ASSERT_SENTINEL(zh0, ZBH);
		zh = zh1;
	} else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
		ASSERT_SENTINEL(zh1, ZBH);
		zh = zh0;
	} else
		BUG();
	list_del_init(&zbpg->bud_list);
	zbud_unbuddied[found_good_buddy].count--;
	list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
	zcache_zbud_buddied_count++;

init_zh:
	SET_SENTINEL(zh, ZBH);
	zh->size = size;
	zh->index = index;
	zh->oid = *oid;
	zh->pool_id = pool_id;
	zh->client_id = client_id;
	to = zbud_data(zh, size);
	memcpy(to, cdata, size);
	spin_unlock(&zbpg->lock);
	spin_unlock(&zbud_budlists_spinlock);
	zbud_cumul_chunk_counts[nchunks]++;
	atomic_inc(&zcache_zbud_curr_zpages);
	zcache_zbud_cumul_zpages++;
	zcache_zbud_curr_zbytes += size;
	zcache_zbud_cumul_zbytes += size;
out:
	return zh;
}
static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
{
	struct zbud_page *zbpg;
	unsigned budnum = zbud_budnum(zh);
	size_t out_len = PAGE_SIZE;
	char *to_va, *from_va;
	unsigned size;
	int ret = 0;

	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
	spin_lock(&zbpg->lock);
	if (list_empty(&zbpg->bud_list)) {
		/* ignore zombie page... see zbud_evict_pages() */
		ret = -EINVAL;
		goto out;
	}
	ASSERT_SENTINEL(zh, ZBH);
	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
	to_va = kmap_atomic(page);
	size = zh->size;
	from_va = zbud_data(zh, size);
	ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
	BUG_ON(ret != LZO_E_OK);
	BUG_ON(out_len != PAGE_SIZE);
	kunmap_atomic(to_va);
	ret = 0;
out:
	spin_unlock(&zbpg->lock);
	return ret;
}
/*
 * The following routines handle shrinking of ephemeral pages by evicting
 * pages "least valuable" first.
 */

static unsigned long zcache_evicted_raw_pages;
static unsigned long zcache_evicted_buddied_pages;
static unsigned long zcache_evicted_unbuddied_pages;

static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
						uint16_t poolid);
static void zcache_put_pool(struct tmem_pool *pool);

/*
 * Flush and free all zbuds in a zbpg, then free the pageframe
 */
static void zbud_evict_zbpg(struct zbud_page *zbpg)
{
	struct zbud_hdr *zh;
	int i, j;
	uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
	uint32_t index[ZBUD_MAX_BUDS];
	struct tmem_oid oid[ZBUD_MAX_BUDS];
	struct tmem_pool *pool;
	unsigned long flags;

	ASSERT_SPINLOCK(&zbpg->lock);
	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
		zh = &zbpg->buddy[i];
		if (zh->size) {
			client_id[j] = zh->client_id;
			pool_id[j] = zh->pool_id;
			oid[j] = zh->oid;
			index[j] = zh->index;
			j++;
		}
	}
	spin_unlock(&zbpg->lock);
	for (i = 0; i < j; i++) {
		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
		BUG_ON(pool == NULL);
		local_irq_save(flags);
		/* these flushes should dispose of any local storage */
		tmem_flush_page(pool, &oid[i], index[i]);
		local_irq_restore(flags);
		zcache_put_pool(pool);
	}
}
/*
 * Free nr pages.  This code is funky because we want to hold the locks
 * protecting various lists for as short a time as possible, and in some
 * circumstances the list may change asynchronously when the list lock is
 * not held.  In some cases we also trylock not only to avoid waiting on a
 * page in use by another cpu, but also to avoid potential deadlock due to
 * lock inversion.
 */
static void zbud_evict_pages(int nr)
{
	struct zbud_page *zbpg;
	int i, newly_unused_pages = 0;

	/* now try freeing unbuddied pages, starting with least space avail */
	for (i = 0; i < MAX_CHUNK; i++) {
retry_unbud_list_i:
		spin_lock_bh(&zbud_budlists_spinlock);
		if (list_empty(&zbud_unbuddied[i].list)) {
			spin_unlock_bh(&zbud_budlists_spinlock);
			continue;
		}
		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
			if (unlikely(!spin_trylock(&zbpg->lock)))
				continue;
			zbud_unbuddied[i].count--;
			spin_unlock(&zbud_budlists_spinlock);
			zcache_evicted_unbuddied_pages++;
			/* want budlists unlocked when doing zbpg eviction */
			zbud_evict_zbpg(zbpg);
			newly_unused_pages++;
			if (--nr <= 0)
				goto out;
			goto retry_unbud_list_i;
		}
		spin_unlock_bh(&zbud_budlists_spinlock);
	}

	/* as a last resort, free buddied pages */
retry_bud_list:
	spin_lock_bh(&zbud_budlists_spinlock);
	if (list_empty(&zbud_buddied_list)) {
		spin_unlock_bh(&zbud_budlists_spinlock);
		goto out;
	}
	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
		if (unlikely(!spin_trylock(&zbpg->lock)))
			continue;
		zcache_zbud_buddied_count--;
		spin_unlock(&zbud_budlists_spinlock);
		zcache_evicted_buddied_pages++;
		/* want budlists unlocked when doing zbpg eviction */
		zbud_evict_zbpg(zbpg);
		newly_unused_pages++;
		if (--nr <= 0)
			goto out;
		goto retry_bud_list;
	}
	spin_unlock_bh(&zbud_budlists_spinlock);
out:
	;
}
static DEFINE_PER_CPU(unsigned char *, zcache_remoteputmem);

static int zbud_remotify_zbud(struct tmem_xhandle *xh, char *data,
				size_t size)
{
	struct tmem_pool *pool;
	int i, remotenode, ret = -1;
	unsigned char cksum, *p;
	unsigned long flags;

	for (p = data, cksum = 0, i = 0; i < size; i++)
		cksum += *p++;
	ret = ramster_remote_put(xh, data, size, true, &remotenode);
	if (ret == 0) {
		/* data was successfully remoted so change the local version
		 * to point to the remote node where it landed */
		pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh->pool_id);
		BUG_ON(pool == NULL);
		local_irq_save(flags);
		/* tmem_replace will also free up any local space */
		(void)tmem_replace(pool, &xh->oid, xh->index,
				pampd_make_remote(remotenode, size, cksum));
		local_irq_restore(flags);
		zcache_put_pool(pool);
		ramster_eph_pages_remoted++;
	} else
		ramster_eph_pages_remote_failed++;
	return ret;
}
static int zbud_remotify_zbpg(struct zbud_page *zbpg)
{
	struct zbud_hdr *zh1, *zh2 = NULL;
	struct tmem_xhandle xh1, xh2 = { 0 };
	char *data1 = NULL, *data2 = NULL;
	size_t size1 = 0, size2 = 0;
	int ret = 0;
	unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);

	ASSERT_SPINLOCK(&zbpg->lock);
	if (zbpg->buddy[0].size == 0)
		zh1 = &zbpg->buddy[1];
	else if (zbpg->buddy[1].size == 0)
		zh1 = &zbpg->buddy[0];
	else {
		zh1 = &zbpg->buddy[0];
		zh2 = &zbpg->buddy[1];
	}
	/* don't remotify pages that are already remotified */
	if (zh1->client_id != LOCAL_CLIENT)
		zh1 = NULL;
	if ((zh2 != NULL) && (zh2->client_id != LOCAL_CLIENT))
		zh2 = NULL;

	/* copy the data and metadata so can release lock */
	if (zh1 != NULL) {
		xh1.client_id = zh1->client_id;
		xh1.pool_id = zh1->pool_id;
		xh1.oid = zh1->oid;
		xh1.index = zh1->index;
		size1 = zh1->size;
		data1 = zbud_data(zh1, size1);
		memcpy(tmpmem, zbud_data(zh1, size1), size1);
		data1 = tmpmem;
		tmpmem += size1;
	}
	if (zh2 != NULL) {
		xh2.client_id = zh2->client_id;
		xh2.pool_id = zh2->pool_id;
		xh2.oid = zh2->oid;
		xh2.index = zh2->index;
		size2 = zh2->size;
		memcpy(tmpmem, zbud_data(zh2, size2), size2);
		data2 = tmpmem;
	}
	spin_unlock(&zbpg->lock);
	preempt_enable();

	/* OK, no locks held anymore, remotify one or both zbuds */
	if (zh1 != NULL)
		ret = zbud_remotify_zbud(&xh1, data1, size1);
	if (zh2 != NULL)
		ret |= zbud_remotify_zbud(&xh2, data2, size2);
	return ret;
}
void zbud_remotify_pages(int nr)
{
	struct zbud_page *zbpg;
	int i, ret;

	/*
	 * for now just try remotifying unbuddied pages, starting with
	 */
	for (i = 0; i < MAX_CHUNK; i++) {
retry_unbud_list_i:
		preempt_disable();  /* enable in zbud_remotify_zbpg */
		spin_lock_bh(&zbud_budlists_spinlock);
		if (list_empty(&zbud_unbuddied[i].list)) {
			spin_unlock_bh(&zbud_budlists_spinlock);
			preempt_enable();
			continue; /* next i in for loop */
		}
		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
			if (unlikely(!spin_trylock(&zbpg->lock)))
				continue; /* next list_for_each_entry */
			zbud_unbuddied[i].count--;
			/* want budlists unlocked when doing zbpg remotify */
			spin_unlock_bh(&zbud_budlists_spinlock);
			ret = zbud_remotify_zbpg(zbpg);
			/* preemption is re-enabled in zbud_remotify_zbpg */
			if (ret == 0) {
				if (--nr <= 0)
					goto out;
				goto retry_unbud_list_i;
			}
			/* if fail to remotify any page, quit */
			pr_err("TESTING zbud_remotify_pages failed on page,"
				" trying to re-add\n");
			spin_lock_bh(&zbud_budlists_spinlock);
			spin_lock(&zbpg->lock);
			list_add_tail(&zbpg->bud_list, &zbud_unbuddied[i].list);
			zbud_unbuddied[i].count++;
			spin_unlock(&zbpg->lock);
			spin_unlock_bh(&zbud_budlists_spinlock);
			pr_err("TESTING zbud_remotify_pages failed on page,"
				" finished re-add\n");
			goto out;
		}
		spin_unlock_bh(&zbud_budlists_spinlock);
		preempt_enable();
	}

next_buddied_zbpg:
	preempt_disable();  /* enable in zbud_remotify_zbpg */
	spin_lock_bh(&zbud_budlists_spinlock);
	if (list_empty(&zbud_buddied_list)) {
		spin_unlock_bh(&zbud_budlists_spinlock);
		preempt_enable();
		goto out;
	}
	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
		if (unlikely(!spin_trylock(&zbpg->lock)))
			continue; /* next list_for_each_entry */
		zcache_zbud_buddied_count--;
		/* want budlists unlocked when doing zbpg remotify */
		spin_unlock_bh(&zbud_budlists_spinlock);
		ret = zbud_remotify_zbpg(zbpg);
		/* preemption is re-enabled in zbud_remotify_zbpg */
		if (ret == 0) {
			if (--nr <= 0)
				goto out;
			goto next_buddied_zbpg;
		}
		/* if fail to remotify any page, quit */
		pr_err("TESTING zbud_remotify_pages failed on BUDDIED page,"
			" trying to re-add\n");
		spin_lock_bh(&zbud_budlists_spinlock);
		spin_lock(&zbpg->lock);
		list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
		zcache_zbud_buddied_count++;
		spin_unlock(&zbpg->lock);
		spin_unlock_bh(&zbud_budlists_spinlock);
		pr_err("TESTING zbud_remotify_pages failed on BUDDIED page,"
			" finished re-add\n");
		goto out;
	}
	spin_unlock_bh(&zbud_budlists_spinlock);
	preempt_enable();
out:
	;
}
/* the "flush list" asynchronously collects pages to remotely flush */
#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)
static void ramster_flnode_free(struct flushlist_node *,
				struct tmem_pool *);

static void zcache_remote_flush_page(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = ramster_remote_flush(xh, remotenode);
	if (ret >= 0)
		ramster_remote_pages_flushed++;
	else
		ramster_remote_page_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}

static void zcache_remote_flush_object(struct flushlist_node *flnode)
{
	struct tmem_xhandle *xh;
	int remotenode, ret;

	preempt_disable();
	xh = &flnode->xh;
	remotenode = flnode->xh.client_id;
	ret = ramster_remote_flush_object(xh, remotenode);
	if (ret >= 0)
		ramster_remote_objects_flushed++;
	else
		ramster_remote_object_flushes_failed++;
	preempt_enable_no_resched();
	ramster_flnode_free(flnode, NULL);
}
static void zcache_remote_eph_put(struct zbud_hdr *zbud)
{
	/* FIXME */
}
static void zcache_remote_pers_put(struct zv_hdr *zv)
{
	struct tmem_xhandle xh;
	uint16_t size;
	bool ephemeral;
	int remotenode, ret = -1;
	int i;
	struct tmem_pool *pool;
	unsigned long flags;
	unsigned char cksum;
	char *p;
	char *data;
	unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);

	ASSERT_SENTINEL(zv, ZVH);
	BUG_ON(zv->client_id != LOCAL_CLIENT);
	xh.client_id = zv->client_id;
	xh.pool_id = zv->pool_id;
	xh.oid = zv->oid;
	xh.index = zv->index;
	size = xv_get_object_size(zv) - sizeof(*zv);
	BUG_ON(size == 0 || size > zv_max_page_size);
	data = (char *)zv + sizeof(*zv);
	for (p = data, cksum = 0, i = 0; i < size; i++)
		cksum += *p++;
	memcpy(tmpmem, data, size);
	data = tmpmem;
	pool = zcache_get_pool_by_id(zv->client_id, zv->pool_id);
	ephemeral = is_ephemeral(pool);
	zcache_put_pool(pool);
	/* now OK to release lock set in caller */
	spin_unlock(&zcache_rem_op_list_lock);
	ret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode);
	preempt_enable_no_resched();
	if (ret != 0) {
		/*
		 * This is some form of a memory leak... if the remote put
		 * fails, there will never be another attempt to remotify
		 * this page.  But since we've dropped the zv pointer,
		 * the page may have been freed or the data replaced
		 * so we can't just "put it back" in the remote op list.
		 * Even if we could, not sure where to put it in the list
		 * because there may be flushes that must be strictly
		 * ordered vs the put.  So leave this as a FIXME for now.
		 * But count them so we know if it becomes a problem.
		 */
		ramster_pers_pages_remote_failed++;
		goto out;
	}
	atomic_inc(&ramster_remote_pers_pages);
	ramster_pers_pages_remoted++;
	/*
	 * data was successfully remoted so change the local version to
	 * point to the remote node where it landed
	 */
	pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
	local_irq_save(flags);
	(void)tmem_replace(pool, &xh.oid, xh.index,
			pampd_make_remote(remotenode, size, cksum));
	local_irq_restore(flags);
	zcache_put_pool(pool);
out:
	return;
}
static void zcache_do_remotify_ops(int nr)
{
	struct ramster_remotify_hdr *rem_op;
	union remotify_list_node *u;
	int i;

	for (i = 0; i < nr; i++) {
		spin_lock(&zcache_rem_op_list_lock);
		if (list_empty(&zcache_rem_op_list)) {
			spin_unlock(&zcache_rem_op_list_lock);
			break;
		}
		rem_op = list_first_entry(&zcache_rem_op_list,
				struct ramster_remotify_hdr, list);
		list_del_init(&rem_op->list);
		if (rem_op->op != RAMSTER_REMOTIFY_PERS_PUT)
			spin_unlock(&zcache_rem_op_list_lock);
		u = (union remotify_list_node *)rem_op;
		switch (rem_op->op) {
		case RAMSTER_REMOTIFY_EPH_PUT:
			zcache_remote_eph_put((struct zbud_hdr *)rem_op);
			break;
		case RAMSTER_REMOTIFY_PERS_PUT:
			zcache_remote_pers_put((struct zv_hdr *)rem_op);
			break;
		case RAMSTER_REMOTIFY_FLUSH_PAGE:
			zcache_remote_flush_page((struct flushlist_node *)u);
			break;
		case RAMSTER_REMOTIFY_FLUSH_OBJ:
			zcache_remote_flush_object((struct flushlist_node *)u);
			break;
		default:
			BUG();
		}
	}
}
/*
 * Communicate interface revision with userspace
 */
#include "cluster/ramster_nodemanager.h"
static unsigned long ramster_interface_revision = R2NM_API_VERSION;

/*
 * For now, just push over a few pages every few seconds to
 * ensure that it basically works
 */
static struct workqueue_struct *ramster_remotify_workqueue;
static void ramster_remotify_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(ramster_remotify_worker,
		ramster_remotify_process);

static void ramster_remotify_queue_delayed_work(unsigned long delay)
{
	if (!queue_delayed_work(ramster_remotify_workqueue,
				&ramster_remotify_worker, delay))
		pr_err("ramster_remotify: bad workqueue\n");
}
static int use_frontswap;
static int use_cleancache;
static int ramster_remote_target_nodenum = -1;
static void ramster_remotify_process(struct work_struct *work)
{
	static bool remotify_in_progress;

	BUG_ON(irqs_disabled());
	if (remotify_in_progress)
		ramster_remotify_queue_delayed_work(HZ);
	else if (ramster_remote_target_nodenum != -1) {
		remotify_in_progress = true;
#ifdef CONFIG_CLEANCACHE
		if (use_cleancache && ramster_eph_remotify_enable)
			zbud_remotify_pages(5000); /* FIXME is this a good number? */
#endif
#ifdef CONFIG_FRONTSWAP
		if (use_frontswap && ramster_pers_remotify_enable)
			zcache_do_remotify_ops(500); /* FIXME is this a good number? */
#endif
		remotify_in_progress = false;
		ramster_remotify_queue_delayed_work(HZ);
	}
}
static void ramster_remotify_init(void)
{
	unsigned long n = 60UL;
	ramster_remotify_workqueue =
		create_singlethread_workqueue("ramster_remotify");
	ramster_remotify_queue_delayed_work(n * HZ);
}
static void zbud_init(void)
{
	int i;

	INIT_LIST_HEAD(&zbud_buddied_list);
	zcache_zbud_buddied_count = 0;
	for (i = 0; i < NCHUNKS; i++) {
		INIT_LIST_HEAD(&zbud_unbuddied[i].list);
		zbud_unbuddied[i].count = 0;
	}
}
/*
 * These sysfs routines show a nice distribution of how many zbpg's are
 * currently (and have ever been placed) in each unbuddied list.  It's fun
 * to watch but can probably go away before final merge.
 */
static int zbud_show_unbuddied_list_counts(char *buf)
{
	int i;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++)
		p += sprintf(p, "%u ", zbud_unbuddied[i].count);
	return p - buf;
}
static int zbud_show_cumul_chunk_counts(char *buf)
{
	unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
	unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
	unsigned long total_chunks_lte_42 = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
		chunks += zbud_cumul_chunk_counts[i];
		total_chunks += zbud_cumul_chunk_counts[i];
		sum_total_chunks += i * zbud_cumul_chunk_counts[i];
		if (i == 21)
			total_chunks_lte_21 = total_chunks;
		if (i == 32)
			total_chunks_lte_32 = total_chunks;
		if (i == 42)
			total_chunks_lte_42 = total_chunks;
	}
	p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
		total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}
/*
 * This "zv" PAM implementation combines the TLSF-based xvMalloc
 * with lzo1x compression to maximize the amount of data that can
 * be packed into a physical page.
 *
 * Zv represents a PAM page with the index and object (plus a "size" value
 * necessary for decompression) immediately preceding the compressed data.
 */
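/*
 * Sketch of a zv allocation as implemented by zv_create() below: xvmalloc
 * returns clen + sizeof(struct zv_hdr) bytes inside some physical page,
 * laid out as [ struct zv_hdr | clen bytes of lzo1x-compressed data ],
 * and the allocation is accounted in the zv_*_dist_counts[] bucket
 * (clen + sizeof(struct zv_hdr) + CHUNK_SIZE - 1) >> CHUNK_SHIFT.
 */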
/* rudimentary policy limits */
/* total number of persistent pages may not exceed this percentage */
static unsigned int zv_page_count_policy_percent = 75;
/*
 * byte count defining poor compression; pages with greater zsize will be
 * rejected
 */
static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
/*
 * byte count defining poor *mean* compression; pages with greater zsize
 * will be rejected until sufficient better-compressed pages are accepted
 * driving the mean below this threshold
 */
static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
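/*
 * Concrete values, assuming 4KB pages: zv_max_zsize is (4096/8)*7 == 3584
 * bytes and zv_max_mean_zsize is (4096/8)*5 == 2560 bytes.  A swap page
 * whose compressed size is 3000 bytes is therefore accepted only while
 * the running mean compressed size stays at or below 2560 bytes.
 */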
static atomic_t zv_curr_dist_counts[NCHUNKS];
static atomic_t zv_cumul_dist_counts[NCHUNKS];
static struct zv_hdr *zv_create(struct zcache_client *cli, uint32_t pool_id,
				struct tmem_oid *oid, uint32_t index,
				void *cdata, unsigned clen)
{
	struct page *page;
	struct zv_hdr *zv = NULL;
	uint32_t offset;
	int alloc_size = clen + sizeof(struct zv_hdr);
	int chunks = (alloc_size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
	int ret;

	BUG_ON(!irqs_disabled());
	BUG_ON(chunks >= NCHUNKS);
	ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr),
			&page, &offset, ZCACHE_GFP_MASK);
	if (unlikely(ret))
		goto out;
	atomic_inc(&zv_curr_dist_counts[chunks]);
	atomic_inc(&zv_cumul_dist_counts[chunks]);
	zv = kmap_atomic(page) + offset;
	zv->index = index;
	zv->oid = *oid;
	zv->pool_id = pool_id;
	SET_SENTINEL(zv, ZVH);
	INIT_LIST_HEAD(&zv->rem_op.list);
	zv->client_id = get_client_id_from_client(cli);
	zv->rem_op.op = RAMSTER_REMOTIFY_PERS_PUT;
	if (zv->client_id == LOCAL_CLIENT) {
		spin_lock(&zcache_rem_op_list_lock);
		list_add_tail(&zv->rem_op.list, &zcache_rem_op_list);
		spin_unlock(&zcache_rem_op_list_lock);
	}
	memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
	kunmap_atomic(zv);
out:
	return zv;
}

/* similar to zv_create, but just reserve space, no data yet */
1131 /* similar to zv_create, but just reserve space, no data yet */
1132 static struct zv_hdr
*zv_alloc(struct tmem_pool
*pool
,
1133 struct tmem_oid
*oid
, uint32_t index
,
1136 struct zcache_client
*cli
= pool
->client
;
1138 struct zv_hdr
*zv
= NULL
;
1142 BUG_ON(!irqs_disabled());
1143 BUG_ON(!is_local_client(pool
->client
));
1144 ret
= xv_malloc(cli
->xvpool
, clen
+ sizeof(struct zv_hdr
),
1145 &page
, &offset
, ZCACHE_GFP_MASK
);
1148 zv
= kmap_atomic(page
) + offset
;
1149 SET_SENTINEL(zv
, ZVH
);
1150 INIT_LIST_HEAD(&zv
->rem_op
.list
);
1151 zv
->client_id
= LOCAL_CLIENT
;
1152 zv
->rem_op
.op
= RAMSTER_INTRANSIT_PERS
;
1155 zv
->pool_id
= pool
->pool_id
;
static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
{
	unsigned long flags;
	struct page *page;
	uint32_t offset;
	uint16_t size = xv_get_object_size(zv);
	int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;

	ASSERT_SENTINEL(zv, ZVH);
	BUG_ON(chunks >= NCHUNKS);
	atomic_dec(&zv_curr_dist_counts[chunks]);
	size -= sizeof(*zv);
	spin_lock(&zcache_rem_op_list_lock);
	size = xv_get_object_size(zv) - sizeof(*zv);
	INVERT_SENTINEL(zv, ZVH);
	if (!list_empty(&zv->rem_op.list))
		list_del_init(&zv->rem_op.list);
	spin_unlock(&zcache_rem_op_list_lock);
	page = virt_to_page(zv);
	offset = (unsigned long)zv & ~PAGE_MASK;
	local_irq_save(flags);
	xv_free(xvpool, page, offset);
	local_irq_restore(flags);
}
static void zv_decompress(struct page *page, struct zv_hdr *zv)
{
	size_t clen = PAGE_SIZE;
	char *to_va;
	unsigned size;
	int ret;

	ASSERT_SENTINEL(zv, ZVH);
	size = xv_get_object_size(zv) - sizeof(*zv);
	BUG_ON(size == 0);
	to_va = kmap_atomic(page);
	ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
					size, to_va, &clen);
	kunmap_atomic(to_va);
	BUG_ON(ret != LZO_E_OK);
	BUG_ON(clen != PAGE_SIZE);
}
static void zv_copy_from_pampd(char *data, size_t *bufsize, struct zv_hdr *zv)
{
	unsigned size;

	ASSERT_SENTINEL(zv, ZVH);
	size = xv_get_object_size(zv) - sizeof(*zv);
	BUG_ON(size == 0 || size > zv_max_page_size);
	BUG_ON(size > *bufsize);
	memcpy(data, (char *)zv + sizeof(*zv), size);
	*bufsize = size;
}
static void zv_copy_to_pampd(struct zv_hdr *zv, char *data, size_t size)
{
	unsigned zv_size;

	ASSERT_SENTINEL(zv, ZVH);
	zv_size = xv_get_object_size(zv) - sizeof(*zv);
	BUG_ON(zv_size != size);
	BUG_ON(zv_size == 0 || zv_size > zv_max_page_size);
	memcpy((char *)zv + sizeof(*zv), data, size);
}
/*
 * show a distribution of compression stats for zv pages.
 */

static int zv_curr_dist_counts_show(char *buf)
{
	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		n = atomic_read(&zv_curr_dist_counts[i]);
		p += sprintf(p, "%lu ", n);
		chunks += n;
		sum_total_chunks += i * n;
	}
	p += sprintf(p, "mean:%lu\n",
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}

static int zv_cumul_dist_counts_show(char *buf)
{
	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
	char *p = buf;

	for (i = 0; i < NCHUNKS; i++) {
		n = atomic_read(&zv_cumul_dist_counts[i]);
		p += sprintf(p, "%lu ", n);
		chunks += n;
		sum_total_chunks += i * n;
	}
	p += sprintf(p, "mean:%lu\n",
		chunks == 0 ? 0 : sum_total_chunks / chunks);
	return p - buf;
}
/*
 * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
 * pages that don't compress to less than this value (including metadata
 * overhead) to be rejected.  We don't allow the value to get too close
 * to PAGE_SIZE.
 */
static ssize_t zv_max_zsize_show(struct kobject *kobj,
				struct kobj_attribute *attr,
				char *buf)
{
	return sprintf(buf, "%u\n", zv_max_zsize);
}

static ssize_t zv_max_zsize_store(struct kobject *kobj,
				struct kobj_attribute *attr,
				const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
		return -EINVAL;
	zv_max_zsize = val;
	return count;
}
/*
 * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
 * pages that don't compress to less than this value (including metadata
 * overhead) to be rejected UNLESS the mean compression is also smaller
 * than this value.  In other words, we are load-balancing-by-zsize the
 * accepted pages.  Again, we don't allow the value to get too close
 * to PAGE_SIZE.
 */
static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
				struct kobj_attribute *attr,
				char *buf)
{
	return sprintf(buf, "%u\n", zv_max_mean_zsize);
}

static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
				struct kobj_attribute *attr,
				const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
		return -EINVAL;
	zv_max_mean_zsize = val;
	return count;
}
/*
 * setting zv_page_count_policy_percent via sysfs sets an upper bound of
 * persistent (e.g. swap) pages that will be retained according to:
 *     (zv_page_count_policy_percent * totalram_pages) / 100)
 * when that limit is reached, further puts will be rejected (until
 * some pages have been flushed).  Note that, due to compression,
 * this number may exceed 100; it defaults to 75 and we set an
 * arbitrary limit of 150.  A poor choice will almost certainly result
 * in OOM's, so this value should only be changed prudently.
 */
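/*
 * Example of the limit above: with the default of 75 and a machine with
 * 1,000,000 RAM pages (totalram_pages), at most
 * (75 * 1,000,000) / 100 == 750,000 compressed persistent pages are kept
 * before further frontswap puts are rejected.
 */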
static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
						struct kobj_attribute *attr,
						char *buf)
{
	return sprintf(buf, "%u\n", zv_page_count_policy_percent);
}

static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
						struct kobj_attribute *attr,
						const char *buf, size_t count)
{
	unsigned long val;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtoul(buf, 10, &val);
	if (err || (val == 0) || (val > 150))
		return -EINVAL;
	zv_page_count_policy_percent = val;
	return count;
}
static struct kobj_attribute zcache_zv_max_zsize_attr = {
	.attr = { .name = "zv_max_zsize", .mode = 0644 },
	.show = zv_max_zsize_show,
	.store = zv_max_zsize_store,
};

static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
	.attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
	.show = zv_max_mean_zsize_show,
	.store = zv_max_mean_zsize_store,
};

static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
	.attr = { .name = "zv_page_count_policy_percent",
		  .mode = 0644 },
	.show = zv_page_count_policy_percent_show,
	.store = zv_page_count_policy_percent_store,
};
/*
 * zcache core code starts here
 */

/* useful stats not collected by cleancache or frontswap */
static unsigned long zcache_flush_total;
static unsigned long zcache_flush_found;
static unsigned long zcache_flobj_total;
static unsigned long zcache_flobj_found;
static unsigned long zcache_failed_eph_puts;
static unsigned long zcache_nonactive_puts;
static unsigned long zcache_failed_pers_puts;
/*
 * Tmem operations assume the poolid implies the invoking client.
 * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
 * RAMster has each client numbered by cluster node, and a KVM version
 * of zcache would have one client per guest and each client might
 * have multiple pools.
 */
static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli = NULL;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else {
		if (cli_id >= MAX_CLIENTS)
			goto out;
		cli = &zcache_clients[cli_id];
		atomic_inc(&cli->refcount);
	}
	if (poolid < MAX_POOLS_PER_CLIENT) {
		pool = cli->tmem_pools[poolid];
		if (pool != NULL)
			atomic_inc(&pool->refcount);
	}
out:
	return pool;
}
static void zcache_put_pool(struct tmem_pool *pool)
{
	struct zcache_client *cli = NULL;

	if (pool == NULL)
		BUG();
	cli = pool->client;
	atomic_dec(&pool->refcount);
	atomic_dec(&cli->refcount);
}
int zcache_new_client(uint16_t cli_id)
{
	struct zcache_client *cli = NULL;
	int ret = -1;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
#ifdef CONFIG_FRONTSWAP
	cli->xvpool = xv_create_pool();
	if (cli->xvpool == NULL)
		goto out;
#endif
	ret = 0;
out:
	return ret;
}
1461 static unsigned long zcache_failed_get_free_pages
;
1462 static unsigned long zcache_failed_alloc
;
1463 static unsigned long zcache_put_to_flush
;
1466 * for now, used named slabs so can easily track usage; later can
1467 * either just use kmalloc, or perhaps add a slab-like allocator
1468 * to more carefully manage total memory utilization
1470 static struct kmem_cache
*zcache_objnode_cache
;
1471 static struct kmem_cache
*zcache_obj_cache
;
1472 static struct kmem_cache
*ramster_flnode_cache
;
1473 static atomic_t zcache_curr_obj_count
= ATOMIC_INIT(0);
1474 static unsigned long zcache_curr_obj_count_max
;
1475 static atomic_t zcache_curr_objnode_count
= ATOMIC_INIT(0);
1476 static unsigned long zcache_curr_objnode_count_max
;
/*
 * to avoid memory allocation recursion (e.g. due to direct reclaim), we
 * preload all necessary data structures so the hostops callbacks never
 * actually do a malloc
 */
struct zcache_preload {
	void *page;
	struct tmem_obj *obj;
	int nr;
	struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
	struct flushlist_node *flnode;
};
static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
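/*
 * Rough flow of the preload scheme (a summary of the code below, not a
 * separate mechanism): a put first calls zcache_do_preload() to stock the
 * per-cpu zcache_preload with objnodes, an obj, a flushlist_node and (for
 * ephemeral pools) a raw page; the tmem hostops callbacks such as
 * zcache_obj_alloc() and zcache_objnode_alloc() then only pop from this
 * per-cpu stash, so they never allocate while tmem locks are held.
 */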
static int zcache_do_preload(struct tmem_pool *pool)
{
	struct zcache_preload *kp;
	struct tmem_objnode *objnode;
	struct tmem_obj *obj;
	struct flushlist_node *flnode;
	void *page;
	int ret = -ENOMEM;

	if (unlikely(zcache_objnode_cache == NULL))
		goto out;
	if (unlikely(zcache_obj_cache == NULL))
		goto out;
	preempt_disable();
	kp = &__get_cpu_var(zcache_preloads);
	while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
		preempt_enable_no_resched();
		objnode = kmem_cache_alloc(zcache_objnode_cache,
				ZCACHE_GFP_MASK);
		if (unlikely(objnode == NULL)) {
			zcache_failed_alloc++;
			goto out;
		}
		preempt_disable();
		kp = &__get_cpu_var(zcache_preloads);
		if (kp->nr < ARRAY_SIZE(kp->objnodes))
			kp->objnodes[kp->nr++] = objnode;
		else
			kmem_cache_free(zcache_objnode_cache, objnode);
	}
	preempt_enable_no_resched();
	obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
	if (unlikely(obj == NULL)) {
		zcache_failed_alloc++;
		goto out;
	}
	flnode = kmem_cache_alloc(ramster_flnode_cache, ZCACHE_GFP_MASK);
	if (unlikely(flnode == NULL)) {
		zcache_failed_alloc++;
		goto out;
	}
	if (is_ephemeral(pool)) {
		page = (void *)__get_free_page(ZCACHE_GFP_MASK);
		if (unlikely(page == NULL)) {
			zcache_failed_get_free_pages++;
			kmem_cache_free(zcache_obj_cache, obj);
			kmem_cache_free(ramster_flnode_cache, flnode);
			goto out;
		}
	}
	preempt_disable();
	kp = &__get_cpu_var(zcache_preloads);
	if (kp->obj == NULL)
		kp->obj = obj;
	else
		kmem_cache_free(zcache_obj_cache, obj);
	if (kp->flnode == NULL)
		kp->flnode = flnode;
	else
		kmem_cache_free(ramster_flnode_cache, flnode);
	if (is_ephemeral(pool)) {
		if (kp->page == NULL)
			kp->page = page;
		else
			free_page((unsigned long)page);
	}
	ret = 0;
out:
	return ret;
}
static int ramster_do_preload_flnode_only(struct tmem_pool *pool)
{
	struct zcache_preload *kp;
	struct flushlist_node *flnode;
	int ret = 0;

	BUG_ON(!irqs_disabled());
	if (unlikely(ramster_flnode_cache == NULL))
		BUG();
	kp = &__get_cpu_var(zcache_preloads);
	flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
	if (unlikely(flnode == NULL) && kp->flnode == NULL)
		BUG(); /* FIXME handle more gracefully, but how??? */
	else if (kp->flnode == NULL)
		kp->flnode = flnode;
	else
		kmem_cache_free(ramster_flnode_cache, flnode);
	return ret;
}
static void *zcache_get_free_page(void)
{
	struct zcache_preload *kp;
	void *page;

	kp = &__get_cpu_var(zcache_preloads);
	page = kp->page;
	BUG_ON(page == NULL);
	kp->page = NULL;
	return page;
}

static void zcache_free_page(void *p)
{
	free_page((unsigned long)p);
}
/*
 * zcache implementation for tmem host ops
 */

static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
{
	struct tmem_objnode *objnode = NULL;
	unsigned long count;
	struct zcache_preload *kp;

	kp = &__get_cpu_var(zcache_preloads);
	if (kp->nr <= 0)
		goto out;
	objnode = kp->objnodes[kp->nr - 1];
	BUG_ON(objnode == NULL);
	kp->objnodes[kp->nr - 1] = NULL;
	kp->nr--;
	count = atomic_inc_return(&zcache_curr_objnode_count);
	if (count > zcache_curr_objnode_count_max)
		zcache_curr_objnode_count_max = count;
out:
	return objnode;
}

static void zcache_objnode_free(struct tmem_objnode *objnode,
				struct tmem_pool *pool)
{
	atomic_dec(&zcache_curr_objnode_count);
	BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
	kmem_cache_free(zcache_objnode_cache, objnode);
}
static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
{
	struct tmem_obj *obj = NULL;
	unsigned long count;
	struct zcache_preload *kp;

	kp = &__get_cpu_var(zcache_preloads);
	obj = kp->obj;
	kp->obj = NULL;
	BUG_ON(obj == NULL);
	count = atomic_inc_return(&zcache_curr_obj_count);
	if (count > zcache_curr_obj_count_max)
		zcache_curr_obj_count_max = count;
	return obj;
}

static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
{
	atomic_dec(&zcache_curr_obj_count);
	BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
	kmem_cache_free(zcache_obj_cache, obj);
}
static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
{
	struct flushlist_node *flnode = NULL;
	struct zcache_preload *kp;
	int count;

	kp = &__get_cpu_var(zcache_preloads);
	flnode = kp->flnode;
	BUG_ON(flnode == NULL);
	kp->flnode = NULL;
	count = atomic_inc_return(&ramster_curr_flnode_count);
	if (count > ramster_curr_flnode_count_max)
		ramster_curr_flnode_count_max = count;
	return flnode;
}

static void ramster_flnode_free(struct flushlist_node *flnode,
				struct tmem_pool *pool)
{
	atomic_dec(&ramster_curr_flnode_count);
	BUG_ON(atomic_read(&ramster_curr_flnode_count) < 0);
	kmem_cache_free(ramster_flnode_cache, flnode);
}
static struct tmem_hostops zcache_hostops = {
	.obj_alloc = zcache_obj_alloc,
	.obj_free = zcache_obj_free,
	.objnode_alloc = zcache_objnode_alloc,
	.objnode_free = zcache_objnode_free,
};
/*
 * zcache implementations for PAM page descriptor ops
 */

static inline void dec_and_check(atomic_t *pvar)
{
	atomic_dec(pvar);
	/* later when all accounting is fixed, make this a BUG */
	WARN_ON_ONCE(atomic_read(pvar) < 0);
}

static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_eph_pampd_count_max;
static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
static unsigned long zcache_curr_pers_pampd_count_max;

/* forward reference */
static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
static int zcache_pampd_eph_create(char *data, size_t size, bool raw,
				struct tmem_pool *pool, struct tmem_oid *oid,
				uint32_t index, void **pampd)
{
	int ret = -1;
	void *cdata;
	size_t clen = PAGE_SIZE;
	struct zcache_client *cli = pool->client;
	uint16_t client_id = get_client_id_from_client(cli);
	struct page *page = NULL;
	unsigned long count;

	page = virt_to_page(data);
	ret = zcache_compress(page, &cdata, &clen);
	if (clen == 0 || clen > zbud_max_buddy_size()) {
		zcache_compress_poor++;
		ret = -EINVAL;
		goto out;
	}
	*pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
					index, page, cdata, clen);
	if (*pampd == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	ret = 0;
	count = atomic_inc_return(&zcache_curr_eph_pampd_count);
	if (count > zcache_curr_eph_pampd_count_max)
		zcache_curr_eph_pampd_count_max = count;
	if (client_id != LOCAL_CLIENT) {
		count = atomic_inc_return(&ramster_foreign_eph_pampd_count);
		if (count > ramster_foreign_eph_pampd_count_max)
			ramster_foreign_eph_pampd_count_max = count;
	}
out:
	return ret;
}
static int zcache_pampd_pers_create(char *data, size_t size, bool raw,
				struct tmem_pool *pool, struct tmem_oid *oid,
				uint32_t index, void **pampd)
{
	int ret = -1;
	void *cdata;
	size_t clen = PAGE_SIZE;
	struct zcache_client *cli = pool->client;
	struct page *page;
	unsigned long count;
	unsigned long zv_mean_zsize;
	struct zv_hdr *zv;
	long curr_pers_pampd_count;
	u64 total_zsize;
#ifdef RAMSTER_TESTING
	static bool pampd_neg_warned;
#endif

	curr_pers_pampd_count = atomic_read(&zcache_curr_pers_pampd_count) -
			atomic_read(&ramster_remote_pers_pages);
#ifdef RAMSTER_TESTING
	/* should always be positive, but warn if accounting is off */
	if (!pampd_neg_warned) {
		pr_warn("ramster: bad accounting for curr_pers_pampd_count\n");
		pampd_neg_warned = true;
	}
#endif
	if (curr_pers_pampd_count >
		(zv_page_count_policy_percent * totalram_pages) / 100) {
		zcache_policy_percent_exceeded++;
		goto out;
	}
	page = virt_to_page(data);
	if (zcache_compress(page, &cdata, &clen) == 0)
		goto out;
	/* reject if compression is too poor */
	if (clen > zv_max_zsize) {
		zcache_compress_poor++;
		goto out;
	}
	/* reject if mean compression is too poor */
	if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
		total_zsize = xv_get_total_size_bytes(cli->xvpool);
		zv_mean_zsize = div_u64(total_zsize, curr_pers_pampd_count);
		if (zv_mean_zsize > zv_max_mean_zsize) {
			zcache_mean_compress_poor++;
			goto out;
		}
	}
	*pampd = (void *)zv_create(cli, pool->pool_id, oid, index, cdata, clen);
	if (*pampd == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	ret = 0;
	count = atomic_inc_return(&zcache_curr_pers_pampd_count);
	if (count > zcache_curr_pers_pampd_count_max)
		zcache_curr_pers_pampd_count_max = count;
	if (is_local_client(cli))
		goto out;
	zv = *(struct zv_hdr **)pampd;
	count = atomic_inc_return(&ramster_foreign_pers_pampd_count);
	if (count > ramster_foreign_pers_pampd_count_max)
		ramster_foreign_pers_pampd_count_max = count;
out:
	return ret;
}
static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
				struct tmem_pool *pool, struct tmem_oid *oid,
				uint32_t index)
{
	void *pampd = NULL;
	int ret;
	bool ephemeral;

	BUG_ON(preemptible());
	ephemeral = (eph == 1) || ((eph == 0) && is_ephemeral(pool));
	if (ephemeral)
		ret = zcache_pampd_eph_create(data, size, raw, pool,
						oid, index, &pampd);
	else
		ret = zcache_pampd_pers_create(data, size, raw, pool,
						oid, index, &pampd);
	/* FIXME add some counters here for failed creates? */
	return pampd;
}
/*
 * fill the pageframe corresponding to the struct page with the data
 * from the passed pampd
 */
static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
					void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oid, uint32_t index)
{
	int ret = 0;

	BUG_ON(preemptible());
	BUG_ON(is_ephemeral(pool)); /* Fix later for shared pools? */
	BUG_ON(pampd_is_remote(pampd));
	if (raw)
		zv_copy_from_pampd(data, bufsize, pampd);
	else
		zv_decompress(virt_to_page(data), pampd);
	return ret;
}
static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
					void *pampd, struct tmem_pool *pool,
					struct tmem_oid *oid, uint32_t index)
{
	int ret = 0;
	unsigned long flags;
	struct zcache_client *cli = pool->client;

	BUG_ON(preemptible());
	BUG_ON(pampd_is_remote(pampd));
	if (is_ephemeral(pool)) {
		local_irq_save(flags);
		if (raw)
			zbud_copy_from_pampd(data, bufsize, pampd);
		else
			ret = zbud_decompress(virt_to_page(data), pampd);
		zbud_free_and_delist((struct zbud_hdr *)pampd);
		local_irq_restore(flags);
		if (!is_local_client(cli))
			dec_and_check(&ramster_foreign_eph_pampd_count);
		dec_and_check(&zcache_curr_eph_pampd_count);
	} else {
		if (is_local_client(cli))
			BUG();
		if (raw)
			zv_copy_from_pampd(data, bufsize, pampd);
		else
			zv_decompress(virt_to_page(data), pampd);
		zv_free(cli->xvpool, pampd);
		if (!is_local_client(cli))
			dec_and_check(&ramster_foreign_pers_pampd_count);
		dec_and_check(&zcache_curr_pers_pampd_count);
		ret = 0;
	}
	return ret;
}
static bool zcache_pampd_is_remote(void *pampd)
{
	return pampd_is_remote(pampd);
}
/*
 * free the pampd and remove it from any zcache lists
 * pampd must no longer be pointed to from any tmem data structures!
 */
static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
				struct tmem_oid *oid, uint32_t index, bool acct)
{
	struct zcache_client *cli = pool->client;
	bool eph = is_ephemeral(pool);
	struct zv_hdr *zv;

	BUG_ON(preemptible());
	if (pampd_is_remote(pampd)) {
		WARN_ON(acct == false);
		if (oid == NULL) {
			/*
			 * a NULL oid means to ignore this pampd free
			 * as the remote freeing will be handled elsewhere
			 */
		} else if (eph) {
			/* FIXME remote flush optional but probably good idea */
			/* FIXME get these working properly again */
			dec_and_check(&zcache_curr_eph_pampd_count);
		} else if (pampd_is_intransit(pampd)) {
			/* did a pers remote get_and_free, so just free local */
			pampd = pampd_mask_intransit_and_remote(pampd);
			zv_free(cli->xvpool, pampd);
		} else {
			struct flushlist_node *flnode =
				ramster_flnode_alloc(pool);

			flnode->xh.client_id = pampd_remote_node(pampd);
			flnode->xh.pool_id = pool->pool_id;
			flnode->xh.oid = *oid;
			flnode->xh.index = index;
			flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
			spin_lock(&zcache_rem_op_list_lock);
			list_add(&flnode->rem_op.list, &zcache_rem_op_list);
			spin_unlock(&zcache_rem_op_list_lock);
			dec_and_check(&zcache_curr_pers_pampd_count);
			dec_and_check(&ramster_remote_pers_pages);
		}
	} else if (eph) {
		zbud_free_and_delist((struct zbud_hdr *)pampd);
		if (!is_local_client(pool->client))
			dec_and_check(&ramster_foreign_eph_pampd_count);
		if (acct)
			/* FIXME get these working properly again */
			dec_and_check(&zcache_curr_eph_pampd_count);
	} else {
		zv = (struct zv_hdr *)pampd;
		if (!is_local_client(pool->client))
			dec_and_check(&ramster_foreign_pers_pampd_count);
		zv_free(cli->xvpool, zv);
		if (acct)
			/* FIXME get these working properly again */
			dec_and_check(&zcache_curr_pers_pampd_count);
	}
}
static void zcache_pampd_free_obj(struct tmem_pool *pool,
					struct tmem_obj *obj)
{
	struct flushlist_node *flnode;

	BUG_ON(preemptible());
	if (obj->extra == NULL)
		return;
	BUG_ON(!pampd_is_remote(obj->extra));
	flnode = ramster_flnode_alloc(pool);
	flnode->xh.client_id = pampd_remote_node(obj->extra);
	flnode->xh.pool_id = pool->pool_id;
	flnode->xh.oid = obj->oid;
	flnode->xh.index = FLUSH_ENTIRE_OBJECT;
	flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
	spin_lock(&zcache_rem_op_list_lock);
	list_add(&flnode->rem_op.list, &zcache_rem_op_list);
	spin_unlock(&zcache_rem_op_list_lock);
}
void zcache_pampd_new_obj(struct tmem_obj *obj)
{
}
int zcache_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)
{
	int ret = -1;

	if (new_pampd != NULL) {
		if (obj->extra == NULL)
			obj->extra = new_pampd;
		/* enforce that all remote pages in an object reside
		 * in the same node! */
		else if (pampd_remote_node(new_pampd) !=
				pampd_remote_node((void *)(obj->extra)))
			BUG();
	}
	ret = 0;
	return ret;
}
/*
 * Called by the message handler after a (still compressed) page has been
 * fetched from the remote machine in response to an "is_remote" tmem_get
 * or persistent tmem_localify.  For a tmem_get, "extra" is the address of
 * the page that is to be filled to successfully resolve the tmem_get; for
 * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only
 * in the local zcache).  "data" points to "size" bytes of (compressed) data
 * passed in the message.  In the case of a persistent remote get, if
 * pre-allocation was successful (see zcache_repatriate_preload), the page
 * is placed into both local zcache and at "extra".
 */
int zcache_localify(int pool_id, struct tmem_oid *oidp,
			uint32_t index, char *data, size_t size,
			void *extra)
{
	int ret = -ENOENT;
	unsigned long flags;
	struct tmem_pool *pool;
	bool ephemeral, delete = false;
	size_t clen = PAGE_SIZE;
	void *pampd, *saved_hb;
	struct tmem_obj *obj;

	pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
	if (unlikely(pool == NULL))
		/* pool doesn't exist anymore */
		goto out;
	ephemeral = is_ephemeral(pool);
	local_irq_save(flags);  /* FIXME: maybe only disable softirqs? */
	pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
	if (pampd == NULL) {
		/* hmmm... must have been a flush while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED pampd==NULL in zcache_localify\n");
#endif
		if (ephemeral)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		goto finish;
	} else if (unlikely(!pampd_is_remote(pampd))) {
		/* hmmm... must have been a dup put while waiting */
#ifdef RAMSTER_TESTING
		pr_err("UNTESTED dup while waiting in zcache_localify\n");
#endif
		if (ephemeral)
			ramster_remote_eph_pages_unsucc_get++;
		else
			ramster_remote_pers_pages_unsucc_get++;
		pampd = NULL;
		goto finish;
	} else if (size == 0) {
		/* no remote data, delete the local is_remote pampd */
		pampd = NULL;
		delete = true;
		if (ephemeral)
			ramster_remote_eph_pages_unsucc_get++;
		goto finish;
	}
	if (!ephemeral && pampd_is_intransit(pampd)) {
		/* localify to zcache */
		pampd = pampd_mask_intransit_and_remote(pampd);
		zv_copy_to_pampd(pampd, data, size);
	}
	if (extra != NULL) {
		/* decompress direct-to-memory to complete remotify */
		ret = lzo1x_decompress_safe((char *)data, size,
						(char *)extra, &clen);
		BUG_ON(ret != LZO_E_OK);
		BUG_ON(clen != PAGE_SIZE);
	}
	if (ephemeral)
		ramster_remote_eph_pages_succ_get++;
	else
		ramster_remote_pers_pages_succ_get++;
	ret = 0;
finish:
	tmem_localify_finish(obj, index, pampd, saved_hb, delete);
	zcache_put_pool(pool);
	local_irq_restore(flags);
out:
	return ret;
}
2096 * Called on a remote persistent tmem_get to attempt to preallocate
2097 * local storage for the data contained in the remote persistent page.
2098 * If successfully preallocated, returns the pampd, marked as remote and
2099 * in_transit. Else returns NULL. Note that the appropriate tmem data
2100 * structure must be locked.
static void *zcache_pampd_repatriate_preload(void *pampd,
						struct tmem_pool *pool,
						struct tmem_oid *oid,
						uint32_t index,
						bool *intransit)
{
	int clen = pampd_remote_size(pampd);
	void *ret_pampd = NULL;
	unsigned long flags;

	if (!pampd_is_remote(pampd))
		BUG();
	if (is_ephemeral(pool))
		BUG();
	if (pampd_is_intransit(pampd)) {
		/*
		 * to avoid multiple allocations (and maybe a memory leak)
		 * don't preallocate if already in the process of being
		 * repatriated
		 */
		*intransit = true;
		goto out;
	}
	*intransit = false;
	local_irq_save(flags);
	ret_pampd = (void *)zv_alloc(pool, oid, index, clen);
	if (ret_pampd != NULL) {
		/*
		 * a pampd is marked intransit if it is remote and space has
		 * been allocated for it locally (note, only happens for
		 * persistent pages, in which case the remote copy is freed)
		 */
		ret_pampd = pampd_mark_intransit(ret_pampd);
		dec_and_check(&ramster_remote_pers_pages);
	} else
		ramster_pers_pages_remote_nomem++;
	local_irq_restore(flags);
out:
	return ret_pampd;
}
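/*
 * Sketch of the intended calling sequence (illustrative only; the real
 * caller is the tmem core and the names below are assumptions):
 *
 *	bool intransit;
 *	void *local = zcache_pampd_repatriate_preload(remote_pampd, pool,
 *						       oidp, index, &intransit);
 *	if (local != NULL)
 *		// local space reserved; the pampd is marked remote+intransit
 *		// and zcache_localify() later fills it via zv_copy_to_pampd()
 *	else if (intransit)
 *		// a repatriation is already in flight; don't allocate again
 */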
/*
 * Called on a remote tmem_get to invoke a message to fetch the page.
 * Might sleep so no tmem locks can be held.  "extra" is passed
 * all the way through the round-trip messaging to zcache_localify.
 */
static int zcache_pampd_repatriate(void *fake_pampd, void *real_pampd,
				   struct tmem_pool *pool,
				   struct tmem_oid *oid, uint32_t index,
				   bool free, void *extra)
{
	struct tmem_xhandle xh;
	int ret;

	if (pampd_is_intransit(real_pampd))
		/* have local space pre-reserved, so free remote copy */
		free = true;
	xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);
	/* unreliable request/response for now */
	ret = ramster_remote_async_get(&xh, free,
					pampd_remote_node(fake_pampd),
					pampd_remote_size(fake_pampd),
					pampd_remote_cksum(fake_pampd),
					extra);
#ifdef RAMSTER_TESTING
	if (ret != 0 && ret != -ENOENT)
		pr_err("TESTING zcache_pampd_repatriate returns, ret=%d\n",
			ret);
#endif
	return ret;
}
static struct tmem_pamops zcache_pamops = {
	.create = zcache_pampd_create,
	.get_data = zcache_pampd_get_data,
	.free = zcache_pampd_free,
	.get_data_and_free = zcache_pampd_get_data_and_free,
	.free_obj = zcache_pampd_free_obj,
	.is_remote = zcache_pampd_is_remote,
	.repatriate_preload = zcache_pampd_repatriate_preload,
	.repatriate = zcache_pampd_repatriate,
	.new_obj = zcache_pampd_new_obj,
	.replace_in_obj = zcache_pampd_replace_in_obj,
};
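/*
 * These ops are how the generic tmem core calls back into ramster/zcache
 * for page-accessible-memory (pampd) management.  A minimal sketch of the
 * wiring done at init time (see zcache_init() below):
 *
 *	tmem_register_hostops(&zcache_hostops);
 *	tmem_register_pamops(&zcache_pamops);
 *
 * After registration, a tmem_get() on a pampd for which .is_remote returns
 * true proceeds through .repatriate_preload and .repatriate, and completes
 * asynchronously in zcache_localify().
 */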
/*
 * zcache compression/decompression and related per-cpu stuff
 */

#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
#define LZO_DSTMEM_PAGE_ORDER 1
static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
{
	int ret = 0;
	unsigned char *dmem = __get_cpu_var(zcache_dstmem);
	unsigned char *wmem = __get_cpu_var(zcache_workmem);
	char *from_va;

	BUG_ON(!irqs_disabled());
	if (unlikely(dmem == NULL || wmem == NULL))
		goto out;	/* no buffer, so can't compress */
	from_va = kmap_atomic(from);
	mb();
	ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
	BUG_ON(ret != LZO_E_OK);
	*out_va = dmem;
	kunmap_atomic(from_va);
	ret = 1;
out:
	return ret;
}
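/*
 * Summary of the caller contract above (no new behavior): interrupts must
 * be disabled, and on success *out_va points at this CPU's zcache_dstmem
 * buffer, so the compressed data is only valid until the caller re-enables
 * interrupts/preemption.  An assumed caller pattern, for illustration:
 *
 *	unsigned long flags;
 *	size_t clen;
 *	void *cdata;
 *
 *	local_irq_save(flags);
 *	if (zcache_compress(page, &cdata, &clen))
 *		// copy or store the clen bytes at cdata before restoring irqs
 *	local_irq_restore(flags);
 */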
static int zcache_cpu_notifier(struct notifier_block *nb,
				unsigned long action, void *pcpu)
{
	int cpu = (long)pcpu;
	struct zcache_preload *kp;

	switch (action) {
	case CPU_UP_PREPARE:
		per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
			GFP_KERNEL | __GFP_REPEAT,
			LZO_DSTMEM_PAGE_ORDER),
		per_cpu(zcache_workmem, cpu) =
			kzalloc(LZO1X_MEM_COMPRESS,
				GFP_KERNEL | __GFP_REPEAT);
		per_cpu(zcache_remoteputmem, cpu) =
			kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		kfree(per_cpu(zcache_remoteputmem, cpu));
		per_cpu(zcache_remoteputmem, cpu) = NULL;
		free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
			LZO_DSTMEM_PAGE_ORDER);
		per_cpu(zcache_dstmem, cpu) = NULL;
		kfree(per_cpu(zcache_workmem, cpu));
		per_cpu(zcache_workmem, cpu) = NULL;
		kp = &per_cpu(zcache_preloads, cpu);
		while (kp->nr) {
			kmem_cache_free(zcache_objnode_cache,
					kp->objnodes[kp->nr - 1]);
			kp->objnodes[kp->nr - 1] = NULL;
			kp->nr--;
		}
		if (kp->obj) {
			kmem_cache_free(zcache_obj_cache, kp->obj);
			kp->obj = NULL;
		}
		if (kp->flnode) {
			kmem_cache_free(ramster_flnode_cache, kp->flnode);
			kp->flnode = NULL;
		}
		if (kp->page) {
			free_page((unsigned long)kp->page);
			kp->page = NULL;
		}
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}
static struct notifier_block zcache_cpu_notifier_block = {
	.notifier_call = zcache_cpu_notifier
};

#ifdef CONFIG_SYSFS
#define ZCACHE_SYSFS_RO(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", zcache_##_name); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}

#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return _func(buf); \
	} \
	static struct kobj_attribute zcache_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = zcache_##_name##_show, \
	}
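/*
 * For clarity: ZCACHE_SYSFS_RO(flush_total), used below, expands to a
 * read-only show routine plus an attribute, roughly:
 *
 *	static ssize_t zcache_flush_total_show(struct kobject *kobj,
 *				struct kobj_attribute *attr, char *buf)
 *	{
 *		return sprintf(buf, "%lu\n", zcache_flush_total);
 *	}
 *	static struct kobj_attribute zcache_flush_total_attr = {
 *		.attr = { .name = "flush_total", .mode = 0444 },
 *		.show = zcache_flush_total_show,
 *	};
 */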
ZCACHE_SYSFS_RO(curr_obj_count_max);
ZCACHE_SYSFS_RO(curr_objnode_count_max);
ZCACHE_SYSFS_RO(flush_total);
ZCACHE_SYSFS_RO(flush_found);
ZCACHE_SYSFS_RO(flobj_total);
ZCACHE_SYSFS_RO(flobj_found);
ZCACHE_SYSFS_RO(failed_eph_puts);
ZCACHE_SYSFS_RO(nonactive_puts);
ZCACHE_SYSFS_RO(failed_pers_puts);
ZCACHE_SYSFS_RO(zbud_curr_zbytes);
ZCACHE_SYSFS_RO(zbud_cumul_zpages);
ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
ZCACHE_SYSFS_RO(zbud_buddied_count);
ZCACHE_SYSFS_RO(evicted_raw_pages);
ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
ZCACHE_SYSFS_RO(evicted_buddied_pages);
ZCACHE_SYSFS_RO(failed_get_free_pages);
ZCACHE_SYSFS_RO(failed_alloc);
ZCACHE_SYSFS_RO(put_to_flush);
ZCACHE_SYSFS_RO(compress_poor);
ZCACHE_SYSFS_RO(mean_compress_poor);
ZCACHE_SYSFS_RO(policy_percent_exceeded);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
			zbud_show_unbuddied_list_counts);
ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
			zbud_show_cumul_chunk_counts);
ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
			zv_curr_dist_counts_show);
ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
			zv_cumul_dist_counts_show);
static struct attribute *zcache_attrs[] = {
	&zcache_curr_obj_count_attr.attr,
	&zcache_curr_obj_count_max_attr.attr,
	&zcache_curr_objnode_count_attr.attr,
	&zcache_curr_objnode_count_max_attr.attr,
	&zcache_flush_total_attr.attr,
	&zcache_flobj_total_attr.attr,
	&zcache_flush_found_attr.attr,
	&zcache_flobj_found_attr.attr,
	&zcache_failed_eph_puts_attr.attr,
	&zcache_nonactive_puts_attr.attr,
	&zcache_failed_pers_puts_attr.attr,
	&zcache_policy_percent_exceeded_attr.attr,
	&zcache_compress_poor_attr.attr,
	&zcache_mean_compress_poor_attr.attr,
	&zcache_zbud_curr_raw_pages_attr.attr,
	&zcache_zbud_curr_zpages_attr.attr,
	&zcache_zbud_curr_zbytes_attr.attr,
	&zcache_zbud_cumul_zpages_attr.attr,
	&zcache_zbud_cumul_zbytes_attr.attr,
	&zcache_zbud_buddied_count_attr.attr,
	&zcache_evicted_raw_pages_attr.attr,
	&zcache_evicted_unbuddied_pages_attr.attr,
	&zcache_evicted_buddied_pages_attr.attr,
	&zcache_failed_get_free_pages_attr.attr,
	&zcache_failed_alloc_attr.attr,
	&zcache_put_to_flush_attr.attr,
	&zcache_zbud_unbuddied_list_counts_attr.attr,
	&zcache_zbud_cumul_chunk_counts_attr.attr,
	&zcache_zv_curr_dist_counts_attr.attr,
	&zcache_zv_cumul_dist_counts_attr.attr,
	&zcache_zv_max_zsize_attr.attr,
	&zcache_zv_max_mean_zsize_attr.attr,
	&zcache_zv_page_count_policy_percent_attr.attr,
	NULL,
};
static struct attribute_group zcache_attr_group = {
	.attrs = zcache_attrs,
	.name = "zcache",
};
#define RAMSTER_SYSFS_RO(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}

#define RAMSTER_SYSFS_RW(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%lu\n", ramster_##_name); \
	} \
	static ssize_t ramster_##_name##_store(struct kobject *kobj, \
		struct kobj_attribute *attr, const char *buf, size_t count) \
	{ \
		int err; \
		unsigned long enable; \
		err = kstrtoul(buf, 10, &enable); \
		if (err) \
			return -EINVAL; \
		ramster_##_name = enable; \
		return count; \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0644 }, \
		.show = ramster_##_name##_show, \
		.store = ramster_##_name##_store, \
	}

#define RAMSTER_SYSFS_RO_ATOMIC(_name) \
	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
				struct kobj_attribute *attr, char *buf) \
	{ \
		return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
	} \
	static struct kobj_attribute ramster_##_name##_attr = { \
		.attr = { .name = __stringify(_name), .mode = 0444 }, \
		.show = ramster_##_name##_show, \
	}
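/*
 * For example, RAMSTER_SYSFS_RW(pers_remotify_enable) below generates both
 * a show and a store routine for the ramster_pers_remotify_enable variable;
 * assuming the group is registered under mm_kobj as done in zcache_init(),
 * the attribute appears as a 0644 sysfs file and a write boils down to:
 *
 *	err = kstrtoul(buf, 10, &enable);
 *	if (!err)
 *		ramster_pers_remotify_enable = enable;
 */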
RAMSTER_SYSFS_RO(interface_revision);
RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);
RAMSTER_SYSFS_RW(pers_remotify_enable);
RAMSTER_SYSFS_RW(eph_remotify_enable);
RAMSTER_SYSFS_RO(eph_pages_remoted);
RAMSTER_SYSFS_RO(eph_pages_remote_failed);
RAMSTER_SYSFS_RO(pers_pages_remoted);
RAMSTER_SYSFS_RO(pers_pages_remote_failed);
RAMSTER_SYSFS_RO(pers_pages_remote_nomem);
RAMSTER_SYSFS_RO(remote_pages_flushed);
RAMSTER_SYSFS_RO(remote_page_flushes_failed);
RAMSTER_SYSFS_RO(remote_objects_flushed);
RAMSTER_SYSFS_RO(remote_object_flushes_failed);
RAMSTER_SYSFS_RO(remote_eph_pages_succ_get);
RAMSTER_SYSFS_RO(remote_eph_pages_unsucc_get);
RAMSTER_SYSFS_RO(remote_pers_pages_succ_get);
RAMSTER_SYSFS_RO(remote_pers_pages_unsucc_get);
RAMSTER_SYSFS_RO_ATOMIC(foreign_eph_pampd_count);
RAMSTER_SYSFS_RO(foreign_eph_pampd_count_max);
RAMSTER_SYSFS_RO_ATOMIC(foreign_pers_pampd_count);
RAMSTER_SYSFS_RO(foreign_pers_pampd_count_max);
RAMSTER_SYSFS_RO_ATOMIC(curr_flnode_count);
RAMSTER_SYSFS_RO(curr_flnode_count_max);
#define MANUAL_NODES 8
static bool ramster_nodes_manual_up[MANUAL_NODES];

static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	int i;
	char *p = buf;

	for (i = 0; i < MANUAL_NODES; i++)
		if (ramster_nodes_manual_up[i])
			p += sprintf(p, "%d ", i);
	p += sprintf(p, "\n");
	return p - buf;
}
static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	}
	if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	}
	if (ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d already up, ignoring\n",
						(int)node_num);
	} else {
		ramster_nodes_manual_up[node_num] = true;
		r2net_hb_node_up_manual((int)node_num);
	}
	return count;
}
static struct kobj_attribute ramster_manual_node_up_attr = {
	.attr = { .name = "manual_node_up", .mode = 0644 },
	.show = ramster_manual_node_up_show,
	.store = ramster_manual_node_up_store,
};
static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	if (ramster_remote_target_nodenum == -1UL)
		return sprintf(buf, "unset\n");
	else
		return sprintf(buf, "%d\n", ramster_remote_target_nodenum);
}
static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long node_num;

	err = kstrtoul(buf, 10, &node_num);
	if (err) {
		pr_err("ramster: bad strtoul?\n");
		return -EINVAL;
	} else if (node_num == -1UL) {
		pr_err("ramster: disabling all remotification, "
			"data may still reside on remote nodes however\n");
		return -EINVAL;
	} else if (node_num >= MANUAL_NODES) {
		pr_err("ramster: bad node_num=%lu?\n", node_num);
		return -EINVAL;
	} else if (!ramster_nodes_manual_up[node_num]) {
		pr_err("ramster: node %d not up, ignoring setting "
			"of remotification target\n", (int)node_num);
	} else if (r2net_remote_target_node_set((int)node_num) >= 0) {
		pr_info("ramster: node %d set as remotification target\n",
				(int)node_num);
		ramster_remote_target_nodenum = (int)node_num;
	} else {
		pr_err("ramster: bad num to node node_num=%d?\n",
				(int)node_num);
		return -EINVAL;
	}
	return count;
}
static struct kobj_attribute ramster_remote_target_nodenum_attr = {
	.attr = { .name = "remote_target_nodenum", .mode = 0644 },
	.show = ramster_remote_target_nodenum_show,
	.store = ramster_remote_target_nodenum_store,
};
static struct attribute *ramster_attrs[] = {
	&ramster_interface_revision_attr.attr,
	&ramster_pers_remotify_enable_attr.attr,
	&ramster_eph_remotify_enable_attr.attr,
	&ramster_remote_pers_pages_attr.attr,
	&ramster_eph_pages_remoted_attr.attr,
	&ramster_eph_pages_remote_failed_attr.attr,
	&ramster_pers_pages_remoted_attr.attr,
	&ramster_pers_pages_remote_failed_attr.attr,
	&ramster_pers_pages_remote_nomem_attr.attr,
	&ramster_remote_pages_flushed_attr.attr,
	&ramster_remote_page_flushes_failed_attr.attr,
	&ramster_remote_objects_flushed_attr.attr,
	&ramster_remote_object_flushes_failed_attr.attr,
	&ramster_remote_eph_pages_succ_get_attr.attr,
	&ramster_remote_eph_pages_unsucc_get_attr.attr,
	&ramster_remote_pers_pages_succ_get_attr.attr,
	&ramster_remote_pers_pages_unsucc_get_attr.attr,
	&ramster_foreign_eph_pampd_count_attr.attr,
	&ramster_foreign_eph_pampd_count_max_attr.attr,
	&ramster_foreign_pers_pampd_count_attr.attr,
	&ramster_foreign_pers_pampd_count_max_attr.attr,
	&ramster_curr_flnode_count_attr.attr,
	&ramster_curr_flnode_count_max_attr.attr,
	&ramster_manual_node_up_attr.attr,
	&ramster_remote_target_nodenum_attr.attr,
	NULL,
};
static struct attribute_group ramster_attr_group = {
	.attrs = ramster_attrs,
	.name = "ramster",
};

#endif /* CONFIG_SYSFS */
/*
 * When zcache is disabled ("frozen"), pools can be created and destroyed,
 * but all puts (and thus all other operations that require memory allocation)
 * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
 * data consistency requires all puts while frozen to be converted into
 * flushes.
 */
static bool zcache_freeze;
/*
 * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
 */
static int shrink_zcache_memory(struct shrinker *shrink,
				struct shrink_control *sc)
{
	int ret = -1;
	int nr = sc->nr_to_scan;
	gfp_t gfp_mask = sc->gfp_mask;

	if (nr >= 0) {
		if (!(gfp_mask & __GFP_FS))
			/* does this case really need to be skipped? */
			goto out;
		zbud_evict_pages(nr);
	}
	ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
out:
	return ret;
}
static struct shrinker zcache_shrinker = {
	.shrink = shrink_zcache_memory,
	.seeks = DEFAULT_SEEKS,
};
/*
 * zcache shims between cleancache/frontswap ops and tmem
 */
int zcache_put(int cli_id, int pool_id, struct tmem_oid *oidp,
			uint32_t index, char *data, size_t size,
			bool raw, int ephemeral)
{
	struct tmem_pool *pool;
	int ret = -1;

	BUG_ON(!irqs_disabled());
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	if (unlikely(pool == NULL))
		goto out;
	if (!zcache_freeze && zcache_do_preload(pool) == 0) {
		/* preload does preempt_disable on success */
		ret = tmem_put(pool, oidp, index, data, size, raw, ephemeral);
		if (ret < 0) {
			if (is_ephemeral(pool))
				zcache_failed_eph_puts++;
			else
				zcache_failed_pers_puts++;
		}
		zcache_put_pool(pool);
		preempt_enable_no_resched();
	} else {
		zcache_put_to_flush++;
		if (atomic_read(&pool->obj_count) > 0)
			/* the put fails whether the flush succeeds or not */
			(void)tmem_flush_page(pool, oidp, index);
		zcache_put_pool(pool);
	}
out:
	return ret;
}
int zcache_get(int cli_id, int pool_id, struct tmem_oid *oidp,
			uint32_t index, char *data, size_t *sizep,
			bool raw, int get_and_free)
{
	struct tmem_pool *pool;
	int ret = -1;
	bool eph;

	if (!raw) {
		BUG_ON(irqs_disabled());
		BUG_ON(in_softirq());
	}
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	eph = is_ephemeral(pool);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_get(pool, oidp, index, data, sizep,
					raw, get_and_free);
		zcache_put_pool(pool);
	}
	WARN_ONCE((!eph && (ret != 0)), "zcache_get fails on persistent pool, "
			"bad things are very likely to happen soon\n");
#ifdef RAMSTER_TESTING
	if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool)))
		pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret);
#endif
	if (ret == -EAGAIN)
		BUG(); /* FIXME... don't need this anymore??? let's ensure */
	return ret;
}
int zcache_flush(int cli_id, int pool_id,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	zcache_flush_total++;
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	ramster_do_preload_flnode_only(pool);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_page(pool, oidp, index);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		zcache_flush_found++;
	local_irq_restore(flags);
	return ret;
}
int zcache_flush_object(int cli_id, int pool_id, struct tmem_oid *oidp)
{
	struct tmem_pool *pool;
	int ret = -1;
	unsigned long flags;

	local_irq_save(flags);
	zcache_flobj_total++;
	pool = zcache_get_pool_by_id(cli_id, pool_id);
	ramster_do_preload_flnode_only(pool);
	if (likely(pool != NULL)) {
		if (atomic_read(&pool->obj_count) > 0)
			ret = tmem_flush_object(pool, oidp);
		zcache_put_pool(pool);
	}
	if (ret >= 0)
		zcache_flobj_found++;
	local_irq_restore(flags);
	return ret;
}
int zcache_client_destroy_pool(int cli_id, int pool_id)
{
	struct tmem_pool *pool = NULL;
	struct zcache_client *cli = NULL;
	int ret = -1;

	if (pool_id < 0)
		goto out;
	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	atomic_inc(&cli->refcount);
	pool = cli->tmem_pools[pool_id];
	if (pool == NULL)
		goto out;
	cli->tmem_pools[pool_id] = NULL;
	/* wait for pool activity on other cpus to quiesce */
	while (atomic_read(&pool->refcount) != 0)
		;
	atomic_dec(&cli->refcount);
	local_bh_disable();
	ret = tmem_destroy_pool(pool);
	local_bh_enable();
	kfree(pool);
	pr_info("ramster: destroyed pool id=%d cli_id=%d\n", pool_id, cli_id);
out:
	return ret;
}
static int zcache_destroy_pool(int pool_id)
{
	return zcache_client_destroy_pool(LOCAL_CLIENT, pool_id);
}
int zcache_new_pool(uint16_t cli_id, uint32_t flags)
{
	int poolid = -1;
	struct tmem_pool *pool;
	struct zcache_client *cli = NULL;

	if (cli_id == LOCAL_CLIENT)
		cli = &zcache_host;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if (cli == NULL)
		goto out;
	atomic_inc(&cli->refcount);
	pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
	if (pool == NULL) {
		pr_info("ramster: pool creation failed: out of memory\n");
		goto out;
	}

	for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
		if (cli->tmem_pools[poolid] == NULL)
			break;
	if (poolid >= MAX_POOLS_PER_CLIENT) {
		pr_info("ramster: pool creation failed: max exceeded\n");
		kfree(pool);
		poolid = -1;
		goto out;
	}
	atomic_set(&pool->refcount, 0);
	pool->client = cli;
	pool->pool_id = poolid;
	tmem_new_pool(pool, flags);
	cli->tmem_pools[poolid] = pool;
	if (cli_id == LOCAL_CLIENT)
		pr_info("ramster: created %s tmem pool, id=%d, local client\n",
			flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
			poolid);
	else
		pr_info("ramster: created %s tmem pool, id=%d, client=%d\n",
			flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
			poolid, cli_id);
out:
	if (cli != NULL)
		atomic_dec(&cli->refcount);
	return poolid;
}
static int zcache_local_new_pool(uint32_t flags)
{
	return zcache_new_pool(LOCAL_CLIENT, flags);
}
int zcache_autocreate_pool(int cli_id, int pool_id, bool ephemeral)
{
	struct tmem_pool *pool;
	struct zcache_client *cli = NULL;
	uint32_t flags = ephemeral ? 0 : TMEM_POOL_PERSIST;
	int ret = -1;

	if (cli_id == LOCAL_CLIENT)
		goto out;
	if (pool_id >= MAX_POOLS_PER_CLIENT)
		goto out;
	else if ((unsigned int)cli_id < MAX_CLIENTS)
		cli = &zcache_clients[cli_id];
	if ((ephemeral && !use_cleancache) || (!ephemeral && !use_frontswap))
		BUG(); /* FIXME, handle more gracefully later */
	if (!cli->allocated) {
		if (zcache_new_client(cli_id))
			BUG(); /* FIXME, handle more gracefully later */
		cli = &zcache_clients[cli_id];
	}
	atomic_inc(&cli->refcount);
	pool = cli->tmem_pools[pool_id];
	if (pool != NULL) {
		if (pool->persistent && ephemeral) {
			pr_err("zcache_autocreate_pool: type mismatch\n");
			goto out;
		}
		ret = 0;
		goto out;
	}
	pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
	if (pool == NULL) {
		pr_info("ramster: pool creation failed: out of memory\n");
		goto out;
	}
	atomic_set(&pool->refcount, 0);
	pool->client = cli;
	pool->pool_id = pool_id;
	tmem_new_pool(pool, flags);
	cli->tmem_pools[pool_id] = pool;
	pr_info("ramster: AUTOcreated %s tmem poolid=%d, for remote client=%d\n",
		flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
		pool_id, cli_id);
	ret = 0;
out:
	if (cli == NULL)
		BUG(); /* FIXME, handle more gracefully later */
		/* pr_err("zcache_autocreate_pool: failed\n"); */
	if (cli != NULL)
		atomic_dec(&cli->refcount);
	return ret;
}
/*
 * Two kernel functionalities currently can be layered on top of tmem.
 * These are "cleancache" which is used as a second-chance cache for clean
 * page cache pages; and "frontswap" which is used for swap pages
 * to avoid writes to disk.  A generic "shim" is provided here for each
 * to translate in-kernel semantics to zcache semantics.
 */
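/*
 * The mapping both shims rely on (summarized from the code below): a
 * cleancache_filekey is reinterpreted directly as a tmem_oid, while a
 * frontswap (type, offset) pair is "swizzled" into an oid/index pair.
 * The cleancache put path, for instance, boils down to:
 *
 *	struct tmem_oid oid = *(struct tmem_oid *)&key;
 *	(void)zcache_put(LOCAL_CLIENT, pool_id, &oid, index,
 *			page_address(page), PAGE_SIZE, 0, 1);
 */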
#ifdef CONFIG_CLEANCACHE
static void zcache_cleancache_put_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index, struct page *page)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;

#ifdef __PG_WAS_ACTIVE
	if (!PageWasActive(page)) {
		zcache_nonactive_puts++;
		return;
	}
#endif
	if (likely(ind == index)) {
		char *kva = page_address(page);

		(void)zcache_put(LOCAL_CLIENT, pool_id, &oid, index,
			kva, PAGE_SIZE, 0, 1);
	}
}
static int zcache_cleancache_get_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index, struct page *page)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;
	int ret = -1;

	if (likely(ind == index)) {
		char *kva = page_address(page);
		size_t size = PAGE_SIZE;

		ret = zcache_get(LOCAL_CLIENT, pool_id, &oid, index,
			kva, &size, 0, 0);
#ifdef __PG_WAS_ACTIVE
		if (ret == 0)
			SetPageWasActive(page);
#endif
	}
	return ret;
}
static void zcache_cleancache_flush_page(int pool_id,
					struct cleancache_filekey key,
					pgoff_t index)
{
	u32 ind = (u32) index;
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	if (likely(ind == index))
		(void)zcache_flush(LOCAL_CLIENT, pool_id, &oid, ind);
}
static void zcache_cleancache_flush_inode(int pool_id,
					struct cleancache_filekey key)
{
	struct tmem_oid oid = *(struct tmem_oid *)&key;

	(void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
}
static void zcache_cleancache_flush_fs(int pool_id)
{
	if (pool_id >= 0)
		(void)zcache_destroy_pool(pool_id);
}
static int zcache_cleancache_init_fs(size_t pagesize)
{
	BUG_ON(sizeof(struct cleancache_filekey) !=
				sizeof(struct tmem_oid));
	BUG_ON(pagesize != PAGE_SIZE);
	return zcache_local_new_pool(0);
}
static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
{
	/* shared pools are unsupported and map to private */
	BUG_ON(sizeof(struct cleancache_filekey) !=
				sizeof(struct tmem_oid));
	BUG_ON(pagesize != PAGE_SIZE);
	return zcache_local_new_pool(0);
}
static struct cleancache_ops zcache_cleancache_ops = {
	.put_page = zcache_cleancache_put_page,
	.get_page = zcache_cleancache_get_page,
	.invalidate_page = zcache_cleancache_flush_page,
	.invalidate_inode = zcache_cleancache_flush_inode,
	.invalidate_fs = zcache_cleancache_flush_fs,
	.init_shared_fs = zcache_cleancache_init_shared_fs,
	.init_fs = zcache_cleancache_init_fs
};
zcache_cleancache_register_ops(void)
2978 struct cleancache_ops old_ops
=
2979 cleancache_register_ops(&zcache_cleancache_ops
);
#ifdef CONFIG_FRONTSWAP
/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
static int zcache_frontswap_poolid = -1;

/*
 * Swizzling increases objects per swaptype, increasing tmem concurrency
 * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
 */
#define SWIZ_BITS		8
#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
#define iswiz(_ind)		(_ind >> SWIZ_BITS)
static inline struct tmem_oid oswiz(unsigned type, u32 ind)
{
	struct tmem_oid oid = { .oid = { 0 } };
	oid.oid[0] = _oswiz(type, ind);
	return oid;
}
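/*
 * Worked example (illustrative; assumes SWIZ_BITS == 8 as defined above):
 * a frontswap store of swap type 2 at page offset 0x1234 yields
 *
 *	oid.oid[0] = _oswiz(2, 0x1234) = (2 << 8) | 0x34 = 0x234
 *	index      = iswiz(0x1234)     = 0x12
 *
 * so consecutive swap offsets land in different tmem objects, spreading
 * per-object locking and increasing concurrency.
 */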
static int zcache_frontswap_store(unsigned type, pgoff_t offset,
					struct page *page)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);
	int ret = -1;
	unsigned long flags;
	char *kva;

	BUG_ON(!PageLocked(page));
	if (likely(ind64 == ind)) {
		local_irq_save(flags);
		kva = page_address(page);
		ret = zcache_put(LOCAL_CLIENT, zcache_frontswap_poolid,
				&oid, iswiz(ind), kva, PAGE_SIZE, 0, 0);
		local_irq_restore(flags);
	}
	return ret;
}
/* returns 0 if the page was successfully gotten from frontswap, -1 if
 * was not present (should never happen!) */
static int zcache_frontswap_load(unsigned type, pgoff_t offset,
					struct page *page)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);
	int ret = -1;

	preempt_disable(); /* FIXME, remove this? */
	BUG_ON(!PageLocked(page));
	if (likely(ind64 == ind)) {
		char *kva = page_address(page);
		size_t size = PAGE_SIZE;

		ret = zcache_get(LOCAL_CLIENT, zcache_frontswap_poolid,
				&oid, iswiz(ind), kva, &size, 0, -1);
	}
	preempt_enable(); /* FIXME, remove this? */
	return ret;
}
/* flush a single page from frontswap */
static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
{
	u64 ind64 = (u64)offset;
	u32 ind = (u32)offset;
	struct tmem_oid oid = oswiz(type, ind);

	if (likely(ind64 == ind))
		(void)zcache_flush(LOCAL_CLIENT, zcache_frontswap_poolid,
					&oid, iswiz(ind));
}
/* flush all pages from the passed swaptype */
static void zcache_frontswap_flush_area(unsigned type)
{
	struct tmem_oid oid;
	int ind;

	for (ind = SWIZ_MASK; ind >= 0; ind--) {
		oid = oswiz(type, ind);
		(void)zcache_flush_object(LOCAL_CLIENT,
					zcache_frontswap_poolid, &oid);
	}
}
static void zcache_frontswap_init(unsigned ignored)
{
	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
	if (zcache_frontswap_poolid < 0)
		zcache_frontswap_poolid =
			zcache_local_new_pool(TMEM_POOL_PERSIST);
}
static struct frontswap_ops zcache_frontswap_ops = {
	.store = zcache_frontswap_store,
	.load = zcache_frontswap_load,
	.invalidate_page = zcache_frontswap_flush_page,
	.invalidate_area = zcache_frontswap_flush_area,
	.init = zcache_frontswap_init
};
struct frontswap_ops zcache_frontswap_register_ops(void)
{
	struct frontswap_ops old_ops =
		frontswap_register_ops(&zcache_frontswap_ops);

	return old_ops;
}
#endif
/*
 * frontswap selfshrinking
 */

#ifdef CONFIG_FRONTSWAP
/* In HZ, controls frequency of worker invocation. */
static unsigned int selfshrink_interval __read_mostly = 5;

static void selfshrink_process(struct work_struct *work);
static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);

/* Enable/disable with sysfs. */
static bool frontswap_selfshrinking __read_mostly;

/* Enable/disable with kernel boot option. */
static bool use_frontswap_selfshrink __initdata = true;

/*
 * The default values for the following parameters were deemed reasonable
 * by experimentation, may be workload-dependent, and can all be
 * adjusted via sysfs.
 */

/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
static unsigned int frontswap_hysteresis __read_mostly = 20;

/*
 * Number of selfshrink worker invocations to wait before observing that
 * frontswap selfshrinking should commence. Note that selfshrinking does
 * not use a separate worker thread.
 */
static unsigned int frontswap_inertia __read_mostly = 3;

/* Countdown to next invocation of frontswap_shrink() */
static unsigned long frontswap_inertia_counter;
/*
 * Invoked by the selfshrink worker thread, uses current number of pages
 * in frontswap (frontswap_curr_pages()), previous status, and control
 * values (hysteresis and inertia) to determine if frontswap should be
 * shrunk and what the new frontswap size should be.  Note that
 * frontswap_shrink is essentially a partial swapoff that immediately
 * transfers pages from the "swap device" (frontswap) back into kernel
 * RAM; despite the name, frontswap "shrinking" is very different from
 * the "shrinker" interface used by the kernel MM subsystem to reclaim
 * memory.
 */
static void frontswap_selfshrink(void)
{
	static unsigned long cur_frontswap_pages;
	static unsigned long last_frontswap_pages;
	static unsigned long tgt_frontswap_pages;

	last_frontswap_pages = cur_frontswap_pages;
	cur_frontswap_pages = frontswap_curr_pages();
	if (!cur_frontswap_pages ||
			(cur_frontswap_pages > last_frontswap_pages)) {
		frontswap_inertia_counter = frontswap_inertia;
		return;
	}
	if (frontswap_inertia_counter && --frontswap_inertia_counter)
		return;
	if (cur_frontswap_pages <= frontswap_hysteresis)
		tgt_frontswap_pages = 0;
	else
		tgt_frontswap_pages = cur_frontswap_pages -
			(cur_frontswap_pages / frontswap_hysteresis);
	frontswap_shrink(tgt_frontswap_pages);
}
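/*
 * Worked example of the policy above (numbers are illustrative): with the
 * default frontswap_hysteresis of 20, if frontswap has not grown and
 * currently holds 1000 pages, the new target is
 *
 *	1000 - (1000 / 20) = 950 pages
 *
 * i.e. roughly 5% is pulled back into kernel RAM per invocation; once the
 * count drops to 20 pages or fewer the target becomes 0 and frontswap is
 * emptied entirely.
 */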
static int __init ramster_nofrontswap_selfshrink_setup(char *s)
{
	use_frontswap_selfshrink = false;
	return 1;
}

__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup);
static void selfshrink_process(struct work_struct *work)
{
	if (frontswap_selfshrinking && frontswap_enabled) {
		frontswap_selfshrink();
		schedule_delayed_work(&selfshrink_worker,
			selfshrink_interval * HZ);
	}
}
static int ramster_enabled;
static int __init ramster_selfshrink_init(void)
{
	frontswap_selfshrinking = ramster_enabled && use_frontswap_selfshrink;
	if (frontswap_selfshrinking)
		pr_info("ramster: Initializing frontswap "
					"selfshrinking driver.\n");
	else
		return -ENODEV;

	schedule_delayed_work(&selfshrink_worker, selfshrink_interval * HZ);

	return 0;
}

subsys_initcall(ramster_selfshrink_init);
#endif
/*
 * zcache initialization
 * NOTE FOR NOW ramster MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
 * NOTHING HAPPENS!
 */
static int ramster_enabled;

static int __init enable_ramster(char *s)
{
	ramster_enabled = 1;
	return 1;
}
__setup("ramster", enable_ramster);
/* allow independent dynamic disabling of cleancache and frontswap */

static int use_cleancache = 1;

static int __init no_cleancache(char *s)
{
	pr_info("INIT no_cleancache called\n");
	use_cleancache = 0;
	return 1;
}

/*
 * FIXME: need to guarantee this gets checked before zcache_init is called
 * What is the correct way to achieve this?
 */
early_param("nocleancache", no_cleancache);
static int use_frontswap = 1;

static int __init no_frontswap(char *s)
{
	pr_info("INIT no_frontswap called\n");
	use_frontswap = 0;
	return 1;
}

__setup("nofrontswap", no_frontswap);
static int __init zcache_init(void)
{
	int ret = 0;

#ifdef CONFIG_SYSFS
	ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
	ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
	if (ret) {
		pr_err("ramster: can't create sysfs\n");
		goto out;
	}
#endif /* CONFIG_SYSFS */
#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
	if (ramster_enabled) {
		int cpu;

		(void)r2net_register_handlers();
		tmem_register_hostops(&zcache_hostops);
		tmem_register_pamops(&zcache_pamops);
		ret = register_cpu_notifier(&zcache_cpu_notifier_block);
		if (ret) {
			pr_err("ramster: can't register cpu notifier\n");
			goto out;
		}
		for_each_online_cpu(cpu) {
			void *pcpu = (void *)(long)cpu;
			zcache_cpu_notifier(&zcache_cpu_notifier_block,
				CPU_UP_PREPARE, pcpu);
		}
	}
	zcache_objnode_cache = kmem_cache_create("zcache_objnode",
				sizeof(struct tmem_objnode), 0, 0, NULL);
	zcache_obj_cache = kmem_cache_create("zcache_obj",
				sizeof(struct tmem_obj), 0, 0, NULL);
	ramster_flnode_cache = kmem_cache_create("ramster_flnode",
				sizeof(struct flushlist_node), 0, 0, NULL);
#endif
#ifdef CONFIG_CLEANCACHE
	pr_info("INIT ramster_enabled=%d use_cleancache=%d\n",
					ramster_enabled, use_cleancache);
	if (ramster_enabled && use_cleancache) {
		struct cleancache_ops old_ops;

		zbud_init();
		register_shrinker(&zcache_shrinker);
		old_ops = zcache_cleancache_register_ops();
		pr_info("ramster: cleancache enabled using kernel "
			"transcendent memory and compression buddies\n");
		if (old_ops.init_fs != NULL)
			pr_warning("ramster: cleancache_ops overridden");
	}
#endif
#ifdef CONFIG_FRONTSWAP
	pr_info("INIT ramster_enabled=%d use_frontswap=%d\n",
					ramster_enabled, use_frontswap);
	if (ramster_enabled && use_frontswap) {
		struct frontswap_ops old_ops;

		zcache_new_client(LOCAL_CLIENT);
		old_ops = zcache_frontswap_register_ops();
		pr_info("ramster: frontswap enabled using kernel "
			"transcendent memory and xvmalloc\n");
		if (old_ops.init != NULL)
			pr_warning("ramster: frontswap_ops overridden");
	}
	if (ramster_enabled && (use_frontswap || use_cleancache))
		ramster_remotify_init();
#endif
out:
	return ret;
}

module_init(zcache_init)