/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
 * "handles" (triples containing a pool id, an object id, and an index) to
 * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
 * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
 * set of functions (pamops).  Each pampd contains some representation of
 * PAGE_SIZE bytes worth of data.  For those familiar with key-value stores,
 * the tmem handle is a three-level hierarchical key, and the value is always
 * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
 * referenced in the datastore by the pampd.  The hierarchy is required
 * to ensure that certain invalidation functions can be performed efficiently
 * (i.e. flush all indexes associated with this object_id, or
 * flush all objects associated with this pool).
 *
 * Tmem must support potentially millions of pages and must be able to insert,
 * find, and delete these pages at a potential frequency of thousands per
 * second concurrently across many CPUs (and, if used with KVM, across many
 * vcpus across many guests).  Tmem is tracked with a hierarchy of data
 * structures, organized by the elements in the handle-tuple: pool_id,
 * object_id, and page index.  One or more "clients" (e.g. guests) each
 * provide one or more tmem_pools.  Each pool contains a hash table of
 * rb_trees of tmem_objs.  Each tmem_obj contains a radix-tree-like tree
 * of pointers, with intermediate nodes called tmem_objnodes.  Each leaf
 * pointer in this tree points to a pampd, which is accessible only through
 * a small set of callbacks registered by the PAM implementation (see
 * tmem_register_pamops).  Tmem only needs memory allocation for objs
 * and objnodes, and this is done via a set of callbacks that must be
 * registered by the tmem host implementation (e.g. see tmem_register_hostops).
 */
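/*
 * Illustrative sketch only (not part of the build): a frontend might
 * address tmem roughly as follows, where frontend_pool, make_oid() and
 * pampd are hypothetical caller-side names; oid identifies the object
 * (e.g. a file) and index the page within it:
 *
 *	struct tmem_oid oid = make_oid(...);
 *	uint32_t index = ...;
 *	tmem_put(frontend_pool, &oid, index, false, pampd);
 *	...
 *	tmem_get(frontend_pool, &oid, index, data, &size, false, 0);
 *
 * The (pool, oid, index) triple is the "handle"; the pampd is whatever
 * the registered PAM implementation chooses to keep for that handle.
 */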
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/delay.h>

#include "tmem.h"
/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09
/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
	tmem_objnode_tree_init();
	tmem_hostops = *m;
}
/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
	tmem_pamops = *m;
}
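/*
 * Registration sketch (hypothetical host module): both ops structures are
 * normally filled in once at init time, before any pool is created.  The
 * field names below are the callbacks actually invoked in this file; the
 * myhost_* functions are placeholders:
 *
 *	static struct tmem_hostops myhost_hostops = {
 *		.obj_alloc = myhost_obj_alloc,
 *		.obj_free = myhost_obj_free,
 *		.objnode_alloc = myhost_objnode_alloc,
 *		.objnode_free = myhost_objnode_free,
 *	};
 *
 *	tmem_register_hostops(&myhost_hostops);
 *	tmem_register_pamops(&myhost_pamops);
 */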
/*
 * Oids are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access and no
 * searches, inserts, or deletions can be performed unless the lock is held.
 * As a result, care must be taken to ensure tmem routines are not called
 * recursively; the vast majority of the time, a recursive call may work
 * but a deadlock will occur a small fraction of the time due to the
 * hashbucket lock.
 *
 * The following routines manage tmem_objs.  In all of these routines,
 * the hashbucket lock is already held.
 */
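/*
 * For reference, the calling convention used throughout the rest of this
 * file is (mirroring tmem_put/tmem_get/tmem_flush_* below):
 *
 *	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 *	spin_lock(&hb->lock);
 *	obj = tmem_obj_find(hb, oidp);
 *	... operate on obj and its pampds ...
 *	spin_unlock(&hb->lock);
 *
 * i.e. the caller selects the hashbucket from the oid and holds that
 * bucket's lock across every tmem_obj_* call.
 */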
/* Search for object==oid in pool, returns object if found. */
static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp,
					struct rb_node **parent,
					struct rb_node ***link)
{
	struct rb_node *_parent = NULL, **rbnode;
	struct tmem_obj *obj = NULL;

	rbnode = &hb->obj_rb_root.rb_node;
	while (*rbnode) {
		BUG_ON(RB_EMPTY_NODE(*rbnode));
		_parent = *rbnode;
		obj = rb_entry(*rbnode, struct tmem_obj,
			       rb_tree_node);
		switch (tmem_oid_compare(oidp, &obj->oid)) {
		case 0: /* equal */
			goto out;
		case -1:
			rbnode = &(*rbnode)->rb_left;
			break;
		case 1:
			rbnode = &(*rbnode)->rb_right;
			break;
		}
	}

	if (parent)
		*parent = _parent;
	if (link)
		*link = rbnode;
	obj = NULL;
out:
	return obj;
}
static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp)
{
	return __tmem_obj_find(hb, oidp, NULL, NULL);
}
static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);
/* Free an object that has no more pampds in it. */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
	struct tmem_pool *pool;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pampd_count > 0);
	pool = obj->pool;
	BUG_ON(pool == NULL);
	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
		tmem_pampd_destroy_all_in_obj(obj, false);
	BUG_ON(obj->objnode_tree_root != NULL);
	BUG_ON((long)obj->objnode_count != 0);
	atomic_dec(&pool->obj_count);
	BUG_ON(atomic_read(&pool->obj_count) < 0);
	INVERT_SENTINEL(obj, OBJ);
	obj->pool = NULL;
	tmem_oid_set_invalid(&obj->oid);
	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}
/*
 * Initialize and insert a tmem_object_root (called only if find failed).
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
					struct tmem_pool *pool,
					struct tmem_oid *oidp)
{
	struct rb_root *root = &hb->obj_rb_root;
	struct rb_node **new = NULL, *parent = NULL;

	BUG_ON(pool == NULL);
	atomic_inc(&pool->obj_count);
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;
	obj->pool = pool;
	obj->oid = *oidp;
	obj->objnode_count = 0;
	obj->pampd_count = 0;
#ifdef CONFIG_RAMSTER
	if (tmem_pamops.new_obj != NULL)
		(*tmem_pamops.new_obj)(obj);
#endif
	SET_SENTINEL(obj, OBJ);

	if (__tmem_obj_find(hb, oidp, &parent, &new))
		BUG();

	rb_link_node(&obj->rb_tree_node, parent, new);
	rb_insert_color(&obj->rb_tree_node, root);
}
/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */
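/*
 * For example (sketch): the host allocates the struct tmem_pool itself and
 * then initializes it here with the desired attribute flags, e.g.
 *
 *	tmem_new_pool(pool, 0);				(ephemeral, private)
 *	tmem_new_pool(pool, TMEM_POOL_PERSIST);		(persistent)
 *
 * is_ephemeral(pool), pool->persistent and pool->shared are derived from
 * these flags (see tmem_new_pool() at the end of this file).
 */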
/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	BUG_ON(pool == NULL);
	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj, true);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}
/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
 * is very specialized and tuned for specific uses and is not particularly
 * suited for use from this code, though some code from the core algorithms has
 * been reused, thus the copyright notices below).  Each tmem_objnode contains
 * a set of pointers which point to either a set of intermediate tmem_objnodes
 * or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */
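/*
 * Worked example (sketch): each level of the tree consumes
 * OBJNODE_TREE_MAP_SHIFT bits of the page index, starting from the top,
 * so for a tree of height 2 the walks below do roughly:
 *
 *	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
 *	root_slot = (index >> shift) & OBJNODE_TREE_MAP_MASK;
 *	leaf_slot = index & OBJNODE_TREE_MAP_MASK;
 *
 * (root_slot/leaf_slot are illustrative names only.)  tmem_objnode_tree_h2max[]
 * records the largest index reachable at each height, which is how the
 * lookup/insert/delete routines decide whether the tree covers a given
 * index or must grow.
 */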
struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;
	int offset;
};

/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
static void tmem_objnode_tree_init(void)
{
	unsigned int ht, tmp;

	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
		if (tmp >= OBJNODE_TREE_INDEX_BITS)
			tmem_objnode_tree_h2max[ht] = ~0UL;
		else
			tmem_objnode_tree_h2max[ht] =
			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
	}
}
static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
	struct tmem_objnode *objnode;

	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
	if (unlikely(objnode == NULL))
		goto out;
	objnode->obj = obj;
	SET_SENTINEL(objnode, OBJNODE);
	memset(&objnode->slots, 0, sizeof(objnode->slots));
	objnode->slots_in_use = 0;
	obj->objnode_count++;
out:
	return objnode;
}
static void tmem_objnode_free(struct tmem_objnode *objnode)
{
	struct tmem_pool *pool;
	int i;

	BUG_ON(objnode == NULL);
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
		BUG_ON(objnode->slots[i] != NULL);
	ASSERT_SENTINEL(objnode, OBJNODE);
	INVERT_SENTINEL(objnode, OBJNODE);
	BUG_ON(objnode->obj == NULL);
	ASSERT_SENTINEL(objnode->obj, OBJ);
	pool = objnode->obj->pool;
	BUG_ON(pool == NULL);
	ASSERT_SENTINEL(pool, POOL);
	objnode->obj->objnode_count--;
	objnode->obj = NULL;
	(*tmem_hostops.objnode_free)(objnode, pool);
}
/*
 * Lookup index in object and return associated pampd (or NULL if not found).
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	unsigned int height, shift;
	struct tmem_objnode **slot = NULL;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
		goto out;
	if (height == 0 && obj->objnode_tree_root) {
		slot = &obj->objnode_tree_root;
		goto out;
	}
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	slot = &obj->objnode_tree_root;
	while (height > 0) {
		if (*slot == NULL)
			goto out;
		slot = (struct tmem_objnode **)
			((*slot)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
out:
	return slot != NULL ? (void **)slot : NULL;
}
static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slot;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	return slot != NULL ? *slot : NULL;
}
#ifdef CONFIG_RAMSTER
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd, bool no_free)
{
	struct tmem_objnode **slot;
	void *ret = NULL;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	if ((slot != NULL) && (*slot != NULL)) {
		void *old_pampd = *(void **)slot;
		*(void **)slot = new_pampd;
		if (!no_free)
			(*tmem_pamops.free)(old_pampd, obj->pool,
						NULL, 0, false);
		ret = new_pampd;
	}
	return ret;
}
#endif
static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
					void *pampd)
{
	int ret = 0;
	struct tmem_objnode *objnode = NULL, *newnode, *slot;
	unsigned int height, shift;
	int offset = 0;

	/* if necessary, extend the tree to be higher */
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
		height = obj->objnode_tree_height + 1;
		if (index > tmem_objnode_tree_h2max[height])
			while (index > tmem_objnode_tree_h2max[height])
				height++;
		if (obj->objnode_tree_root == NULL) {
			obj->objnode_tree_height = height;
			goto insert;
		}
		do {
			newnode = tmem_objnode_alloc(obj);
			if (!newnode) {
				ret = -ENOMEM;
				goto out;
			}
			newnode->slots[0] = obj->objnode_tree_root;
			newnode->slots_in_use = 1;
			obj->objnode_tree_root = newnode;
			obj->objnode_tree_height++;
		} while (height > obj->objnode_tree_height);
	}
insert:
	slot = obj->objnode_tree_root;
	height = obj->objnode_tree_height;
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	while (height > 0) {
		if (slot == NULL) {
			/* add a child objnode. */
			slot = tmem_objnode_alloc(obj);
			if (!slot) {
				ret = -ENOMEM;
				goto out;
			}
			if (objnode) {
				objnode->slots[offset] = slot;
				objnode->slots_in_use++;
			} else
				obj->objnode_tree_root = slot;
		}
		/* go down a level */
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		objnode = slot;
		slot = objnode->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
	BUG_ON(slot != NULL);
	if (objnode) {
		objnode->slots_in_use++;
		objnode->slots[offset] = pampd;
	} else
		obj->objnode_tree_root = pampd;
	obj->pampd_count++;
out:
	return ret;
}
static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
	struct tmem_objnode_tree_path *pathp = path;
	struct tmem_objnode *slot = NULL;
	unsigned int height, shift;
	int offset;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[height])
		goto out;
	slot = obj->objnode_tree_root;
	if (height == 0 && obj->objnode_tree_root) {
		obj->objnode_tree_root = NULL;
		goto out;
	}
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	pathp->objnode = NULL;
	do {
		if (slot == NULL)
			goto out;
		pathp++;
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		pathp->offset = offset;
		pathp->objnode = slot;
		slot = slot->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);
	if (slot == NULL)
		goto out;
	while (pathp->objnode) {
		pathp->objnode->slots[pathp->offset] = NULL;
		pathp->objnode->slots_in_use--;
		if (pathp->objnode->slots_in_use) {
			if (pathp->objnode == obj->objnode_tree_root) {
				while (obj->objnode_tree_height > 0 &&
				  obj->objnode_tree_root->slots_in_use == 1 &&
				  obj->objnode_tree_root->slots[0]) {
					struct tmem_objnode *to_free =
						obj->objnode_tree_root;

					obj->objnode_tree_root =
							to_free->slots[0];
					obj->objnode_tree_height--;
					to_free->slots[0] = NULL;
					to_free->slots_in_use = 0;
					tmem_objnode_free(to_free);
				}
			}
			goto out;
		}
		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
		pathp--;
	}
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;

out:
	if (slot != NULL)
		obj->pampd_count--;
	BUG_ON(obj->pampd_count < 0);
	return slot;
}
/* Recursively walk the objnode_tree destroying pampds and objnodes. */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
					struct tmem_objnode *objnode,
					unsigned int ht)
{
	int i;

	if (ht == 0)
		return;
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
		if (objnode->slots[i]) {
			if (ht == 1) {
				obj->pampd_count--;
				(*tmem_pamops.free)(objnode->slots[i],
						obj->pool, NULL, 0, true);
				objnode->slots[i] = NULL;
				continue;
			}
			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
			tmem_objnode_free(objnode->slots[i]);
			objnode->slots[i] = NULL;
		}
	}
}
static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
						bool pool_destroy)
{
	if (obj->objnode_tree_root == NULL)
		return;
	if (obj->objnode_tree_height == 0) {
		obj->pampd_count--;
		(*tmem_pamops.free)(obj->objnode_tree_root,
					obj->pool, NULL, 0, true);
	} else {
		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
					obj->objnode_tree_height);
		tmem_objnode_free(obj->objnode_tree_root);
		obj->objnode_tree_height = 0;
	}
	obj->objnode_tree_root = NULL;
#ifdef CONFIG_RAMSTER
	if (tmem_pamops.free_obj != NULL)
		(*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
#endif
}
/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */
552 * "Put" a page, e.g. associate the passed pampd with the passed handle.
553 * Tmem_put is complicated by a corner case: What if a page with matching
554 * handle already exists in tmem? To guarantee coherency, one of two
555 * actions is necessary: Either the data for the page must be overwritten,
556 * or the page must be "flushed" so that the data is not accessible to a
557 * subsequent "get". Since these "duplicate puts" are relatively rare,
558 * this implementation always flushes for simplicity.
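/*
 * Usage sketch (hypothetical caller): the pampd is created by the host from
 * the page data before tmem_put() is called and is only linked into the
 * handle hierarchy here, e.g.
 *
 *	pampd = myhost_pampd_create(data, pool, &oid, index);
 *	if (pampd != NULL)
 *		ret = tmem_put(pool, &oid, index, false, pampd);
 *
 * where myhost_pampd_create is a placeholder for the host/PAM side.  If a
 * pampd already exists for the handle, the old one is deleted and freed
 * below (the "always flush" policy described above) before the new pampd
 * is added.
 */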
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		bool raw, void *pampd_to_use)
{
	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
	void *pampd = NULL, *pampd_del = NULL;
	int ret = -ENOMEM;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = objfound = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		pampd = tmem_pampd_lookup_in_obj(objfound, index);
		if (pampd != NULL) {
			/* if found, is a dup put, flush the old one */
			pampd_del = tmem_pampd_delete_from_obj(obj, index);
			BUG_ON(pampd_del != pampd);
			(*tmem_pamops.free)(pampd, pool, oidp, index, true);
			if (obj->pampd_count == 0) {
				objnew = obj;
				objfound = NULL;
			}
			pampd = NULL;
		}
	} else {
		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
		if (unlikely(obj == NULL)) {
			ret = -ENOMEM;
			goto out;
		}
		tmem_obj_init(obj, hb, pool, oidp);
	}
	BUG_ON(obj == NULL);
	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
	pampd = pampd_to_use;
	BUG_ON(pampd_to_use == NULL);
	ret = tmem_pampd_add_to_obj(obj, index, pampd);
	if (unlikely(ret == -ENOMEM))
		/* may have partially built objnode tree ("stump") */
		goto delete_and_free;
	(*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
	goto out;

delete_and_free:
	(void)tmem_pampd_delete_from_obj(obj, index);
	if (pampd)
		(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
	if (objnew) {
		tmem_obj_free(objnew, hb);
		(*tmem_hostops.obj_free)(objnew, pool);
	}
out:
	spin_unlock(&hb->lock);
	return ret;
}
#ifdef CONFIG_RAMSTER
/*
 * For ramster only:  The following routines provide a two-step sequence
 * to allow the caller to replace a pampd in the tmem data structures with
 * another pampd.  Here, we lookup the passed handle and, if found, return the
 * associated pampd and object, leaving the hashbucket locked and returning
 * a reference to it.  The caller is expected to immediately call the
 * matching tmem_localify_finish routine which handles the replacement
 * and unlocks the hashbucket.
 */
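/*
 * Sketch of the expected calling sequence (caller-side variables are
 * hypothetical):
 *
 *	void *saved_hb;
 *	struct tmem_obj *obj;
 *	void *pampd = tmem_localify_get_pampd(pool, &oid, index,
 *						&obj, &saved_hb);
 *	... build the local replacement pampd ...
 *	tmem_localify_finish(obj, index, new_pampd, saved_hb, delete);
 *
 * The hashbucket spinlock is held from the first call until the second,
 * so the caller must call tmem_localify_finish promptly and must not
 * sleep in between.
 */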
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, struct tmem_obj **ret_obj,
				void **saved_hb)
{
	struct tmem_hashbucket *hb;
	struct tmem_obj *obj = NULL;
	void *pampd = NULL;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (likely(obj != NULL))
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	*ret_obj = obj;
	*saved_hb = (void *)hb;
	/* note, hashbucket remains locked */
	return pampd;
}

void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
			  void *pampd, void *saved_hb, bool delete)
{
	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;

	BUG_ON(!spin_is_locked(&hb->lock));
	if (pampd != NULL) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
		(*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
	} else if (delete) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_delete_from_obj(obj, index);
	}
	spin_unlock(&hb->lock);
}
/*
 * For ramster only.  Helper function to support asynchronous tmem_get.
 */
static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
				struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, bool free, char *data)
{
	void *old_pampd = *ppampd, *new_pampd = NULL;
	bool intransit = false;
	int ret = 0;

	if (!is_ephemeral(pool))
		new_pampd = (*tmem_pamops.repatriate_preload)(
				old_pampd, pool, oidp, index, &intransit);
	if (intransit)
		ret = -EAGAIN;
	else if (new_pampd != NULL)
		*ppampd = new_pampd;
	/* must release the hb->lock else repatriate can't sleep */
	spin_unlock(&hb->lock);
	if (!intransit)
		ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
						oidp, index, free, data);
	if (ret == -EAGAIN) {
		/* rare I think, but should cond_resched()??? */
		usleep_range(10, 1000);
	} else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
		ret = -1;
	} else if (ret != 0 && ret != -ENOENT) {
		ret = -1;
	}
	/* note hb->lock has now been unlocked */
	return ret;
}
/*
 * For ramster only.  If a page in tmem matches the handle, replace the
 * page so that any subsequent "get" gets the new page.  Returns 0 if
 * there was a page to replace, else returns -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, void *new_pampd)
{
	struct tmem_obj *obj;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
	/* if we bug here, pamops wasn't properly set up for ramster */
	BUG_ON(tmem_pamops.replace_in_obj == NULL);
	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
	spin_unlock(&hb->lock);
	return ret;
}
#endif
725 * "Get" a page, e.g. if a pampd can be found matching the passed handle,
726 * use a pamops callback to recreated the page from the pampd with the
727 * matching handle. By tmem definition, when a "get" is successful on
728 * an ephemeral page, the page is "flushed", and when a "get" is successful
729 * on a persistent page, the page is retained in tmem. Note that to preserve
730 * coherency, "get" can never be skipped if tmem contains the data.
731 * That is, if a get is done with a certain handle and fails, any
732 * subsequent "get" must also fail (unless of course there is a
733 * "put" done with the same handle).
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t *sizep, bool raw, int get_and_free)
{
	struct tmem_obj *obj;
	void *pampd = NULL;
	bool ephemeral = is_ephemeral(pool);
	int ret = -1;
	struct tmem_hashbucket *hb;
	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
	bool lock_held = false;
	void **ppampd;

	do {
		hb = &pool->hashbucket[tmem_oid_hash(oidp)];
		spin_lock(&hb->lock);
		lock_held = true;
		obj = tmem_obj_find(hb, oidp);
		if (obj == NULL)
			goto out;
		ppampd = __tmem_pampd_lookup_in_obj(obj, index);
		if (ppampd == NULL)
			goto out;
#ifdef CONFIG_RAMSTER
		if ((tmem_pamops.is_remote != NULL) &&
		     tmem_pamops.is_remote(*ppampd)) {
			ret = tmem_repatriate(ppampd, hb, pool, oidp,
						index, free, data);
			/* tmem_repatriate releases hb->lock */
			lock_held = false;
			*sizep = PAGE_SIZE;
			if (ret != -EAGAIN)
				goto out;
		}
#endif
	} while (ret == -EAGAIN);
	if (free)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	else
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	if (pampd == NULL)
		goto out;
	if (free) {
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
			obj = NULL;
		}
	}
	if (free)
		ret = (*tmem_pamops.get_data_and_free)(
				data, sizep, raw, pampd, pool, oidp, index);
	else
		ret = (*tmem_pamops.get_data)(
				data, sizep, raw, pampd, pool, oidp, index);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	if (lock_held)
		spin_unlock(&hb->lock);
	return ret;
}
/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_obj *obj;
	void *pampd;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	pampd = tmem_pampd_delete_from_obj(obj, index);
	if (pampd == NULL)
		goto out;
	(*tmem_pamops.free)(pampd, pool, oidp, index, true);
	if (obj->pampd_count == 0) {
		tmem_obj_free(obj, hb);
		(*tmem_hostops.obj_free)(obj, pool);
	}
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}
832 * "Flush" all pages in tmem matching this oid.
834 int tmem_flush_object(struct tmem_pool
*pool
, struct tmem_oid
*oidp
)
836 struct tmem_obj
*obj
;
837 struct tmem_hashbucket
*hb
;
840 hb
= &pool
->hashbucket
[tmem_oid_hash(oidp
)];
841 spin_lock(&hb
->lock
);
842 obj
= tmem_obj_find(hb
, oidp
);
845 tmem_pampd_destroy_all_in_obj(obj
, false);
846 tmem_obj_free(obj
, hb
);
847 (*tmem_hostops
.obj_free
)(obj
, pool
);
851 spin_unlock(&hb
->lock
);
856 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
857 * all subsequent access to this tmem_pool.
859 int tmem_destroy_pool(struct tmem_pool
*pool
)
865 tmem_pool_flush(pool
, 1);
871 static LIST_HEAD(tmem_global_pool_list
);
874 * Create a new tmem_pool with the provided flag and return
875 * a pool id provided by the tmem host implementation.
877 void tmem_new_pool(struct tmem_pool
*pool
, uint32_t flags
)
879 int persistent
= flags
& TMEM_POOL_PERSIST
;
880 int shared
= flags
& TMEM_POOL_SHARED
;
881 struct tmem_hashbucket
*hb
= &pool
->hashbucket
[0];
884 for (i
= 0; i
< TMEM_HASH_BUCKETS
; i
++, hb
++) {
885 hb
->obj_rb_root
= RB_ROOT
;
886 spin_lock_init(&hb
->lock
);
888 INIT_LIST_HEAD(&pool
->pool_list
);
889 atomic_set(&pool
->obj_count
, 0);
890 SET_SENTINEL(pool
, POOL
);
891 list_add_tail(&pool
->pool_list
, &tmem_global_pool_list
);
892 pool
->persistent
= persistent
;
893 pool
->shared
= shared
;