net: move procfs code to net/core/net-procfs.c
[linux/fpc-iii.git] / drivers / staging / ramster / tmem.c
bloba2b7e03b606279b76fc7ea5f1d39ddcafe967433
1 /*
2 * In-kernel transcendent memory (generic implementation)
4 * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
6 * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
7 * "handles" (triples containing a pool id, and object id, and an index), to
8 * pages in a page-accessible memory (PAM). Tmem references the PAM pages via
9 * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
10 * set of functions (pamops). Each pampd contains some representation of
11 * PAGE_SIZE bytes worth of data. For those familiar with key-value stores,
12 * the tmem handle is a three-level hierarchical key, and the value is always
13 * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
14 * referenced in the datastore by the pampd. The hierarchy is required
15 * to ensure that certain invalidation functions can be performed efficiently
16 * (i.e. flush all indexes associated with this object_id, or
17 * flush all objects associated with this pool).
19 * Tmem must support potentially millions of pages and must be able to insert,
20 * find, and delete these pages at a potential frequency of thousands per
21 * second concurrently across many CPUs, (and, if used with KVM, across many
22 * vcpus across many guests). Tmem is tracked with a hierarchy of data
23 * structures, organized by the elements in the handle-tuple: pool_id,
24 * object_id, and page index. One or more "clients" (e.g. guests) each
25 * provide one or more tmem_pools. Each pool contains a hash table of
26 * rb_trees of tmem_objs. Each tmem_obj contains a radix-tree-like tree
27 * of pointers, with intermediate nodes called tmem_objnodes. Each leaf
28 * pointer in this tree points to a pampd, which is accessible only through
29 * a small set of callbacks registered by the PAM implementation (see
30 * tmem_register_pamops). Tmem only needs memory allocation for objs
31 * and objnodes and this is done via a set of callbacks that must be
32 * registered by the tmem host implementation (e.g. see tmem_register_hostops).
35 #include <linux/list.h>
36 #include <linux/spinlock.h>
37 #include <linux/atomic.h>
38 #ifdef CONFIG_RAMSTER
39 #include <linux/delay.h>
40 #endif
42 #include "tmem.h"
/*
 * Magic values for the data structure sentinels used for debugging
 * (see tmem.h): one distinct tag each for pools, objs and objnodes.
 */
#define POOL_SENTINEL		0x87658765
#define OBJ_SENTINEL		0x12345678
#define OBJNODE_SENTINEL	0xfedcba09
50 * A tmem host implementation must use this function to register callbacks
51 * for memory allocation.
53 static struct tmem_hostops tmem_hostops;
55 static void tmem_objnode_tree_init(void);
57 void tmem_register_hostops(struct tmem_hostops *m)
59 tmem_objnode_tree_init();
60 tmem_hostops = *m;
64 * A tmem host implementation must use this function to register
65 * callbacks for a page-accessible memory (PAM) implementation.
67 static struct tmem_pamops tmem_pamops;
69 void tmem_register_pamops(struct tmem_pamops *m)
71 tmem_pamops = *m;
75 * Oid's are potentially very sparse and tmem_objs may have an indeterminately
76 * short life, being added and deleted at a relatively high frequency.
77 * So an rb_tree is an ideal data structure to manage tmem_objs. But because
78 * of the potentially huge number of tmem_objs, each pool manages a hashtable
79 * of rb_trees to reduce search, insert, delete, and rebalancing time.
80 * Each hashbucket also has a lock to manage concurrent access and no
81 * searches, inserts, or deletions can be performed unless the lock is held.
82 * As a result, care must be taken to ensure tmem routines are not called
83 * recursively; the vast majority of the time, a recursive call may work
84 * but a deadlock will occur a small fraction of the time due to the
85 * hashbucket lock.
87 * The following routines manage tmem_objs. In all of these routines,
88 * the hashbucket lock is already held.
91 /* Search for object==oid in pool, returns object if found. */
92 static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
93 struct tmem_oid *oidp,
94 struct rb_node **parent,
95 struct rb_node ***link)
97 struct rb_node *_parent = NULL, **rbnode;
98 struct tmem_obj *obj = NULL;
100 rbnode = &hb->obj_rb_root.rb_node;
101 while (*rbnode) {
102 BUG_ON(RB_EMPTY_NODE(*rbnode));
103 _parent = *rbnode;
104 obj = rb_entry(*rbnode, struct tmem_obj,
105 rb_tree_node);
106 switch (tmem_oid_compare(oidp, &obj->oid)) {
107 case 0: /* equal */
108 goto out;
109 case -1:
110 rbnode = &(*rbnode)->rb_left;
111 break;
112 case 1:
113 rbnode = &(*rbnode)->rb_right;
114 break;
118 if (parent)
119 *parent = _parent;
120 if (link)
121 *link = rbnode;
122 obj = NULL;
123 out:
124 return obj;
127 static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
128 struct tmem_oid *oidp)
130 return __tmem_obj_find(hb, oidp, NULL, NULL);
133 static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);
135 /* Free an object that has no more pampds in it. */
136 static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
138 struct tmem_pool *pool;
140 BUG_ON(obj == NULL);
141 ASSERT_SENTINEL(obj, OBJ);
142 BUG_ON(obj->pampd_count > 0);
143 pool = obj->pool;
144 BUG_ON(pool == NULL);
145 if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
146 tmem_pampd_destroy_all_in_obj(obj, false);
147 BUG_ON(obj->objnode_tree_root != NULL);
148 BUG_ON((long)obj->objnode_count != 0);
149 atomic_dec(&pool->obj_count);
150 BUG_ON(atomic_read(&pool->obj_count) < 0);
151 INVERT_SENTINEL(obj, OBJ);
152 obj->pool = NULL;
153 tmem_oid_set_invalid(&obj->oid);
154 rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
158 * Initialize, and insert an tmem_object_root (called only if find failed).
160 static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
161 struct tmem_pool *pool,
162 struct tmem_oid *oidp)
164 struct rb_root *root = &hb->obj_rb_root;
165 struct rb_node **new = NULL, *parent = NULL;
167 BUG_ON(pool == NULL);
168 atomic_inc(&pool->obj_count);
169 obj->objnode_tree_height = 0;
170 obj->objnode_tree_root = NULL;
171 obj->pool = pool;
172 obj->oid = *oidp;
173 obj->objnode_count = 0;
174 obj->pampd_count = 0;
175 #ifdef CONFIG_RAMSTER
176 if (tmem_pamops.new_obj != NULL)
177 (*tmem_pamops.new_obj)(obj);
178 #endif
179 SET_SENTINEL(obj, OBJ);
181 if (__tmem_obj_find(hb, oidp, &parent, &new))
182 BUG();
184 rb_link_node(&obj->rb_tree_node, parent, new);
185 rb_insert_color(&obj->rb_tree_node, root);
189 * Tmem is managed as a set of tmem_pools with certain attributes, such as
190 * "ephemeral" vs "persistent". These attributes apply to all tmem_objs
191 * and all pampds that belong to a tmem_pool. A tmem_pool is created
192 * or deleted relatively rarely (for example, when a filesystem is
193 * mounted or unmounted).
196 /* flush all data from a pool and, optionally, free it */
197 static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
199 struct rb_node *rbnode;
200 struct tmem_obj *obj;
201 struct tmem_hashbucket *hb = &pool->hashbucket[0];
202 int i;
204 BUG_ON(pool == NULL);
205 for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
206 spin_lock(&hb->lock);
207 rbnode = rb_first(&hb->obj_rb_root);
208 while (rbnode != NULL) {
209 obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
210 rbnode = rb_next(rbnode);
211 tmem_pampd_destroy_all_in_obj(obj, true);
212 tmem_obj_free(obj, hb);
213 (*tmem_hostops.obj_free)(obj, pool);
215 spin_unlock(&hb->lock);
217 if (destroy)
218 list_del(&pool->pool_list);
222 * A tmem_obj contains a radix-tree-like tree in which the intermediate
223 * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation
224 * is very specialized and tuned for specific uses and is not particularly
225 * suited for use from this code, though some code from the core algorithms has
226 * been reused, thus the copyright notices below). Each tmem_objnode contains
227 * a set of pointers which point to either a set of intermediate tmem_objnodes
228 * or a set of pampds.
230 * Portions Copyright (C) 2001 Momchil Velikov
231 * Portions Copyright (C) 2001 Christoph Hellwig
232 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
/*
 * One step of a root-to-leaf walk through an objnode tree: the objnode
 * visited and the slot offset taken inside it.
 */
struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;	/* node visited at this level */
	int offset;			/* slot followed within that node */
};
240 /* objnode height_to_maxindex translation */
241 static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
243 static void tmem_objnode_tree_init(void)
245 unsigned int ht, tmp;
247 for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
248 tmp = ht * OBJNODE_TREE_MAP_SHIFT;
249 if (tmp >= OBJNODE_TREE_INDEX_BITS)
250 tmem_objnode_tree_h2max[ht] = ~0UL;
251 else
252 tmem_objnode_tree_h2max[ht] =
253 (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
257 static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
259 struct tmem_objnode *objnode;
261 ASSERT_SENTINEL(obj, OBJ);
262 BUG_ON(obj->pool == NULL);
263 ASSERT_SENTINEL(obj->pool, POOL);
264 objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
265 if (unlikely(objnode == NULL))
266 goto out;
267 objnode->obj = obj;
268 SET_SENTINEL(objnode, OBJNODE);
269 memset(&objnode->slots, 0, sizeof(objnode->slots));
270 objnode->slots_in_use = 0;
271 obj->objnode_count++;
272 out:
273 return objnode;
276 static void tmem_objnode_free(struct tmem_objnode *objnode)
278 struct tmem_pool *pool;
279 int i;
281 BUG_ON(objnode == NULL);
282 for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
283 BUG_ON(objnode->slots[i] != NULL);
284 ASSERT_SENTINEL(objnode, OBJNODE);
285 INVERT_SENTINEL(objnode, OBJNODE);
286 BUG_ON(objnode->obj == NULL);
287 ASSERT_SENTINEL(objnode->obj, OBJ);
288 pool = objnode->obj->pool;
289 BUG_ON(pool == NULL);
290 ASSERT_SENTINEL(pool, POOL);
291 objnode->obj->objnode_count--;
292 objnode->obj = NULL;
293 (*tmem_hostops.objnode_free)(objnode, pool);
297 * Lookup index in object and return associated pampd (or NULL if not found).
299 static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
301 unsigned int height, shift;
302 struct tmem_objnode **slot = NULL;
304 BUG_ON(obj == NULL);
305 ASSERT_SENTINEL(obj, OBJ);
306 BUG_ON(obj->pool == NULL);
307 ASSERT_SENTINEL(obj->pool, POOL);
309 height = obj->objnode_tree_height;
310 if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
311 goto out;
312 if (height == 0 && obj->objnode_tree_root) {
313 slot = &obj->objnode_tree_root;
314 goto out;
316 shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
317 slot = &obj->objnode_tree_root;
318 while (height > 0) {
319 if (*slot == NULL)
320 goto out;
321 slot = (struct tmem_objnode **)
322 ((*slot)->slots +
323 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
324 shift -= OBJNODE_TREE_MAP_SHIFT;
325 height--;
327 out:
328 return slot != NULL ? (void **)slot : NULL;
331 static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
333 struct tmem_objnode **slot;
335 slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
336 return slot != NULL ? *slot : NULL;
#ifdef CONFIG_RAMSTER
/*
 * Overwrite the pampd stored for @index in @obj with @new_pampd.
 *
 * Nothing happens, and NULL is returned, unless a non-empty slot exists
 * for @index.  Otherwise the old pampd is released through
 * tmem_pamops.free (skipped when @no_free is true) and @new_pampd is
 * returned.
 */
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd, bool no_free)
{
	void **slotp = __tmem_pampd_lookup_in_obj(obj, index);
	void *displaced;

	if (slotp == NULL || *slotp == NULL)
		return NULL;

	displaced = *slotp;
	*slotp = new_pampd;
	if (!no_free)
		(*tmem_pamops.free)(displaced, obj->pool, NULL, 0, false);
	return new_pampd;
}
#endif
359 static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
360 void *pampd)
362 int ret = 0;
363 struct tmem_objnode *objnode = NULL, *newnode, *slot;
364 unsigned int height, shift;
365 int offset = 0;
367 /* if necessary, extend the tree to be higher */
368 if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
369 height = obj->objnode_tree_height + 1;
370 if (index > tmem_objnode_tree_h2max[height])
371 while (index > tmem_objnode_tree_h2max[height])
372 height++;
373 if (obj->objnode_tree_root == NULL) {
374 obj->objnode_tree_height = height;
375 goto insert;
377 do {
378 newnode = tmem_objnode_alloc(obj);
379 if (!newnode) {
380 ret = -ENOMEM;
381 goto out;
383 newnode->slots[0] = obj->objnode_tree_root;
384 newnode->slots_in_use = 1;
385 obj->objnode_tree_root = newnode;
386 obj->objnode_tree_height++;
387 } while (height > obj->objnode_tree_height);
389 insert:
390 slot = obj->objnode_tree_root;
391 height = obj->objnode_tree_height;
392 shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
393 while (height > 0) {
394 if (slot == NULL) {
395 /* add a child objnode. */
396 slot = tmem_objnode_alloc(obj);
397 if (!slot) {
398 ret = -ENOMEM;
399 goto out;
401 if (objnode) {
403 objnode->slots[offset] = slot;
404 objnode->slots_in_use++;
405 } else
406 obj->objnode_tree_root = slot;
408 /* go down a level */
409 offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
410 objnode = slot;
411 slot = objnode->slots[offset];
412 shift -= OBJNODE_TREE_MAP_SHIFT;
413 height--;
415 BUG_ON(slot != NULL);
416 if (objnode) {
417 objnode->slots_in_use++;
418 objnode->slots[offset] = pampd;
419 } else
420 obj->objnode_tree_root = pampd;
421 obj->pampd_count++;
422 out:
423 return ret;
426 static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
428 struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
429 struct tmem_objnode_tree_path *pathp = path;
430 struct tmem_objnode *slot = NULL;
431 unsigned int height, shift;
432 int offset;
434 BUG_ON(obj == NULL);
435 ASSERT_SENTINEL(obj, OBJ);
436 BUG_ON(obj->pool == NULL);
437 ASSERT_SENTINEL(obj->pool, POOL);
438 height = obj->objnode_tree_height;
439 if (index > tmem_objnode_tree_h2max[height])
440 goto out;
441 slot = obj->objnode_tree_root;
442 if (height == 0 && obj->objnode_tree_root) {
443 obj->objnode_tree_root = NULL;
444 goto out;
446 shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
447 pathp->objnode = NULL;
448 do {
449 if (slot == NULL)
450 goto out;
451 pathp++;
452 offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
453 pathp->offset = offset;
454 pathp->objnode = slot;
455 slot = slot->slots[offset];
456 shift -= OBJNODE_TREE_MAP_SHIFT;
457 height--;
458 } while (height > 0);
459 if (slot == NULL)
460 goto out;
461 while (pathp->objnode) {
462 pathp->objnode->slots[pathp->offset] = NULL;
463 pathp->objnode->slots_in_use--;
464 if (pathp->objnode->slots_in_use) {
465 if (pathp->objnode == obj->objnode_tree_root) {
466 while (obj->objnode_tree_height > 0 &&
467 obj->objnode_tree_root->slots_in_use == 1 &&
468 obj->objnode_tree_root->slots[0]) {
469 struct tmem_objnode *to_free =
470 obj->objnode_tree_root;
472 obj->objnode_tree_root =
473 to_free->slots[0];
474 obj->objnode_tree_height--;
475 to_free->slots[0] = NULL;
476 to_free->slots_in_use = 0;
477 tmem_objnode_free(to_free);
480 goto out;
482 tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
483 pathp--;
485 obj->objnode_tree_height = 0;
486 obj->objnode_tree_root = NULL;
488 out:
489 if (slot != NULL)
490 obj->pampd_count--;
491 BUG_ON(obj->pampd_count < 0);
492 return slot;
495 /* Recursively walk the objnode_tree destroying pampds and objnodes. */
496 static void tmem_objnode_node_destroy(struct tmem_obj *obj,
497 struct tmem_objnode *objnode,
498 unsigned int ht)
500 int i;
502 if (ht == 0)
503 return;
504 for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
505 if (objnode->slots[i]) {
506 if (ht == 1) {
507 obj->pampd_count--;
508 (*tmem_pamops.free)(objnode->slots[i],
509 obj->pool, NULL, 0, true);
510 objnode->slots[i] = NULL;
511 continue;
513 tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
514 tmem_objnode_free(objnode->slots[i]);
515 objnode->slots[i] = NULL;
520 static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
521 bool pool_destroy)
523 if (obj->objnode_tree_root == NULL)
524 return;
525 if (obj->objnode_tree_height == 0) {
526 obj->pampd_count--;
527 (*tmem_pamops.free)(obj->objnode_tree_root,
528 obj->pool, NULL, 0, true);
529 } else {
530 tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
531 obj->objnode_tree_height);
532 tmem_objnode_free(obj->objnode_tree_root);
533 obj->objnode_tree_height = 0;
535 obj->objnode_tree_root = NULL;
536 #ifdef CONFIG_RAMSTER
537 if (tmem_pamops.free_obj != NULL)
538 (*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
539 #endif
543 * Tmem is operated on by a set of well-defined actions:
544 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
545 * (The tmem ABI allows for subpages and exchanges but these operations
546 * are not included in this implementation.)
548 * These "tmem core" operations are implemented in the following functions.
552 * "Put" a page, e.g. associate the passed pampd with the passed handle.
553 * Tmem_put is complicated by a corner case: What if a page with matching
554 * handle already exists in tmem? To guarantee coherency, one of two
555 * actions is necessary: Either the data for the page must be overwritten,
556 * or the page must be "flushed" so that the data is not accessible to a
557 * subsequent "get". Since these "duplicate puts" are relatively rare,
558 * this implementation always flushes for simplicity.
560 int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
561 bool raw, void *pampd_to_use)
563 struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
564 void *pampd = NULL, *pampd_del = NULL;
565 int ret = -ENOMEM;
566 struct tmem_hashbucket *hb;
568 hb = &pool->hashbucket[tmem_oid_hash(oidp)];
569 spin_lock(&hb->lock);
570 obj = objfound = tmem_obj_find(hb, oidp);
571 if (obj != NULL) {
572 pampd = tmem_pampd_lookup_in_obj(objfound, index);
573 if (pampd != NULL) {
574 /* if found, is a dup put, flush the old one */
575 pampd_del = tmem_pampd_delete_from_obj(obj, index);
576 BUG_ON(pampd_del != pampd);
577 (*tmem_pamops.free)(pampd, pool, oidp, index, true);
578 if (obj->pampd_count == 0) {
579 objnew = obj;
580 objfound = NULL;
582 pampd = NULL;
584 } else {
585 obj = objnew = (*tmem_hostops.obj_alloc)(pool);
586 if (unlikely(obj == NULL)) {
587 ret = -ENOMEM;
588 goto out;
590 tmem_obj_init(obj, hb, pool, oidp);
592 BUG_ON(obj == NULL);
593 BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
594 pampd = pampd_to_use;
595 BUG_ON(pampd_to_use == NULL);
596 ret = tmem_pampd_add_to_obj(obj, index, pampd);
597 if (unlikely(ret == -ENOMEM))
598 /* may have partially built objnode tree ("stump") */
599 goto delete_and_free;
600 (*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
601 goto out;
603 delete_and_free:
604 (void)tmem_pampd_delete_from_obj(obj, index);
605 if (pampd)
606 (*tmem_pamops.free)(pampd, pool, NULL, 0, true);
607 if (objnew) {
608 tmem_obj_free(objnew, hb);
609 (*tmem_hostops.obj_free)(objnew, pool);
611 out:
612 spin_unlock(&hb->lock);
613 return ret;
616 #ifdef CONFIG_RAMSTER
618 * For ramster only: The following routines provide a two-step sequence
619 * to allow the caller to replace a pampd in the tmem data structures with
620 * another pampd. Here, we lookup the passed handle and, if found, return the
621 * associated pampd and object, leaving the hashbucket locked and returning
622 * a reference to it. The caller is expected to immediately call the
623 * matching tmem_localify_finish routine which will handles the replacement
624 * and unlocks the hashbucket.
626 void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
627 uint32_t index, struct tmem_obj **ret_obj,
628 void **saved_hb)
630 struct tmem_hashbucket *hb;
631 struct tmem_obj *obj = NULL;
632 void *pampd = NULL;
634 hb = &pool->hashbucket[tmem_oid_hash(oidp)];
635 spin_lock(&hb->lock);
636 obj = tmem_obj_find(hb, oidp);
637 if (likely(obj != NULL))
638 pampd = tmem_pampd_lookup_in_obj(obj, index);
639 *ret_obj = obj;
640 *saved_hb = (void *)hb;
641 /* note, hashbucket remains locked */
642 return pampd;
645 void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
646 void *pampd, void *saved_hb, bool delete)
648 struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;
650 BUG_ON(!spin_is_locked(&hb->lock));
651 if (pampd != NULL) {
652 BUG_ON(obj == NULL);
653 (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
654 (*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
655 } else if (delete) {
656 BUG_ON(obj == NULL);
657 (void)tmem_pampd_delete_from_obj(obj, index);
659 spin_unlock(&hb->lock);
663 * For ramster only. Helper function to support asynchronous tmem_get.
665 static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
666 struct tmem_pool *pool, struct tmem_oid *oidp,
667 uint32_t index, bool free, char *data)
669 void *old_pampd = *ppampd, *new_pampd = NULL;
670 bool intransit = false;
671 int ret = 0;
673 if (!is_ephemeral(pool))
674 new_pampd = (*tmem_pamops.repatriate_preload)(
675 old_pampd, pool, oidp, index, &intransit);
676 if (intransit)
677 ret = -EAGAIN;
678 else if (new_pampd != NULL)
679 *ppampd = new_pampd;
680 /* must release the hb->lock else repatriate can't sleep */
681 spin_unlock(&hb->lock);
682 if (!intransit)
683 ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
684 oidp, index, free, data);
685 if (ret == -EAGAIN) {
686 /* rare I think, but should cond_resched()??? */
687 usleep_range(10, 1000);
688 } else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
689 ret = -1;
690 } else if (ret != 0 && ret != -ENOENT) {
691 ret = -1;
693 /* note hb->lock has now been unlocked */
694 return ret;
698 * For ramster only. If a page in tmem matches the handle, replace the
699 * page so that any subsequent "get" gets the new page. Returns 0 if
700 * there was a page to replace, else returns -1.
702 int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
703 uint32_t index, void *new_pampd)
705 struct tmem_obj *obj;
706 int ret = -1;
707 struct tmem_hashbucket *hb;
709 hb = &pool->hashbucket[tmem_oid_hash(oidp)];
710 spin_lock(&hb->lock);
711 obj = tmem_obj_find(hb, oidp);
712 if (obj == NULL)
713 goto out;
714 new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
715 /* if we bug here, pamops wasn't properly set up for ramster */
716 BUG_ON(tmem_pamops.replace_in_obj == NULL);
717 ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
718 out:
719 spin_unlock(&hb->lock);
720 return ret;
722 #endif
725 * "Get" a page, e.g. if a pampd can be found matching the passed handle,
726 * use a pamops callback to recreated the page from the pampd with the
727 * matching handle. By tmem definition, when a "get" is successful on
728 * an ephemeral page, the page is "flushed", and when a "get" is successful
729 * on a persistent page, the page is retained in tmem. Note that to preserve
730 * coherency, "get" can never be skipped if tmem contains the data.
731 * That is, if a get is done with a certain handle and fails, any
732 * subsequent "get" must also fail (unless of course there is a
733 * "put" done with the same handle).
735 int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
736 char *data, size_t *sizep, bool raw, int get_and_free)
738 struct tmem_obj *obj;
739 void *pampd = NULL;
740 bool ephemeral = is_ephemeral(pool);
741 int ret = -1;
742 struct tmem_hashbucket *hb;
743 bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
744 bool lock_held = false;
745 void **ppampd;
747 do {
748 hb = &pool->hashbucket[tmem_oid_hash(oidp)];
749 spin_lock(&hb->lock);
750 lock_held = true;
751 obj = tmem_obj_find(hb, oidp);
752 if (obj == NULL)
753 goto out;
754 ppampd = __tmem_pampd_lookup_in_obj(obj, index);
755 if (ppampd == NULL)
756 goto out;
757 #ifdef CONFIG_RAMSTER
758 if ((tmem_pamops.is_remote != NULL) &&
759 tmem_pamops.is_remote(*ppampd)) {
760 ret = tmem_repatriate(ppampd, hb, pool, oidp,
761 index, free, data);
762 /* tmem_repatriate releases hb->lock */
763 lock_held = false;
764 *sizep = PAGE_SIZE;
765 if (ret != -EAGAIN)
766 goto out;
768 #endif
769 } while (ret == -EAGAIN);
770 if (free)
771 pampd = tmem_pampd_delete_from_obj(obj, index);
772 else
773 pampd = tmem_pampd_lookup_in_obj(obj, index);
774 if (pampd == NULL)
775 goto out;
776 if (free) {
777 if (obj->pampd_count == 0) {
778 tmem_obj_free(obj, hb);
779 (*tmem_hostops.obj_free)(obj, pool);
780 obj = NULL;
783 if (free)
784 ret = (*tmem_pamops.get_data_and_free)(
785 data, sizep, raw, pampd, pool, oidp, index);
786 else
787 ret = (*tmem_pamops.get_data)(
788 data, sizep, raw, pampd, pool, oidp, index);
789 if (ret < 0)
790 goto out;
791 ret = 0;
792 out:
793 if (lock_held)
794 spin_unlock(&hb->lock);
795 return ret;
799 * If a page in tmem matches the handle, "flush" this page from tmem such
800 * that any subsequent "get" does not succeed (unless, of course, there
801 * was another "put" with the same handle).
803 int tmem_flush_page(struct tmem_pool *pool,
804 struct tmem_oid *oidp, uint32_t index)
806 struct tmem_obj *obj;
807 void *pampd;
808 int ret = -1;
809 struct tmem_hashbucket *hb;
811 hb = &pool->hashbucket[tmem_oid_hash(oidp)];
812 spin_lock(&hb->lock);
813 obj = tmem_obj_find(hb, oidp);
814 if (obj == NULL)
815 goto out;
816 pampd = tmem_pampd_delete_from_obj(obj, index);
817 if (pampd == NULL)
818 goto out;
819 (*tmem_pamops.free)(pampd, pool, oidp, index, true);
820 if (obj->pampd_count == 0) {
821 tmem_obj_free(obj, hb);
822 (*tmem_hostops.obj_free)(obj, pool);
824 ret = 0;
826 out:
827 spin_unlock(&hb->lock);
828 return ret;
832 * "Flush" all pages in tmem matching this oid.
834 int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
836 struct tmem_obj *obj;
837 struct tmem_hashbucket *hb;
838 int ret = -1;
840 hb = &pool->hashbucket[tmem_oid_hash(oidp)];
841 spin_lock(&hb->lock);
842 obj = tmem_obj_find(hb, oidp);
843 if (obj == NULL)
844 goto out;
845 tmem_pampd_destroy_all_in_obj(obj, false);
846 tmem_obj_free(obj, hb);
847 (*tmem_hostops.obj_free)(obj, pool);
848 ret = 0;
850 out:
851 spin_unlock(&hb->lock);
852 return ret;
856 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
857 * all subsequent access to this tmem_pool.
859 int tmem_destroy_pool(struct tmem_pool *pool)
861 int ret = -1;
863 if (pool == NULL)
864 goto out;
865 tmem_pool_flush(pool, 1);
866 ret = 0;
867 out:
868 return ret;
871 static LIST_HEAD(tmem_global_pool_list);
874 * Create a new tmem_pool with the provided flag and return
875 * a pool id provided by the tmem host implementation.
877 void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
879 int persistent = flags & TMEM_POOL_PERSIST;
880 int shared = flags & TMEM_POOL_SHARED;
881 struct tmem_hashbucket *hb = &pool->hashbucket[0];
882 int i;
884 for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
885 hb->obj_rb_root = RB_ROOT;
886 spin_lock_init(&hb->lock);
888 INIT_LIST_HEAD(&pool->pool_list);
889 atomic_set(&pool->obj_count, 0);
890 SET_SENTINEL(pool, POOL);
891 list_add_tail(&pool->pool_list, &tmem_global_pool_list);
892 pool->persistent = persistent;
893 pool->shared = shared;