Update NEWS for 1.6.22
[pkg-k5-afs_openafs.git] / src / mcas / stm_fraser.c
blobc74e24504a0832d42a4fdbd94ec69b268e6fd324
1 /******************************************************************************
2 * stm_fraser.c
4 * Lock-free software transactional memory (STM).
6 * Copyright (c) 2002-2003, K A Fraser
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions are
10 met:
12 * Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials provided
17 * with the distribution. Neither the name of the Keir Fraser
18 * nor the names of its contributors may be used to endorse or
19 * promote products derived from this software without specific
20 * prior written permission.
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 #include "portable_defns.h"
36 #include "ptst.h"
37 #include "gc.h"
38 #include <assert.h>
39 #include <stdarg.h>
40 #include <stdlib.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <setjmp.h>
44 #include <signal.h>
46 typedef struct stm_blk_st stm_blk;
47 typedef struct stm_tx_entry_st stm_tx_entry;
48 typedef struct stm_tx_st stm_tx;
49 typedef struct stm_st stm;
51 struct stm_blk_st {
52 void *data;
55 struct stm_tx_entry_st {
56 stm_blk *b;
57 void *old;
58 void *new;
59 stm_tx_entry *next;
62 struct stm_tx_st {
63 int status;
64 int rc;
65 stm_tx *next_free;
66 stm_tx_entry *reads;
67 stm_tx_entry *writes;
68 stm_tx_entry *alloc_ptr, *check;
69 int gc_data_id, blk_size; /* copied from 'stm' structure */
70 sigjmp_buf *penv;
73 struct stm_st {
74 int gc_data_id;
75 int blk_size;
78 /* Private per-thread state. The array is indexed off ptst->id. */
79 typedef struct {
80 void *arena, *arena_lim;
81 stm_tx *next_descriptor;
82 stm_tx *cur_tx;
83 CACHE_PAD(0);
84 } priv_t;
86 static priv_t priv_ptst[MAX_THREADS];
87 static int gc_blk_id; /* Allocation id for block descriptors. */
88 static int do_padding; /* Should all allocations be padded to a cache line? */
90 #define ALLOCATOR_SIZE(_s) (do_padding ? CACHE_LINE_SIZE : (_s))
92 #define ARENA_SIZE 40960
93 #define DESCRIPTOR_SIZE 4096
95 #define TXS_IN_PROGRESS 0
96 #define TXS_READ_PHASE 1
97 #define TXS_FAILED 2
98 #define TXS_SUCCESSFUL 3
100 #define is_descriptor(_p) ((unsigned long)(_p) & 1)
101 #define ptr_to_descriptor(_p) ((stm_tx *)((unsigned long)(_p) & ~1))
102 #define make_marked_ptr(_p) ((void *)((unsigned long)(_p) | 1))
104 /* Is transaction read-only? */
105 #define read_only(_t) ((_t)->writes == NULL)
107 bool_t commit_stm_tx(ptst_t *ptst, stm_tx *t);
109 static void new_arena (priv_t *priv, int size)
111 priv->arena = malloc(size);
112 if ( priv->arena == NULL ) abort();
113 priv->arena_lim = (((char *) priv->arena) + size);
116 static void release_descriptor(ptst_t *ptst, stm_tx *t)
118 stm_tx_entry *ent;
119 priv_t *priv = &priv_ptst[ptst->id];
120 void *data;
122 assert(t->status >= TXS_FAILED);
124 t->next_free = priv->next_descriptor;
125 priv->next_descriptor = t;
127 if ( t->status == TXS_SUCCESSFUL )
129 for ( ent = t->writes; ent != NULL; ent = ent->next )
131 gc_free(ptst, ent->old, t->gc_data_id);
134 else
136 for ( ent = t->writes; ent != NULL; ent = ent->next )
138 gc_unsafe_free(ptst, ent->new, t->gc_data_id);
143 static int rc_delta_descriptor(stm_tx *t, int delta)
145 int rc, new_rc = t->rc;
147 do { rc = new_rc; }
148 while ( (new_rc = CASIO (&t->rc, rc, rc + delta)) != rc );
150 return rc;
153 static void rc_up_descriptor(stm_tx *t)
155 rc_delta_descriptor(t, 2);
156 MB();
159 static void rc_down_descriptor(ptst_t *ptst, stm_tx *t)
161 int old_rc, new_rc, cur_rc = t->rc;
163 WMB();
165 do {
166 old_rc = cur_rc;
167 new_rc = old_rc - 2;
168 if ( new_rc == 0 ) new_rc = 1;
170 while ( (cur_rc = CASIO (&t->rc, old_rc, new_rc)) != old_rc );
172 if ( old_rc == 2 ) release_descriptor(ptst, t);
175 static stm_tx *new_descriptor(priv_t *priv)
177 stm_tx *t;
179 t = priv->next_descriptor;
181 if ( t != NULL )
183 priv->next_descriptor = t->next_free;
184 /* 'Unfree' descriptor, if it was previously freed. */
185 if ( (t->rc & 1) == 1 ) rc_delta_descriptor(t, 1);
187 else
189 t = (stm_tx *) priv->arena;
190 priv->arena = ((char *) (priv->arena)) + DESCRIPTOR_SIZE;
192 if ( priv->arena >= priv->arena_lim )
194 new_arena(priv, ARENA_SIZE);
195 t = (stm_tx *) priv->arena;
196 priv->arena = ((char *) (priv->arena)) + DESCRIPTOR_SIZE;
199 t->next_free = NULL;
200 t->rc = 2;
203 return t;
207 static stm_tx_entry *alloc_stm_tx_entry(stm_tx *t)
209 stm_tx_entry *ent = t->alloc_ptr++;
210 assert(((unsigned long)t->alloc_ptr - (unsigned long)t) <=
211 DESCRIPTOR_SIZE);
212 return ent;
216 static stm_tx_entry **search_stm_tx_entry(stm_tx_entry **pnext, stm_blk *b)
218 stm_tx_entry *next = *pnext;
220 while ( (next != NULL) && ((unsigned long)next->b < (unsigned long)b) )
222 pnext = &next->next;
223 next = *pnext;
226 return pnext;
230 static void *read_blk_data(ptst_t *ptst, stm_blk *b)
232 void *data;
233 stm_tx *t;
234 int status;
235 stm_tx_entry **pent;
237 for ( ; ; )
239 data = b->data;
240 if ( !is_descriptor(data) ) return data;
242 t = ptr_to_descriptor(data);
243 rc_up_descriptor(t);
244 if ( b->data != data )
246 rc_down_descriptor(ptst, t);
247 continue;
251 * Commit even when we could just read from descriptor, as it gets
252 * the descriptor out of the way in future.
254 commit_stm_tx(ptst, t);
259 stm *new_stm(ptst_t *ptst, int blk_size)
261 stm *mem = malloc(CACHE_LINE_SIZE);
262 mem->blk_size = blk_size;
263 mem->gc_data_id = gc_add_allocator(ALLOCATOR_SIZE(blk_size));
264 return mem;
268 void free_stm(ptst_t *ptst, stm *mem)
270 gc_remove_allocator(mem->gc_data_id);
271 free(mem);
275 stm_blk *new_stm_blk(ptst_t *ptst, stm *mem)
277 stm_blk *b;
278 b = gc_alloc(ptst, gc_blk_id);
279 b->data = gc_alloc(ptst, mem->gc_data_id);
280 return b;
284 void free_stm_blk(ptst_t *ptst, stm *mem, stm_blk *b)
287 * We have to use read_stm_blk(), as some doomed transaction may still
288 * install a marked pointer here while in its write phase.
290 void *data = read_blk_data(ptst, b);
291 assert(!is_descriptor(data));
292 gc_free(ptst, data, mem->gc_data_id);
293 gc_free(ptst, b, gc_blk_id);
297 void *init_stm_blk(ptst_t *ptst, stm *mem, stm_blk *b)
299 return b->data;
303 int sizeof_stm_blk(ptst_t *ptst, stm *mem, stm_blk *b)
305 return mem->blk_size;
309 stm_tx *new_stm_tx(ptst_t *ptst, stm *mem, sigjmp_buf *penv)
311 priv_t *priv = &priv_ptst[ptst->id];
312 stm_tx *t;
314 if ( priv->cur_tx != NULL ) goto nesting;
315 t = new_descriptor(priv);
316 t->status = TXS_IN_PROGRESS;
317 t->reads = t->writes = NULL;
318 t->alloc_ptr = t->check = (stm_tx_entry *)(t + 1);
319 t->gc_data_id = mem->gc_data_id;
320 t->blk_size = mem->blk_size;
321 t->penv = penv;
322 priv->cur_tx = t;
323 return t;
325 nesting:
326 fprintf(stderr, "No nesting of transactions is allowed\n");
327 return NULL;
331 bool_t commit_stm_tx(ptst_t *ptst, stm_tx *t)
333 int desired_status, other_status, old_status, new_status, final_status;
334 void *marked_tx, *data;
335 stm_tx *other;
336 stm_tx_entry **other_pent, *ent;
337 priv_t *priv = &priv_ptst[ptst->id];
339 if ( priv->cur_tx == t ) priv->cur_tx = NULL;
341 marked_tx = make_marked_ptr(t);
342 desired_status = TXS_FAILED;
345 * PHASE 1: WRITE-CHECKING PHASE.
347 if ( (t->status == TXS_IN_PROGRESS) && ((ent = t->writes) != NULL) )
349 /* Others should see up-to-date contents of descriptor. */
350 WMB();
352 do {
353 for ( ; ; )
355 data = CASPO(&ent->b->data, ent->old, marked_tx);
356 if ( (data == ent->old) || (data == marked_tx) ) break;
358 if ( !is_descriptor(data) ) goto fail;
360 other = ptr_to_descriptor(data);
361 rc_up_descriptor(other);
362 if ( ent->b->data != data )
364 rc_down_descriptor(ptst, other);
365 continue;
368 commit_stm_tx(ptst, other);
371 while ( (ent = ent->next) != NULL );
374 /* On success we linearise at this point. */
375 WEAK_DEP_ORDER_WMB();
378 * PHASE 2: READ-CHECKING PHASE.
380 if ( (t->status <= TXS_READ_PHASE) && (t->reads != NULL) )
382 if ( !read_only(t) )
384 CASIO(&t->status, TXS_IN_PROGRESS, TXS_READ_PHASE);
385 MB_NEAR_CAS();
387 else MB();
389 for ( ent = t->reads; ent != NULL; ent = ent->next )
391 for ( ; ; )
393 data = ent->b->data;
394 if ( data == ent->old ) break;
396 /* Someone else made progress at our expense. */
397 if ( !is_descriptor(data) ) goto fail;
398 other = ptr_to_descriptor(data);
401 * Descriptor always belongs to a contending operation.
402 * Before continuing, we must increment the reference count.
404 assert(other != t);
405 rc_up_descriptor(other);
406 if ( ent->b->data != data )
408 rc_down_descriptor(ptst, other);
409 continue;
413 * What we do now depends on the status of the contending
414 * operation. This is easy for any status other than
415 * TXS_READ_PHASE -- usually we just check against the
416 * appropriate 'old' or 'new' data pointer. Transactions
417 * in their read-checking phase must be aborted, or helped
418 * to completion, depending on relative ordering of the
419 * transaction descriptors.
421 while ( (other_status = other->status) == TXS_READ_PHASE )
423 if ( t < other )
425 CASIO(&other->status, TXS_READ_PHASE, TXS_FAILED);
427 else
429 rc_up_descriptor(other);
430 commit_stm_tx(ptst, other);
434 other_pent = search_stm_tx_entry(&other->writes, ent->b);
435 assert(*other_pent != NULL);
436 data = (other_status == TXS_SUCCESSFUL) ?
437 (*other_pent)->new : (*other_pent)->old;
438 rc_down_descriptor(ptst, other);
439 if ( data != ent->old ) goto fail;
441 break;
446 desired_status = TXS_SUCCESSFUL;
448 fail:
449 if ( read_only(t) )
451 /* A very fast path: we can immediately reuse the descriptor. */
452 t->next_free = priv->next_descriptor;
453 priv->next_descriptor = t;
454 return desired_status == TXS_SUCCESSFUL;
457 /* Loop until we push the status to a "final decision" value. */
458 old_status = t->status;
459 while ( old_status <= TXS_READ_PHASE )
461 new_status = CASIO(&t->status, old_status, desired_status);
462 if ( old_status == new_status ) break;
463 old_status = new_status;
465 WMB_NEAR_CAS();
468 * PHASE 3: CLEAN-UP.
470 final_status = t->status;
471 for ( ent = t->writes; ent != NULL; ent = ent->next )
473 /* If CAS fails, someone did it for us already. */
474 (void)CASPO(&ent->b->data, marked_tx,
475 (final_status == TXS_SUCCESSFUL) ? ent->new: ent->old);
478 rc_down_descriptor(ptst, t);
479 return final_status == TXS_SUCCESSFUL;
483 bool_t validate_stm_tx(ptst_t *ptst, stm_tx *t)
485 stm_tx_entry *ent;
487 RMB();
489 for ( ent = t->reads; ent != NULL; ent = ent->next )
491 if ( read_blk_data(ptst, ent->b) != ent->old ) goto fail;
494 for ( ent = t->writes; ent != NULL; ent = ent->next )
496 if ( read_blk_data(ptst, ent->b) != ent->old ) goto fail;
499 return TRUE;
501 fail:
502 t->status = TXS_FAILED;
503 return FALSE;
507 void abort_stm_tx(ptst_t *ptst, stm_tx *t)
509 t->status = TXS_FAILED;
513 void *read_stm_blk(ptst_t *ptst, stm_tx *t, stm_blk *b)
515 stm_tx_entry **pent, *ent;
516 sigjmp_buf *penv;
517 void *result;
519 pent = search_stm_tx_entry(&t->writes, b);
520 ent = *pent;
521 if ( (ent != NULL) && (ent->b == b) ) goto found;
523 pent = search_stm_tx_entry(&t->reads, b);
524 ent = *pent;
525 if ( (ent != NULL) && (ent->b == b) ) goto found;
527 ent = alloc_stm_tx_entry(t);
528 ent->b = b;
529 ent->old = read_blk_data(ptst, b);
530 ent->new = ent->old;
531 ent->next = *pent;
532 *pent = ent;
534 assert(!is_descriptor(ent->new));
535 return ent->new;
537 found:
538 result = ent->new;
539 ent = t->check;
540 if ( read_blk_data(ptst, ent->b) != ent->old ) goto fail;
541 if ( ++t->check == t->alloc_ptr ) t->check = (stm_tx_entry *)(t + 1);
542 return result;
544 fail:
545 penv = t->penv;
546 abort_stm_tx(ptst, t);
547 commit_stm_tx(ptst, t);
548 siglongjmp(*penv, 0);
549 assert(0);
550 return NULL;
554 void *write_stm_blk(ptst_t *ptst, stm_tx *t, stm_blk *b)
556 stm_tx_entry **r_pent, **w_pent, *ent;
557 sigjmp_buf *penv;
558 void *result;
560 w_pent = search_stm_tx_entry(&t->writes, b);
561 ent = *w_pent;
562 if ( (ent != NULL) && (ent->b == b) ) goto found;
564 r_pent = search_stm_tx_entry(&t->reads, b);
565 ent = *r_pent;
566 if ( (ent != NULL) && (ent->b == b) )
568 *r_pent = ent->next;
570 else
572 ent = alloc_stm_tx_entry(t);
573 ent->b = b;
574 ent->old = read_blk_data(ptst, b);
577 ent->new = gc_alloc(ptst, t->gc_data_id);
578 ent->next = *w_pent;
579 *w_pent = ent;
580 memcpy(ent->new, ent->old, t->blk_size);
582 assert(!is_descriptor(ent->old));
583 assert(!is_descriptor(ent->new));
584 return ent->new;
586 found:
587 result = ent->new;
588 ent = t->check;
589 if ( read_blk_data(ptst, ent->b) != ent->old ) goto fail;
590 if ( ++t->check == t->alloc_ptr ) t->check = (stm_tx_entry *)(t + 1);
591 return result;
593 fail:
594 penv = t->penv;
595 abort_stm_tx(ptst, t);
596 commit_stm_tx(ptst, t);
597 siglongjmp(*penv, 0);
598 assert(0);
599 return NULL;
603 void remove_from_tx(ptst_t *ptst, stm_tx *t, stm_blk *b)
605 stm_tx_entry **pent, *ent;
606 void *data;
608 pent = search_stm_tx_entry(&t->writes, b);
609 ent = *pent;
610 if ( (ent != NULL) && (ent->b == b) )
612 *pent = ent->next;
613 data = ent->new;
614 assert(!is_descriptor(data));
615 gc_free(ptst, data, t->gc_data_id);
616 return;
619 pent = search_stm_tx_entry(&t->reads, b);
620 ent = *pent;
621 if ( (ent != NULL) && (ent->b == b) )
623 *pent = ent->next;
628 static void handle_fault(int sig)
630 ptst_t *ptst;
631 stm_tx *t;
633 ptst = critical_enter();
634 t = priv_ptst[ptst->id].cur_tx;
635 if ( (t != NULL) && !validate_stm_tx(ptst, t) )
637 sigjmp_buf *penv = t->penv;
638 commit_stm_tx(ptst, t);
639 critical_exit(ptst);
640 siglongjmp(*penv, 0);
643 fail:
644 fprintf(stderr, "Error: unhandleable SIGSEGV!\n");
645 abort();
649 void _init_stm_subsystem(int pad_data)
651 struct sigaction act;
653 do_padding = pad_data;
654 gc_blk_id = gc_add_allocator(ALLOCATOR_SIZE(sizeof(stm_blk)));
655 memset(priv_ptst, 0, sizeof(priv_ptst));
657 act.sa_handler = handle_fault;
658 sigemptyset(&act.sa_mask);
659 act.sa_flags = 0;
660 sigaction(SIGSEGV, &act, NULL);