Update NEWS for 1.6.22
[pkg-k5-afs_openafs.git] / src / mcas / mcas.c
blobb954bd91660ede5c04121651fff345fa954f3f97
1 /******************************************************************************
2 * mcas.c
4 * MCAS implemented as described in:
5 * A Practical Multi-Word Compare-and-Swap Operation
6 * Timothy Harris, Keir Fraser and Ian Pratt
7 * Proceedings of the IEEE Symposium on Distributed Computing, Oct 2002
9 * Copyright (c) 2002-2003, K A Fraser
11 Redistribution and use in source and binary forms, with or without
12 modification, are permitted provided that the following conditions are
13 met:
15 * Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution. Neither the name of the Keir Fraser
21 * nor the names of its contributors may be used to endorse or
22 * promote products derived from this software without specific
23 * prior written permission.
25 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 #include <sys/resource.h>
39 #include <assert.h>
40 #include <stdarg.h>
41 #include <stdio.h>
42 #include <string.h>
44 typedef struct CasDescriptor CasDescriptor_t;
45 typedef struct CasEntry CasEntry_t;
46 typedef struct per_thread_state_t per_thread_state_t;
48 extern int num_threads;
50 #define ARENA_SIZE 40960
52 struct per_thread_state_t
54 int id;
55 CasDescriptor_t *next_descriptor;
56 void *arena;
57 void *arena_lim;
61 static pthread_key_t mcas_ptst_key;
63 typedef struct pad128 { char pad[128]; } pad128_t;
66 /* CAS descriptors. */
68 #define STATUS_IN_PROGRESS 0
69 #define STATUS_SUCCEEDED 1
70 #define STATUS_FAILED 2
71 #define STATUS_ABORTED 3
73 struct CasEntry {
74 void **ptr;
75 void *old;
76 void *new;
79 struct CasDescriptor {
80 int status;
81 int length;
82 CasDescriptor_t *pt[MAX_THREADS];
83 int rc;
84 CasDescriptor_t *fc; /* free chain */
85 CasEntry_t entries[1];
88 /* Marked pointers. */
89 typedef unsigned long ptr_int;
90 #ifndef MARK_IN_PROGRESS
91 #define MARK_IN_PROGRESS 1
92 #endif
93 #ifndef MARK_PTR_TO_CD
94 #define MARK_PTR_TO_CD 2
95 #endif
97 #define get_markedness(p) (((ptr_int) (p)) & 3)
98 #define get_unmarked_reference(p) ((void *) (((ptr_int) (p)) & (~3)))
99 #define get_marked_reference(p,m) ((void *) (((ptr_int) (p)) | m))
101 static bool_t mcas0 (per_thread_state_t *ptst, CasDescriptor_t *cd);
102 static per_thread_state_t *get_ptst (void);
104 pad128_t p0; /* I'm worried these important RO vars might be false shared */
105 static int cas_sz;
106 static int num_ptrs = 1024;
107 static int ptr_mult = 1;
108 pad128_t p1;
110 static void *ALLOC(int size)
112 void *a = calloc(1, size);
113 if ( a == NULL ) abort();
114 return a;
117 static void *ALLOC_ALONE (int size)
119 int ps = sysconf(_SC_PAGESIZE);
120 int req = ps + size + ps;
121 char *res = ALLOC(req);
122 return (void *)(res + ps);
125 static int next_thread_id = 0;
126 static per_thread_state_t *ptsts = NULL;
128 static void new_arena (per_thread_state_t *ptst, int size)
130 ptst->arena = ALLOC(size);
131 if ( !ptst->arena ) abort();
132 ptst->arena_lim = (((char *) ptst->arena) + size);
135 static per_thread_state_t *get_ptst (void)
137 per_thread_state_t *result;
138 int r;
140 result = pthread_getspecific(mcas_ptst_key);
142 if ( result == NULL )
144 int my_id;
145 int largest = sysconf(_SC_PAGESIZE);
147 if ( largest < sizeof (per_thread_state_t) )
148 largest = sizeof (per_thread_state_t);
150 ALLOC (largest);
151 result = ALLOC (largest);
152 ALLOC (largest);
154 do { my_id = next_thread_id; }
155 while ( CASIO (&next_thread_id, my_id, my_id + 1) != my_id );
157 result->id = my_id;
158 ptsts = result;
160 new_arena(result, ARENA_SIZE);
162 r = pthread_setspecific(mcas_ptst_key, result);
163 assert(r == 0);
166 return result;
169 static void release_descriptor (CasDescriptor_t *cd)
171 per_thread_state_t *ptst = get_ptst ();
172 cd->fc = ptst->next_descriptor;
173 ptst->next_descriptor = cd;
176 static int rc_delta_descriptor (CasDescriptor_t *cd,
177 int delta)
179 int rc, new_rc = cd->rc;
181 do { rc = new_rc; }
182 while ( (new_rc = CASIO (&(cd->rc), rc, rc + delta)) != rc );
184 return rc;
187 static void rc_up_descriptor (CasDescriptor_t *cd)
189 rc_delta_descriptor(cd, 2);
190 MB();
193 static void rc_down_descriptor (CasDescriptor_t *cd)
195 int old_rc, new_rc, cur_rc = cd->rc;
197 do {
198 old_rc = cur_rc;
199 new_rc = old_rc - 2;
200 if ( new_rc == 0 ) new_rc = 1; else MB();
202 while ( (cur_rc = CASIO(&(cd->rc), old_rc, new_rc)) != old_rc );
204 if ( old_rc == 2 )
205 release_descriptor(cd);
208 static CasDescriptor_t *new_descriptor (per_thread_state_t *ptst, int length)
210 CasDescriptor_t *result;
211 int i;
213 CasDescriptor_t **ptr = &(ptst->next_descriptor);
214 result = *ptr;
215 while ( (result != NULL) && (result->length != length) )
217 ptr = &(result->fc);
218 result = *ptr;
221 if ( result == NULL )
223 int alloc_size;
225 alloc_size = sizeof (CasDescriptor_t) +
226 ((length - 1) * sizeof (CasEntry_t));
228 result = (CasDescriptor_t *) ptst->arena;
229 ptst->arena = ((char *) (ptst->arena)) + alloc_size;
231 if ( ptst->arena >= ptst->arena_lim )
233 new_arena(ptst, ARENA_SIZE);
234 result = (CasDescriptor_t *) ptst->arena;
235 ptst->arena = ((char *) (ptst->arena)) + alloc_size;
238 for ( i = 0; i < num_threads; i++ )
239 result->pt[i] = result;
241 result->length = length;
242 result->rc = 2;
244 else
246 *ptr = result->fc;
247 assert((result->rc & 1) == 1);
248 rc_delta_descriptor(result, 1); /* clears lowest bit */
251 assert(result->length == length);
253 return result;
256 static void *read_from_cd (void **ptr, CasDescriptor_t *cd, bool_t get_old)
258 CasEntry_t *ce;
259 int i;
260 int n;
262 n = cd->length;
263 for ( i = 0; i < n; i++ )
265 ce = &(cd->entries[i]);
266 if ( ce->ptr == ptr )
267 return get_old ? ce->old : ce->new;
270 assert(0);
271 return NULL;
274 static void *read_barrier_lite (void **ptr)
276 CasDescriptor_t *cd;
277 void *v;
278 int m;
280 retry_read_barrier:
281 v = *ptr;
282 m = get_markedness(v);
284 if ( m == MARK_PTR_TO_CD )
286 WEAK_DEP_ORDER_RMB();
287 cd = get_unmarked_reference(v);
289 rc_up_descriptor(cd);
290 if ( *ptr != v )
292 rc_down_descriptor(cd);
293 goto retry_read_barrier;
296 v = read_from_cd(ptr, cd, (cd->status != STATUS_SUCCEEDED));
298 rc_down_descriptor(cd);
300 else if ( m == MARK_IN_PROGRESS )
302 WEAK_DEP_ORDER_RMB();
303 cd = *(CasDescriptor_t **)get_unmarked_reference(v);
305 rc_up_descriptor(cd);
306 if ( *ptr != v )
308 rc_down_descriptor(cd);
309 goto retry_read_barrier;
312 v = read_from_cd(ptr, cd, (cd->status != STATUS_SUCCEEDED));
314 rc_down_descriptor(cd);
317 return v;
320 static void clean_descriptor (CasDescriptor_t *cd)
322 int i;
323 void *mcd;
324 int status;
326 status = cd->status;
327 assert(status == STATUS_SUCCEEDED || status == STATUS_FAILED);
329 mcd = get_marked_reference(cd, MARK_PTR_TO_CD);
331 if (status == STATUS_SUCCEEDED)
332 for ( i = 0; i < cd->length; i++ )
333 CASPO (cd->entries[i].ptr, mcd, cd->entries[i].new);
334 else
335 for ( i = 0; i < cd->length; i++ )
336 CASPO(cd->entries[i].ptr, mcd, cd->entries[i].old);
339 static bool_t mcas_fixup (void **ptr,
340 void *value_read)
342 int m;
344 retry_mcas_fixup:
345 m = get_markedness(value_read);
346 if ( m == MARK_PTR_TO_CD )
348 CasDescriptor_t *helpee;
349 helpee = get_unmarked_reference(value_read);
351 rc_up_descriptor(helpee);
352 if ( *ptr != value_read )
354 rc_down_descriptor(helpee);
355 value_read = *ptr;
356 goto retry_mcas_fixup;
359 mcas0(NULL, helpee);
361 rc_down_descriptor(helpee);
363 return TRUE;
365 else if ( m == MARK_IN_PROGRESS )
367 CasDescriptor_t *other_cd;
369 WEAK_DEP_ORDER_RMB();
370 other_cd = *(CasDescriptor_t **)get_unmarked_reference(value_read);
372 rc_up_descriptor(other_cd);
373 if ( *ptr != value_read )
375 rc_down_descriptor(other_cd);
376 value_read = *ptr;
377 goto retry_mcas_fixup;
380 if ( other_cd->status == STATUS_IN_PROGRESS )
381 CASPO(ptr,
382 value_read,
383 get_marked_reference(other_cd, MARK_PTR_TO_CD));
384 else
385 CASPO(ptr,
386 value_read,
387 read_from_cd(ptr, other_cd, TRUE));
389 rc_down_descriptor (other_cd);
390 return TRUE;
393 return FALSE;
396 static void *read_barrier (void **ptr)
398 void *v;
400 do { v = *ptr; }
401 while ( mcas_fixup(ptr, v) );
403 return v;
406 static bool_t mcas0 (per_thread_state_t *ptst, CasDescriptor_t *cd)
408 int i;
409 int n;
410 int desired_status;
411 bool_t final_success;
412 void *mcd;
413 void *dmcd;
414 int old_status;
416 if ( ptst == NULL )
417 ptst = get_ptst();
419 MB(); /* required for sequential consistency */
421 if ( cd->status == STATUS_SUCCEEDED )
423 clean_descriptor(cd);
424 final_success = TRUE;
425 goto out;
427 else if ( cd->status == STATUS_FAILED )
429 clean_descriptor(cd);
430 final_success = FALSE;
431 goto out;
434 /* Attempt to link in all entries in the descriptor. */
435 mcd = get_marked_reference(cd, MARK_PTR_TO_CD);
436 dmcd = get_marked_reference(&(cd->pt[ptst->id]), MARK_IN_PROGRESS);
438 desired_status = STATUS_SUCCEEDED;
440 retry:
441 n = cd->length;
442 for (i = 0; i < n; i ++)
444 CasEntry_t *ce = &(cd->entries[i]);
445 void *value_read = CASPO(ce->ptr, ce->old, dmcd);
447 if ( (value_read != ce->old) &&
448 (value_read != dmcd) &&
449 (value_read != mcd) )
451 if ( mcas_fixup(ce->ptr, value_read) )
452 goto retry;
453 desired_status = STATUS_FAILED;
454 break;
457 RMB_NEAR_CAS(); /* ensure check of status occurs after CASPO. */
458 if ( cd->status != STATUS_IN_PROGRESS )
460 CASPO(ce->ptr, dmcd, ce->old);
461 break;
464 if ( value_read != mcd )
466 value_read = CASPO(ce->ptr, dmcd, mcd);
467 assert((value_read == dmcd) ||
468 (value_read == mcd) ||
469 (cd->status != STATUS_IN_PROGRESS));
474 * All your ptrs are belong to us (or we've been helped and
475 * already known to have succeeded or failed). Try to
476 * propagate our desired result into the status field.
480 * When changing to success, we must have all pointer ownerships
481 * globally visible. But we get this without a memory barrier, as
482 * 'desired_status' is dependent on the outcome of each CASPO
483 * to MARK_IN_PROGRESS.
485 * Architectures providing CAS natively all specify that the operation
486 * is _indivisible_. That is, the write will be done when the CAS
487 * completes.
489 * Architectures providing LL/SC are even better: any following
490 * instruction in program order is control-dependent on the CAS, because
491 * CAS may be retried if SC fails. All we need is that SC gets to point
492 * of coherency before producing its result: even Alpha provides this!
494 WEAK_DEP_ORDER_WMB();
495 old_status = CASIO((int *)&cd->status,
496 STATUS_IN_PROGRESS,
497 desired_status);
499 * This ensures final sequential consistency.
500 * Also ensures that the status update is visible before cleanup.
502 WMB_NEAR_CAS();
504 clean_descriptor(cd);
505 final_success = (cd->status == STATUS_SUCCEEDED);
507 out:
508 return final_success;
512 void mcas_init (void)
514 int r = pthread_key_create(&mcas_ptst_key, NULL);
515 if ( r != 0 ) abort();
518 /***********************************************************************/
520 bool_t mcas (int n,
521 void **ptr, void *old, void *new,
522 ...)
524 va_list ap;
525 int i;
526 CasDescriptor_t *cd;
527 CasEntry_t *ce;
528 int result = 0;
529 per_thread_state_t *ptst = get_ptst();
531 cd = new_descriptor(ptst, n);
533 cd->status = STATUS_IN_PROGRESS;
534 cd->length = n;
536 ce = cd->entries;
537 ce->ptr = ptr;
538 ce->old = old;
539 ce->new = new;
541 va_start(ap, new);
542 for ( i = 1; i < n; i++ )
544 ce ++;
545 ce->ptr = va_arg(ap, void **);
546 ce->old = va_arg(ap, void *);
547 ce->new = va_arg(ap, void *);
549 va_end (ap);
551 /* Insertion sort. Fail on non-unique pointers. */
552 for ( i = 1, ce = &cd->entries[1]; i < n; i++, ce++ )
554 int j;
555 CasEntry_t *cei, tmp;
556 for ( j = i-1, cei = ce-1; j >= 0; j--, cei-- )
557 if ( cei->ptr <= ce->ptr ) break;
558 if ( cei->ptr == ce->ptr ) goto out;
559 if ( ++cei != ce )
561 tmp = *ce;
562 memmove(cei+1, cei, (ce-cei)*sizeof(CasEntry_t));
563 *cei = tmp;
567 result = mcas0(ptst, cd);
568 assert(cd->status != STATUS_IN_PROGRESS);
570 out:
571 rc_down_descriptor (cd);
572 return result;