#ifndef _GEN_PV_LOCK_SLOWPATH
#error "do not include this file"
#endif

#include <linux/hash.h>
#include <linux/bootmem.h>
#include <linux/debug_locks.h>
/*
 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
 * of spinning them.
 *
 * This relies on the architecture to provide two paravirt hypercalls:
 *
 *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
 *   pv_kick(cpu)             -- wakes a suspended vcpu
 *
 * Using these we implement __pv_queued_spin_lock_slowpath() and
 * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
 * native_queued_spin_unlock().
 */
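/*
 * A minimal sketch of the wait/kick handshake these two hypercalls are meant
 * to support (illustrative only, with made-up names "cond" and "cpu"; the
 * real protocol lives in pv_wait_node(), pv_kick_node() and pv_wait_head()
 * below):
 *
 *	waiter:				waker:
 *	WRITE_ONCE(*ptr, val);		WRITE_ONCE(cond, 1);
 *	smp_mb();			smp_mb();
 *	if (!READ_ONCE(cond))		if (READ_ONCE(*ptr) == val)
 *		pv_wait(ptr, val);		pv_kick(cpu);
 *
 * pv_wait() may also return spuriously, so a waiter must always re-check its
 * wait condition and possibly wait again.
 */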
#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
/*
 * Queue node uses: vcpu_running & vcpu_halted.
 * Queue head uses: vcpu_running & vcpu_hashed.
 */
enum vcpu_state {
	vcpu_running = 0,
	vcpu_halted,		/* Used only in pv_wait_node */
	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
};
struct pv_node {
	struct mcs_spinlock	mcs;
	struct mcs_spinlock	__res[3];

	int			cpu;
	u8			state;
};
/*
 * Lock and MCS node addresses hash table for fast lookup
 *
 * Hashing is done on a per-cacheline basis to minimize the need to access
 * more than one cacheline.
 *
 * Dynamically allocate a hash table big enough to hold at least 4X the
 * number of possible cpus in the system. Allocation is done on page
 * granularity. So the minimum number of hash buckets should be at least
 * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
 *
 * Since we should not be holding locks from NMI context (very rare indeed) the
 * max load factor is 0.75, which is around the point where open addressing
 * breaks down.
 */
struct pv_hash_entry {
	struct qspinlock *lock;
	struct pv_node   *node;
};
#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))
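/*
 * Worked example (assuming 64-byte cachelines and 4K pages, which is not
 * guaranteed on every architecture): on 64-bit, struct pv_hash_entry is two
 * pointers, i.e. 16 bytes, so PV_HE_PER_LINE = 64 / 16 = 4 entries share a
 * cacheline and PV_HE_MIN = 4096 / 16 = 256 buckets, matching the
 * "at least 256 (64-bit)" figure quoted above. On 32-bit the entry is
 * 8 bytes, giving the 512 bucket minimum.
 */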
static struct pv_hash_entry *pv_lock_hash;
static unsigned int pv_lock_hash_bits __read_mostly;
/*
 * Allocate memory for the PV qspinlock hash buckets
 *
 * This function should be called from the paravirt spinlock initialization
 * routine.
 */
void __init __pv_init_lock_hash(void)
{
	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);

	if (pv_hash_size < PV_HE_MIN)
		pv_hash_size = PV_HE_MIN;

	/*
	 * Allocate space from bootmem which should be page-size aligned
	 * and hence cacheline aligned.
	 */
	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
					       sizeof(struct pv_hash_entry),
					       pv_hash_size, 0, HASH_EARLY,
					       &pv_lock_hash_bits, NULL,
					       pv_hash_size, pv_hash_size);
}
#define for_each_hash_entry(he, offset, hash)						\
	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
	     offset < (1 << pv_lock_hash_bits);						\
	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
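/*
 * Example of the probe order (using the illustrative PV_HE_PER_LINE == 4 from
 * the sizing note above): a hash value of 0x2f is first rounded down to the
 * cacheline group boundary 0x2c, so the scan visits entries 0x2c, 0x2d, 0x2e,
 * 0x2f, 0x30, ... and wraps modulo the table size, touching whole cachelines
 * at a time rather than hopping between them.
 */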
static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;

	for_each_hash_entry(he, offset, hash) {
		if (!cmpxchg(&he->lock, NULL, lock)) {
			WRITE_ONCE(he->node, node);
			return &he->lock;
		}
	}

	/*
	 * Hard assume there is a free entry for us.
	 *
	 * This is guaranteed by ensuring every blocked lock only ever consumes
	 * a single entry, and since we only have 4 nesting levels per CPU
	 * and allocated 4*nr_possible_cpus(), this must be so.
	 *
	 * The single entry is guaranteed by having the lock owner unhash
	 * before it releases.
	 */
	BUG();
}
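/*
 * Worked capacity check (an inference from the sizing comment above the hash
 * table, not a statement from the original code): with 4 nesting levels
 * (task, softirq, hardirq, nmi) a CPU can be queue head of at most 4 locks
 * at once, and 4 * nr_possible_cpus() entries were allocated. Excluding NMI
 * context, as the load-factor comment assumes, at most 3 of those 4 per-CPU
 * slots are ever live, i.e. a worst-case load factor of 3/4 = 0.75.
 */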
static struct pv_node *pv_unhash(struct qspinlock *lock)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
	struct pv_node *node;

	for_each_hash_entry(he, offset, hash) {
		if (READ_ONCE(he->lock) == lock) {
			node = READ_ONCE(he->node);
			WRITE_ONCE(he->lock, NULL);
			return node;
		}
	}

	/*
	 * Hard assume we'll find an entry.
	 *
	 * This guarantees a limited lookup time and is itself guaranteed by
	 * having the lock owner do the unhash -- IFF the unlock sees the
	 * SLOW flag, there MUST be a hash entry.
	 */
	BUG();
}
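/*
 * Note on the two unhash paths (a reading aid, not new protocol): a queue
 * head that hashed itself in pv_wait_head() but then won the lock without
 * sleeping undoes its own entry directly via WRITE_ONCE(*lp, NULL), while a
 * lock that was released with _Q_SLOW_VAL set is unhashed by the unlocker
 * through pv_unhash() above. Either way exactly one party clears the entry,
 * which is what keeps the "single entry per blocked lock" invariant.
 */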
/*
 * Initialize the PV part of the mcs_spinlock node.
 */
static void pv_init_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));

	pn->cpu = smp_processor_id();
	pn->state = vcpu_running;
}
/*
 * Wait for node->locked to become true, halt the vcpu after a short spin.
 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
 * behalf.
 */
static void pv_wait_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	int loop;

	for (;;) {
		for (loop = SPIN_THRESHOLD; loop; loop--) {
			if (READ_ONCE(node->locked))
				return;
			cpu_relax();
		}

		/*
		 * Order pn->state vs pn->locked thusly:
		 *
		 * [S] pn->state = vcpu_halted    [S] next->locked = 1
		 *     MB                             MB
		 * [L] pn->locked                 [RmW] pn->state = vcpu_hashed
		 *
		 * Matches the cmpxchg() from pv_kick_node().
		 */
		smp_store_mb(pn->state, vcpu_halted);

		if (!READ_ONCE(node->locked))
			pv_wait(&pn->state, vcpu_halted);

		/*
		 * If pv_kick_node() changed us to vcpu_hashed, retain that
		 * value so that pv_wait_head() knows to not also try to hash
		 * this lock.
		 */
		cmpxchg(&pn->state, vcpu_halted, vcpu_running);

		/*
		 * If the locked flag is still not set after wakeup, it is a
		 * spurious wakeup and the vCPU should wait again. However,
		 * there is a pretty high overhead for CPU halting and kicking.
		 * So it is better to spin for a while in the hope that the
		 * MCS lock will be released soon.
		 */
	}

	/*
	 * By now our node->locked should be 1 and our caller will not actually
	 * spin-wait for it. We do however rely on our caller to do a
	 * load-acquire for us.
	 */
}
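/*
 * Rough sketch of the pn->state transitions implied by the functions in this
 * file (for orientation only; the authoritative transitions are the
 * smp_store_mb()/cmpxchg() calls themselves):
 *
 *   vcpu_running --- pv_wait_node(), before sleeping ---> vcpu_halted
 *   vcpu_halted  --- pv_wait_node(), after waking    ---> vcpu_running
 *   vcpu_halted  --- pv_kick_node(), by lock owner   ---> vcpu_hashed
 *   vcpu_running --- pv_wait_head(), before hashing  ---> vcpu_hashed
 */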
/*
 * Called after setting next->locked = 1 when we're the lock owner.
 *
 * Instead of waking the waiters stuck in pv_wait_node(), advance their state
 * such that they're waiting in pv_wait_head(); this avoids a wake/sleep cycle.
 */
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct __qspinlock *l = (void *)lock;

	/*
	 * If the vCPU is indeed halted, advance its state to match that of
	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
	 * observe its next->locked value and advance itself.
	 *
	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
	 */
	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
		return;

	/*
	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
	 *
	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
	 * the hash table later on at unlock time, no atomic instruction is
	 * needed.
	 */
	WRITE_ONCE(l->locked, _Q_SLOW_VAL);
	(void)pv_hash(lock, pn);
}
/*
 * Wait for l->locked to become clear; halt the vcpu after a short spin.
 * __pv_queued_spin_unlock() will wake us.
 */
static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct __qspinlock *l = (void *)lock;
	struct qspinlock **lp = NULL;
	int loop;

	/*
	 * If pv_kick_node() already advanced our state, we don't need to
	 * insert ourselves into the hash table anymore.
	 */
	if (READ_ONCE(pn->state) == vcpu_hashed)
		lp = (struct qspinlock **)1;

	for (;;) {
		for (loop = SPIN_THRESHOLD; loop; loop--) {
			if (!READ_ONCE(l->locked))
				return;
			cpu_relax();
		}

		if (!lp) { /* ONCE */
			WRITE_ONCE(pn->state, vcpu_hashed);
			lp = pv_hash(lock, pn);

			/*
			 * We must hash before setting _Q_SLOW_VAL, such that
			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
			 * we'll be sure to be able to observe our hash entry.
			 *
			 *   [S] <hash>                  [Rmw] l->locked == _Q_SLOW_VAL
			 *       MB                            RMB
			 * [RmW] l->locked = _Q_SLOW_VAL   [L] <unhash>
			 *
			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
			 */
			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
				/*
				 * The lock is free and _Q_SLOW_VAL has never
				 * been set. Therefore we need to unhash before
				 * getting the lock.
				 */
				WRITE_ONCE(*lp, NULL);
				return;
			}
		}
		pv_wait(&l->locked, _Q_SLOW_VAL);

		/*
		 * The unlocker should have freed the lock before kicking the
		 * CPU. So if the lock is still not free, it is a spurious
		 * wakeup and so the vCPU should wait again after spinning for
		 * a while.
		 */
	}

	/*
	 * Lock is unlocked now; the caller will acquire it without waiting.
	 * As with pv_wait_node() we rely on the caller to do a load-acquire
	 * for us.
	 */
}
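/*
 * Summary of the lock byte values the PV protocol moves through (a reading
 * aid; the transitions are the cmpxchg()/WRITE_ONCE() calls above and in
 * __pv_queued_spin_unlock() below):
 *
 *   0             -> _Q_LOCKED_VAL	normal acquisition
 *   _Q_LOCKED_VAL -> _Q_SLOW_VAL	queue head hashes the lock and sleeps
 *					(pv_wait_head()), or the owner does it
 *					on its behalf (pv_kick_node())
 *   _Q_LOCKED_VAL -> 0			fast unlock, nothing was hashed
 *   _Q_SLOW_VAL   -> 0			slow unlock: unhash, release, pv_kick()
 */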
/*
 * PV version of the unlock function to be used instead of
 * queued_spin_unlock().
 */
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;
	struct pv_node *node;
	u8 locked;

	/*
	 * We must not unlock if SLOW, because in that case we must first
	 * unhash. Otherwise it would be possible to have multiple @lock
	 * entries, which would be BAD.
	 */
	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
	if (likely(locked == _Q_LOCKED_VAL))
		return;

	if (unlikely(locked != _Q_SLOW_VAL)) {
		WARN(!debug_locks_silent,
		     "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
		     (unsigned long)lock, atomic_read(&lock->val));
		return;
	}

	/*
	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
	 * so we need a barrier to order the read of the node data in
	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
	 *
	 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
	 */
	smp_rmb();

	/*
	 * Since the above failed to release, this must be the SLOW path.
	 * Therefore start by looking up the blocked node and unhashing it.
	 */
	node = pv_unhash(lock);

	/*
	 * Now that we have a reference to the (likely) blocked pv_node,
	 * release the lock.
	 */
	smp_store_release(&l->locked, 0);

	/*
	 * At this point the memory pointed at by lock can be freed/reused,
	 * however we can still use the pv_node to kick the CPU.
	 * The other vCPU may not really be halted, but kicking an active
	 * vCPU is harmless other than the additional latency in completing
	 * the unlock.
	 */
	if (READ_ONCE(node->state) == vcpu_hashed)
		pv_kick(node->cpu);
}
/*
 * Include the architecture specific callee-save thunk of the
 * __pv_queued_spin_unlock(). This thunk is put together with
 * __pv_queued_spin_unlock() near the top of the file to make sure
 * that the callee-save thunk and the real unlock function are close
 * to each other sharing consecutive instruction cachelines.
 */
#include <asm/qspinlock_paravirt.h>