kernel/sched/membarrier.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   4  *
   5  * membarrier system call
   6  */
   7 #include "sched.h"
   8
   9 /*
  10  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  11  * except MEMBARRIER_CMD_QUERY.
  12  */
  13 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
  14 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK                  \
  15         (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE                     \
  16         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
  17 #else
  18 #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK  0
  19 #endif
  20
  21 #define MEMBARRIER_CMD_BITMASK                                          \
  22         (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED        \
  23         | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED                      \
  24         | MEMBARRIER_CMD_PRIVATE_EXPEDITED                              \
  25         | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED                     \
  26         | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
  27
  28 static void ipi_mb(void *info)
  29 {
  30         smp_mb();       /* IPIs should be serializing but paranoid. */
  31 }
  32
  33 static void ipi_sync_rq_state(void *info)
  34 {
  35         struct mm_struct *mm = (struct mm_struct *) info;
  36
  37         if (current->mm != mm)
  38                 return;
  39         this_cpu_write(runqueues.membarrier_state,
  40                        atomic_read(&mm->membarrier_state));
  41         /*
  42          * Issue a memory barrier after setting
  43          * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
  44          * guarantee that no memory access following registration is reordered
  45          * before registration.
  46          */
  47         smp_mb();
  48 }
  49
  50 void membarrier_exec_mmap(struct mm_struct *mm)
  51 {
  52         /*
  53          * Issue a memory barrier before clearing membarrier_state to
  54          * guarantee that no memory access prior to exec is reordered after
  55          * clearing this state.
  56          */
  57         smp_mb();
  58         atomic_set(&mm->membarrier_state, 0);
  59         /*
  60          * Keep the runqueue membarrier_state in sync with this mm
  61          * membarrier_state.
  62          */
  63         this_cpu_write(runqueues.membarrier_state, 0);
  64 }
  65
  66 static int membarrier_global_expedited(void)
  67 {
  68         int cpu;
  69         cpumask_var_t tmpmask;
  70
  71         if (num_online_cpus() == 1)
  72                 return 0;
  73
  74         /*
  75          * Matches memory barriers around rq->curr modification in
  76          * scheduler.
  77          */
  78         smp_mb();       /* system call entry is not a mb. */
  79
  80         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
  81                 return -ENOMEM;
  82
  83         cpus_read_lock();
  84         rcu_read_lock();
  85         for_each_online_cpu(cpu) {
  86                 struct task_struct *p;
  87
  88                 /*
  89                  * Skipping the current CPU is OK even through we can be
  90                  * migrated at any point. The current CPU, at the point
  91                  * where we read raw_smp_processor_id(), is ensured to
  92                  * be in program order with respect to the caller
  93                  * thread. Therefore, we can skip this CPU from the
  94                  * iteration.
  95                  */
  96                 if (cpu == raw_smp_processor_id())
  97                         continue;
  98
  99                 if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
 100                     MEMBARRIER_STATE_GLOBAL_EXPEDITED))
 101                         continue;
 102
 103                 /*
 104                  * Skip the CPU if it runs a kernel thread. The scheduler
 105                  * leaves the prior task mm in place as an optimization when
 106                  * scheduling a kthread.
 107                  */
 108                 p = rcu_dereference(cpu_rq(cpu)->curr);
 109                 if (p->flags & PF_KTHREAD)
 110                         continue;
 111
 112                 __cpumask_set_cpu(cpu, tmpmask);
 113         }
 114         rcu_read_unlock();
 115
 116         preempt_disable();
 117         smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
 118         preempt_enable();
 119
 120         free_cpumask_var(tmpmask);
 121         cpus_read_unlock();
 122
 123         /*
 124          * Memory barrier on the caller thread _after_ we finished
 125          * waiting for the last IPI. Matches memory barriers around
 126          * rq->curr modification in scheduler.
 127          */
 128         smp_mb();       /* exit from system call is not a mb */
 129         return 0;
 130 }
 131
 132 static int membarrier_private_expedited(int flags)
 133 {
 134         int cpu;
 135         cpumask_var_t tmpmask;
 136         struct mm_struct *mm = current->mm;
 137
 138         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 139                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 140                         return -EINVAL;
 141                 if (!(atomic_read(&mm->membarrier_state) &
 142                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 143                         return -EPERM;
 144         } else {
 145                 if (!(atomic_read(&mm->membarrier_state) &
 146                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 147                         return -EPERM;
 148         }
 149
 150         if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
 151                 return 0;
 152
 153         /*
 154          * Matches memory barriers around rq->curr modification in
 155          * scheduler.
 156          */
 157         smp_mb();       /* system call entry is not a mb. */
 158
 159         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 160                 return -ENOMEM;
 161
 162         cpus_read_lock();
 163         rcu_read_lock();
 164         for_each_online_cpu(cpu) {
 165                 struct task_struct *p;
 166
 167                 /*
 168                  * Skipping the current CPU is OK even through we can be
 169                  * migrated at any point. The current CPU, at the point
 170                  * where we read raw_smp_processor_id(), is ensured to
 171                  * be in program order with respect to the caller
 172                  * thread. Therefore, we can skip this CPU from the
 173                  * iteration.
 174                  */
 175                 if (cpu == raw_smp_processor_id())
 176                         continue;
 177                 p = rcu_dereference(cpu_rq(cpu)->curr);
 178                 if (p && p->mm == mm)
 179                         __cpumask_set_cpu(cpu, tmpmask);
 180         }
 181         rcu_read_unlock();
 182
 183         preempt_disable();
 184         smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
 185         preempt_enable();
 186
 187         free_cpumask_var(tmpmask);
 188         cpus_read_unlock();
 189
 190         /*
 191          * Memory barrier on the caller thread _after_ we finished
 192          * waiting for the last IPI. Matches memory barriers around
 193          * rq->curr modification in scheduler.
 194          */
 195         smp_mb();       /* exit from system call is not a mb */
 196
 197         return 0;
 198 }
 199
 200 static int sync_runqueues_membarrier_state(struct mm_struct *mm)
 201 {
 202         int membarrier_state = atomic_read(&mm->membarrier_state);
 203         cpumask_var_t tmpmask;
 204         int cpu;
 205
 206         if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
 207                 this_cpu_write(runqueues.membarrier_state, membarrier_state);
 208
 209                 /*
 210                  * For single mm user, we can simply issue a memory barrier
 211                  * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
 212                  * mm and in the current runqueue to guarantee that no memory
 213                  * access following registration is reordered before
 214                  * registration.
 215                  */
 216                 smp_mb();
 217                 return 0;
 218         }
 219
 220         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 221                 return -ENOMEM;
 222
 223         /*
 224          * For mm with multiple users, we need to ensure all future
 225          * scheduler executions will observe @mm's new membarrier
 226          * state.
 227          */
 228         synchronize_rcu();
 229
 230         /*
 231          * For each cpu runqueue, if the task's mm match @mm, ensure that all
 232          * @mm's membarrier state set bits are also set in in the runqueue's
 233          * membarrier state. This ensures that a runqueue scheduling
 234          * between threads which are users of @mm has its membarrier state
 235          * updated.
 236          */
 237         cpus_read_lock();
 238         rcu_read_lock();
 239         for_each_online_cpu(cpu) {
 240                 struct rq *rq = cpu_rq(cpu);
 241                 struct task_struct *p;
 242
 243                 p = rcu_dereference(rq->curr);
 244                 if (p && p->mm == mm)
 245                         __cpumask_set_cpu(cpu, tmpmask);
 246         }
 247         rcu_read_unlock();
 248
 249         preempt_disable();
 250         smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
 251         preempt_enable();
 252
 253         free_cpumask_var(tmpmask);
 254         cpus_read_unlock();
 255
 256         return 0;
 257 }
 258
 259 static int membarrier_register_global_expedited(void)
 260 {
 261         struct task_struct *p = current;
 262         struct mm_struct *mm = p->mm;
 263         int ret;
 264
 265         if (atomic_read(&mm->membarrier_state) &
 266             MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 267                 return 0;
 268         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
 269         ret = sync_runqueues_membarrier_state(mm);
 270         if (ret)
 271                 return ret;
 272         atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 273                   &mm->membarrier_state);
 274
 275         return 0;
 276 }
 277
 278 static int membarrier_register_private_expedited(int flags)
 279 {
 280         struct task_struct *p = current;
 281         struct mm_struct *mm = p->mm;
 282         int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
 283             set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
 284             ret;
 285
 286         if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 287                 if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 288                         return -EINVAL;
 289                 ready_state =
 290                         MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 291         }
 292
 293         /*
 294          * We need to consider threads belonging to different thread
 295          * groups, which use the same mm. (CLONE_VM but not
 296          * CLONE_THREAD).
 297          */
 298         if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 299                 return 0;
 300         if (flags & MEMBARRIER_FLAG_SYNC_CORE)
 301                 set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
 302         atomic_or(set_state, &mm->membarrier_state);
 303         ret = sync_runqueues_membarrier_state(mm);
 304         if (ret)
 305                 return ret;
 306         atomic_or(ready_state, &mm->membarrier_state);
 307
 308         return 0;
 309 }
 310
 311 /**
 312  * sys_membarrier - issue memory barriers on a set of threads
 313  * @cmd:   Takes command values defined in enum membarrier_cmd.
 314  * @flags: Currently needs to be 0. For future extensions.
 315  *
 316  * If this system call is not implemented, -ENOSYS is returned. If the
 317  * command specified does not exist, not available on the running
 318  * kernel, or if the command argument is invalid, this system call
 319  * returns -EINVAL. For a given command, with flags argument set to 0,
 320  * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 321  * always return the same value until reboot. In addition, it can return
 322  * -ENOMEM if there is not enough memory available to perform the system
 323  * call.
 324  *
 325  * All memory accesses performed in program order from each targeted thread
 326  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
 327  * the semantic "barrier()" to represent a compiler barrier forcing memory
 328  * accesses to be performed in program order across the barrier, and
 329  * smp_mb() to represent explicit memory barriers forcing full memory
 330  * ordering across the barrier, we have the following ordering table for
 331  * each pair of barrier(), sys_membarrier() and smp_mb():
 332  *
 333  * The pair ordering is detailed as (O: ordered, X: not ordered):
 334  *
 335  *                        barrier()   smp_mb() sys_membarrier()
 336  *        barrier()          X           X            O
 337  *        smp_mb()           X           O            O
 338  *        sys_membarrier()   O           O            O
 339  */
 340 SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 341 {
 342         if (unlikely(flags))
 343                 return -EINVAL;
 344         switch (cmd) {
 345         case MEMBARRIER_CMD_QUERY:
 346         {
 347                 int cmd_mask = MEMBARRIER_CMD_BITMASK;
 348
 349                 if (tick_nohz_full_enabled())
 350                         cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
 351                 return cmd_mask;
 352         }
 353         case MEMBARRIER_CMD_GLOBAL:
 354                 /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
 355                 if (tick_nohz_full_enabled())
 356                         return -EINVAL;
 357                 if (num_online_cpus() > 1)
 358                         synchronize_rcu();
 359                 return 0;
 360         case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
 361                 return membarrier_global_expedited();
 362         case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
 363                 return membarrier_register_global_expedited();
 364         case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
 365                 return membarrier_private_expedited(0);
 366         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
 367                 return membarrier_register_private_expedited(0);
 368         case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
 369                 return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 370         case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
 371                 return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
 372         default:
 373                 return -EINVAL;
 374         }
 375 }