/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/syscalls.h>
#include <linux/membarrier.h>
#include <linux/tick.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>

#include "sched.h"	/* for cpu_rq(). */
/*
 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE		\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK					\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED		\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED		\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}
static int membarrier_global_expedited(void)
{
	int cpu;
	bool fallback = false;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	/*
	 * Expedited membarrier commands guarantee that they won't
	 * block, hence the GFP_NOWAIT allocation flag and fallback
	 * implementation.
	 */
	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
		/* Fallback for OOM. */
		fallback = true;
	}

	cpus_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		rcu_read_lock();
		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
			if (!fallback)
				__cpumask_set_cpu(cpu, tmpmask);
			else
				smp_call_function_single(cpu, ipi_mb, NULL, 1);
		}
		rcu_read_unlock();
	}
	if (!fallback) {
		preempt_disable();
		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
		preempt_enable();
		free_cpumask_var(tmpmask);
	}
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}
static int membarrier_private_expedited(int flags)
{
	int cpu;
	bool fallback = false;
	cpumask_var_t tmpmask;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&current->mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else {
		if (!(atomic_read(&current->mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	/*
	 * Expedited membarrier commands guarantee that they won't
	 * block, hence the GFP_NOWAIT allocation flag and fallback
	 * implementation.
	 */
	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
		/* Fallback for OOM. */
		fallback = true;
	}

	cpus_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		rcu_read_lock();
		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
		if (p && p->mm == current->mm) {
			if (!fallback)
				__cpumask_set_cpu(cpu, tmpmask);
			else
				smp_call_function_single(cpu, ipi_mb, NULL, 1);
		}
		rcu_read_unlock();
	}
	if (!fallback) {
		preempt_disable();
		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
		preempt_enable();
		free_cpumask_var(tmpmask);
	}
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}
static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
		/*
		 * For a single mm user, single threaded process, we can
		 * simply issue a memory barrier after setting
		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
		 * no memory access following registration is reordered
		 * before registration.
		 */
		smp_mb();
	} else {
		/*
		 * For multi-mm user threads, we need to ensure all
		 * future scheduler executions will observe the new
		 * thread flag state for this mm.
		 */
		synchronize_sched();
	}
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);
	return 0;
}
static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if (atomic_read(&mm->membarrier_state) & state)
		return 0;
	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
			  &mm->membarrier_state);
	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
		/*
		 * Ensure all future scheduler executions will observe the
		 * new thread flag state for this process.
		 */
		synchronize_sched();
	}
	atomic_or(state, &mm->membarrier_state);

	return 0;
}
/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with the flags argument set to 0,
 * this system call is guaranteed to always return the same value until
 * reboot.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb()   sys_membarrier()
 *        barrier()          X           X              O
 *        smp_mb()           X           O              O
 *        sys_membarrier()   O           O              O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_sched();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	default:
		return -EINVAL;
	}
}
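
/*
 * Illustrative userspace usage, a minimal sketch shown here only as a
 * comment (it is not part of this file and is not compiled with it). It
 * assumes the uapi header <linux/membarrier.h> and a raw syscall(2)
 * wrapper, and keeps error handling to the essentials: query the commands
 * supported by the running kernel, register for private expedited
 * membarrier (required, otherwise -EPERM is returned), then issue one,
 * which acts as a full memory barrier on all threads of the process.
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int membarrier(int cmd, int flags)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags);
 *	}
 *
 *	int main(void)
 *	{
 *		int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);
 *
 *		if (mask < 0 || !(mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED))
 *			return 1;
 *		if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
 *			return 1;
 *		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
 *			return 1;
 *		return 0;
 *	}
 */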