[openadk.git] target/linux/patches/4.4.302/patch-realtime
1 diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt
2 new file mode 100644
3 index 000000000000..cb61516483d3
4 --- /dev/null
5 +++ b/Documentation/hwlat_detector.txt
6 @@ -0,0 +1,64 @@
7 +Introduction:
8 +-------------
10 +The module hwlat_detector is a special purpose kernel module that is used to
11 +detect large system latencies induced by the behavior of certain underlying
12 +hardware or firmware, independent of Linux itself. The code was developed
13 +originally to detect SMIs (System Management Interrupts) on x86 systems;
14 +however, there is nothing x86-specific about this patchset. It was
15 +originally written for use by the "RT" patch since the Real Time
16 +kernel is highly latency sensitive.
18 +SMIs are usually not serviced by the Linux kernel, which typically does not
19 +even know that they are occurring. SMIs are instead set up by BIOS code
20 +and are serviced by BIOS code, usually for "critical" events such as
21 +management of thermal sensors and fans. Sometimes though, SMIs are used for
22 +other tasks and those tasks can spend an inordinate amount of time in the
23 +handler (sometimes measured in milliseconds). Obviously this is a problem if
24 +you are trying to keep event service latencies down in the microsecond range.
26 +The hardware latency detector works by hogging all of the cpus for configurable
27 +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
28 +for some period, then looking for gaps in the TSC data. Any gap indicates a
29 +time when the polling was interrupted; since the machine is stopped and
30 +interrupts are turned off, the only thing that could have caused it is an SMI.
32 +Note that the SMI detector should *NEVER* be used in a production environment.
33 +It is intended to be run manually to determine if the hardware platform has a
34 +problem with long system firmware service routines.
36 +Usage:
37 +------
39 +Loading the module hwlat_detector with the parameter "enabled=1" (or
40 +toggling on the "enable" entry in the "hwlat_detector" debugfs directory) is the only
41 +step required to start the hwlat_detector. It is possible to redefine the
42 +threshold in microseconds (us) above which latency spikes will be taken
43 +into account (parameter "threshold=").
45 +Example:
47 +       # modprobe hwlat_detector enabled=1 threshold=100
49 +After the module is loaded, it creates a directory named "hwlat_detector" under
50 +the debugfs mountpoint ("/debug/hwlat_detector" in this text). It is necessary
51 +to have debugfs mounted, which might be at /sys/debug on your system.
53 +The /debug/hwlat_detector interface contains the following files:
55 +count                  - number of latency spikes observed since last reset
56 +enable                 - a global enable/disable toggle (0/1), resets count
57 +max                    - maximum hardware latency actually observed (usecs)
58 +sample                 - a pipe from which to read current raw sample data
59 +                         in the format <timestamp> <latency observed usecs>
60 +                         (can be opened O_NONBLOCK for a single sample)
61 +threshold              - minimum latency value to be considered (usecs)
62 +width                  - time period to sample with CPUs held (usecs)
63 +                         must be less than the total window size (enforced)
64 +window                 - total period of sampling, width being inside (usecs)
66 +By default we will set width to 500,000 and window to 1,000,000, meaning that
67 +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
68 +observe any latencies that exceed the threshold (initially 100 usecs),
69 +then we write to a global sample ring buffer of 8K samples, which is
70 +consumed by reading from the "sample" (pipe) debugfs file interface.
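For reference, a complete tuning session might look like the following sketch. It assumes debugfs is mounted at /debug as in the text above; the width, window, threshold, sample and max files are the ones listed earlier, and the values chosen here are only illustrative:

       # modprobe hwlat_detector enabled=1
       # echo 250000  > /debug/hwlat_detector/width      # sample for 0.25 s...
       # echo 1000000 > /debug/hwlat_detector/window     # ...out of every 1 s
       # echo 50      > /debug/hwlat_detector/threshold  # count spikes of 50 usecs or more
       # cat /debug/hwlat_detector/sample                # blocks until a spike is recorded
       # cat /debug/hwlat_detector/max                   # worst latency observed so far (usecs)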
71 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
72 index 39280b72f27a..c46295a8d55b 100644
73 --- a/Documentation/kernel-parameters.txt
74 +++ b/Documentation/kernel-parameters.txt
75 @@ -1640,6 +1640,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
76         ip=             [IP_PNP]
77                         See Documentation/filesystems/nfs/nfsroot.txt.
79 +       irqaffinity=    [SMP] Set the default irq affinity mask
80 +                       Format:
81 +                       <cpu number>,...,<cpu number>
82 +                       or
83 +                       <cpu number>-<cpu number>
84 +                       (must be a positive range in ascending order)
85 +                       or a mixture
86 +                       <cpu number>,...,<cpu number>-<cpu number>
88         irqfixup        [HW]
89                         When an interrupt is not handled search all handlers
90                         for it. Intended to get systems with badly broken
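As an illustration, the default affinity could be restricted on the kernel command line so that newly requested interrupts avoid CPUs set aside for latency-sensitive work (a sketch; the CPU numbers are only an example):

       irqaffinity=0-1

The resulting default mask can then be checked in /proc/irq/default_smp_affinity after boot.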
91 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
92 index 13f5619b2203..f64d075ba647 100644
93 --- a/Documentation/sysrq.txt
94 +++ b/Documentation/sysrq.txt
95 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
96  On other - If you know of the key combos for other architectures, please
97             let me know so I can add them to this section.
99 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
101 +On all -  write a character to /proc/sysrq-trigger, e.g.:
102                 echo t > /proc/sysrq-trigger
104 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
105 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
106 +        Send an ICMP echo request with this pattern plus the particular
107 +        SysRq command key. Example:
108 +               # ping -c1 -s57 -p0102030468
109 +        will trigger the SysRq-H (help) command.
112  *  What are the 'command' keys?
113  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
114  'b'     - Will immediately reboot the system without syncing or unmounting
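Following the example above, the last byte of the ping pattern is the ASCII code of the desired command key, appended to the 0x01020304 cookie. A sketch for SysRq-T (show tasks), assuming the same cookie has been written to icmp_echo_sysrq:

       # ping -c1 -s57 -p0102030474

Here 0x74 is the ASCII code of 't'.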
115 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
116 new file mode 100644
117 index 000000000000..6f2aeabf7faa
118 --- /dev/null
119 +++ b/Documentation/trace/histograms.txt
120 @@ -0,0 +1,186 @@
121 +               Using the Linux Kernel Latency Histograms
124 +This document gives a short explanation of how to enable, configure and use
125 +latency histograms. Latency histograms are primarily relevant in the
126 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
127 +and are used in the quality management of the Linux real-time
128 +capabilities.
131 +* Purpose of latency histograms
133 +A latency histogram continuously accumulates the frequencies of latency
134 +data. There are two types of histograms:
135 +- potential sources of latencies
136 +- effective latencies
139 +* Potential sources of latencies
141 +Potential sources of latencies are code segments where interrupts,
142 +preemption or both are disabled (aka critical sections). To create
143 +histograms of potential sources of latency, the kernel stores the time
144 +stamp at the start of a critical section, determines the time elapsed
145 +when the end of the section is reached, and increments the frequency
146 +counter of that latency value - irrespective of whether any concurrently
147 +running process is affected by latency or not.
148 +- Configuration items (in the Kernel hacking/Tracers submenu)
149 +  CONFIG_INTERRUPT_OFF_LATENCY
150 +  CONFIG_PREEMPT_OFF_LATENCY
153 +* Effective latencies
155 +Effective latencies are those actually occurring during wakeup of a process. To
156 +determine effective latencies, the kernel stores the time stamp when a
157 +process is scheduled to be woken up, and determines the duration of the
158 +wakeup time shortly before control is passed over to this process. Note
159 +that the apparent latency in user space may be somewhat longer, since the
160 +process may be interrupted after control is passed over to it but before
161 +the execution in user space takes place. Simply measuring the interval
162 +between enqueuing and wakeup may also not be appropriate in cases when a
163 +process is scheduled as a result of a timer expiration. The timer may have
164 +missed its deadline, e.g. due to disabled interrupts, but this latency
165 +would not be registered. Therefore, the offsets of missed timers are
166 +recorded in a separate histogram. If both wakeup latency and missed timer
167 +offsets are configured and enabled, a third histogram may be enabled that
168 +records the overall latency as a sum of the timer latency, if any, and the
169 +wakeup latency. This histogram is called "timerandwakeup".
170 +- Configuration items (in the Kernel hacking/Tracers submenu)
171 +  CONFIG_WAKEUP_LATENCY
172 +  CONFIG_MISSED_TIMER_OFFSETS
175 +* Usage
177 +The interface to the administration of the latency histograms is located
178 +in the debugfs file system. To mount it, either enter
180 +mount -t sysfs nodev /sys
181 +mount -t debugfs nodev /sys/kernel/debug
183 +from the shell command line, or add
185 +nodev  /sys                    sysfs   defaults        0 0
186 +nodev  /sys/kernel/debug       debugfs defaults        0 0
188 +to the file /etc/fstab. All latency histogram related files are then
189 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
190 +particular histogram type is enabled by writing non-zero to the related
191 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
192 +Select "preemptirqsoff" for the histograms of potential sources of
193 +latencies and "wakeup" for histograms of effective latencies etc. The
194 +histogram data - one per CPU - are available in the files
196 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
197 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
198 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
199 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
200 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
201 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
202 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
204 +The histograms are reset by writing non-zero to the file "reset" in a
205 +particular latency directory. To reset all latency data, use
207 +#!/bin/sh
209 +TRACINGDIR=/sys/kernel/debug/tracing
210 +HISTDIR=$TRACINGDIR/latency_hist
212 +if test -d $HISTDIR
213 +then
214 +  cd $HISTDIR
215 +  for i in `find . | grep /reset$`
216 +  do
217 +    echo 1 >$i
218 +  done
219 +fi
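A typical session to record and inspect wakeup latencies might then look like the following sketch, using the enable, CPUx and reset files described above:

       # echo 1 > /sys/kernel/debug/tracing/latency_hist/enable/wakeup
       (run the workload of interest for a while)
       # cat /sys/kernel/debug/tracing/latency_hist/wakeup/CPU0
       # echo 1 > /sys/kernel/debug/tracing/latency_hist/wakeup/reset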
222 +* Data format
224 +Latency data are stored with a resolution of one microsecond. The
225 +maximum latency is 10,240 microseconds. The data are only valid if the
226 +overflow register is empty. Every output line contains the latency in
227 +microseconds in the first column and the number of samples in the second
228 +column. To display only lines with a positive latency count, use, for
229 +example,
231 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
233 +#Minimum latency: 0 microseconds.
234 +#Average latency: 0 microseconds.
235 +#Maximum latency: 25 microseconds.
236 +#Total samples: 3104770694
237 +#There are 0 samples greater or equal than 10240 microseconds
238 +#usecs          samples
239 +    0        2984486876
240 +    1          49843506
241 +    2          58219047
242 +    3           5348126
243 +    4           2187960
244 +    5           3388262
245 +    6            959289
246 +    7            208294
247 +    8             40420
248 +    9              4485
249 +   10             14918
250 +   11             18340
251 +   12             25052
252 +   13             19455
253 +   14              5602
254 +   15               969
255 +   16                47
256 +   17                18
257 +   18                14
258 +   19                 1
259 +   20                 3
260 +   21                 2
261 +   22                 5
262 +   23                 2
263 +   25                 1
266 +* Wakeup latency of a selected process
268 +To only collect wakeup latency data of a particular process, write the
269 +PID of the requested process to
271 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
273 +PIDs are not considered if this variable is set to 0.
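For example, to restrict the wakeup histograms to a single task and later return to recording all tasks (a sketch; 1234 stands for the PID of the task under test):

       # echo 1234 > /sys/kernel/debug/tracing/latency_hist/wakeup/pid
       # echo 0 > /sys/kernel/debug/tracing/latency_hist/wakeup/pid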
276 +* Details of the process with the highest wakeup latency so far
278 +Selected data of the process that suffered from the highest wakeup
279 +latency that occurred in a particular CPU are available in the file
281 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
283 +In addition, other relevant system data at the time when the
284 +latency occurred are given.
286 +The format of the data is (all in one line):
287 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
288 +<- <PID> <Priority> <Command> <Timestamp>
290 +The value of <Timeroffset> is only relevant in the combined timer
291 +and wakeup latency recording. In the wakeup recording, it is
292 +always 0; in the missed_timer_offsets recording, it is the same
293 +as <Latency>.
295 +When retrospectively searching for the origin of a latency and
296 +tracing was not enabled, it may be helpful to know the name and
297 +some basic data of the task that (finally) was switching to the
298 +late real-time task. In addition to the victim's data, the
299 +data of the possible culprit are therefore also displayed after the
300 +"<-" symbol.
302 +Finally, the timestamp of the time when the latency occurred
303 +in <seconds>.<microseconds> after the most recent system boot
304 +is provided.
306 +These data are also reset when the wakeup histogram is reset.
307 diff --git a/Makefile b/Makefile
308 index 07070a1e6292..f04a8b6444b9 100644
309 --- a/Makefile
310 +++ b/Makefile
311 @@ -797,6 +797,9 @@ KBUILD_CFLAGS   += $(call cc-option,-Werror=strict-prototypes)
312  # Prohibit date/time macros, which would make the build non-deterministic
313  KBUILD_CFLAGS   += $(call cc-option,-Werror=date-time)
315 +# enforce correct pointer usage
316 +KBUILD_CFLAGS   += $(call cc-option,-Werror=incompatible-pointer-types)
318  # use the deterministic mode of AR if available
319  KBUILD_ARFLAGS := $(call ar-option,D)
321 diff --git a/arch/Kconfig b/arch/Kconfig
322 index 4e949e58b192..3b26d76933fb 100644
323 --- a/arch/Kconfig
324 +++ b/arch/Kconfig
325 @@ -9,6 +9,7 @@ config OPROFILE
326         tristate "OProfile system profiling"
327         depends on PROFILING
328         depends on HAVE_OPROFILE
329 +       depends on !PREEMPT_RT_FULL
330         select RING_BUFFER
331         select RING_BUFFER_ALLOW_SWAP
332         help
333 @@ -52,6 +53,7 @@ config KPROBES
334  config JUMP_LABEL
335         bool "Optimize very unlikely/likely branches"
336         depends on HAVE_ARCH_JUMP_LABEL
337 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
338         help
339           This option enables a transparent branch optimization that
340          makes certain almost-always-true or almost-always-false branch
341 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
342 index 34e1569a11ee..79c4603e9453 100644
343 --- a/arch/arm/Kconfig
344 +++ b/arch/arm/Kconfig
345 @@ -33,7 +33,7 @@ config ARM
346         select HARDIRQS_SW_RESEND
347         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
348         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
349 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
350 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !PREEMPT_RT_BASE
351         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
352         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
353         select HAVE_ARCH_TRACEHOOK
354 @@ -68,6 +68,7 @@ config ARM
355         select HAVE_PERF_EVENTS
356         select HAVE_PERF_REGS
357         select HAVE_PERF_USER_STACK_DUMP
358 +       select HAVE_PREEMPT_LAZY
359         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
360         select HAVE_REGS_AND_STACK_ACCESS_API
361         select HAVE_SYSCALL_TRACEPOINTS
362 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
363 index 12ebfcc1d539..c962084605bc 100644
364 --- a/arch/arm/include/asm/switch_to.h
365 +++ b/arch/arm/include/asm/switch_to.h
366 @@ -3,6 +3,13 @@
368  #include <linux/thread_info.h>
370 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
371 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
372 +#else
373 +static inline void
374 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
375 +#endif
377  /*
378   * For v7 SMP cores running a preemptible kernel we may be pre-empted
379   * during a TLB maintenance operation, so execute an inner-shareable dsb
380 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
381  #define switch_to(prev,next,last)                                      \
382  do {                                                                   \
383         __complete_pending_tlbi();                                      \
384 +       switch_kmaps(prev, next);                                       \
385         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
386  } while (0)
388 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
389 index 776757d1604a..1f36a4eccc72 100644
390 --- a/arch/arm/include/asm/thread_info.h
391 +++ b/arch/arm/include/asm/thread_info.h
392 @@ -49,6 +49,7 @@ struct cpu_context_save {
393  struct thread_info {
394         unsigned long           flags;          /* low level flags */
395         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
396 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
397         mm_segment_t            addr_limit;     /* address limit */
398         struct task_struct      *task;          /* main task structure */
399         __u32                   cpu;            /* cpu */
400 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
401  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
402  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
403  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
404 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
405 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
406 +#define TIF_NEED_RESCHED_LAZY  7
408  #define TIF_NOHZ               12      /* in adaptive nohz mode */
409  #define TIF_USING_IWMMXT       17
410 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
411  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
412  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
413  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
414 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
415  #define _TIF_UPROBE            (1 << TIF_UPROBE)
416  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
417  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
418 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
419   * Change these and you break ASM code in entry-common.S
420   */
421  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
422 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
423 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
424 +                                _TIF_NEED_RESCHED_LAZY)
426  #endif /* __KERNEL__ */
427  #endif /* __ASM_ARM_THREAD_INFO_H */
428 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
429 index 871b8267d211..4dbe70de7318 100644
430 --- a/arch/arm/kernel/asm-offsets.c
431 +++ b/arch/arm/kernel/asm-offsets.c
432 @@ -65,6 +65,7 @@ int main(void)
433    BLANK();
434    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
435    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
436 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
437    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
438    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
439    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
440 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
441 index 3ce377f7251f..d044cea59f54 100644
442 --- a/arch/arm/kernel/entry-armv.S
443 +++ b/arch/arm/kernel/entry-armv.S
444 @@ -215,11 +215,18 @@ ENDPROC(__dabt_svc)
445  #ifdef CONFIG_PREEMPT
446         get_thread_info tsk
447         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
448 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
449         teq     r8, #0                          @ if preempt count != 0
450 +       bne     1f                              @ return from exception
451 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
452 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
453 +       blne    svc_preempt                     @ preempt!
455 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
456 +       teq     r8, #0                          @ if preempt lazy count != 0
457         movne   r0, #0                          @ force flags to 0
458 -       tst     r0, #_TIF_NEED_RESCHED
459 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
460         blne    svc_preempt
462  #endif
464         svc_exit r5, irq = 1                    @ return from exception
465 @@ -234,8 +241,14 @@ ENDPROC(__irq_svc)
466  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
467         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
468         tst     r0, #_TIF_NEED_RESCHED
469 +       bne     1b
470 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
471         reteq   r8                              @ go again
472 -       b       1b
473 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
474 +       teq     r0, #0                          @ if preempt lazy count != 0
475 +       beq     1b
476 +       ret     r8                              @ go again
478  #endif
480  __und_fault:
481 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
482 index 30a7228eaceb..c3bd6cbfce4b 100644
483 --- a/arch/arm/kernel/entry-common.S
484 +++ b/arch/arm/kernel/entry-common.S
485 @@ -36,7 +36,9 @@
486   UNWIND(.cantunwind    )
487         disable_irq_notrace                     @ disable interrupts
488         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
489 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
490 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
491 +       bne     fast_work_pending
492 +       tst     r1, #_TIF_SECCOMP
493         bne     fast_work_pending
495         /* perform architecture specific actions before user return */
496 @@ -62,8 +64,11 @@ ENDPROC(ret_fast_syscall)
497         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
498         disable_irq_notrace                     @ disable interrupts
499         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
500 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
501 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
502 +       bne     do_slower_path
503 +       tst     r1, #_TIF_SECCOMP
504         beq     no_work_pending
505 +do_slower_path:
506   UNWIND(.fnend         )
507  ENDPROC(ret_fast_syscall)
509 diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
510 index 69bda1a5707e..1f665acaa6a9 100644
511 --- a/arch/arm/kernel/patch.c
512 +++ b/arch/arm/kernel/patch.c
513 @@ -15,7 +15,7 @@ struct patch {
514         unsigned int insn;
515  };
517 -static DEFINE_SPINLOCK(patch_lock);
518 +static DEFINE_RAW_SPINLOCK(patch_lock);
520  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
521         __acquires(&patch_lock)
522 @@ -32,7 +32,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
523                 return addr;
525         if (flags)
526 -               spin_lock_irqsave(&patch_lock, *flags);
527 +               raw_spin_lock_irqsave(&patch_lock, *flags);
528         else
529                 __acquire(&patch_lock);
531 @@ -47,7 +47,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
532         clear_fixmap(fixmap);
534         if (flags)
535 -               spin_unlock_irqrestore(&patch_lock, *flags);
536 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
537         else
538                 __release(&patch_lock);
540 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
541 index 4adfb46e3ee9..15f1d94b47c5 100644
542 --- a/arch/arm/kernel/process.c
543 +++ b/arch/arm/kernel/process.c
544 @@ -319,6 +319,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
547  #ifdef CONFIG_MMU
549 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
550 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
551 + * fail.
552 + */
553 +static int __init vectors_user_mapping_init_page(void)
555 +       struct page *page;
556 +       unsigned long addr = 0xffff0000;
557 +       pgd_t *pgd;
558 +       pud_t *pud;
559 +       pmd_t *pmd;
561 +       pgd = pgd_offset_k(addr);
562 +       pud = pud_offset(pgd, addr);
563 +       pmd = pmd_offset(pud, addr);
564 +       page = pmd_page(*(pmd));
566 +       pgtable_page_ctor(page);
568 +       return 0;
570 +late_initcall(vectors_user_mapping_init_page);
572  #ifdef CONFIG_KUSER_HELPERS
573  /*
574   * The vectors page is always readable from user space for the
575 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
576 index 7b8f2141427b..96541e00b74a 100644
577 --- a/arch/arm/kernel/signal.c
578 +++ b/arch/arm/kernel/signal.c
579 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
580          */
581         trace_hardirqs_off();
582         do {
583 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
584 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
585 +                                          _TIF_NEED_RESCHED_LAZY))) {
586                         schedule();
587                 } else {
588                         if (unlikely(!user_mode(regs)))
589 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
590 index b26361355dae..e5754e3b03c4 100644
591 --- a/arch/arm/kernel/smp.c
592 +++ b/arch/arm/kernel/smp.c
593 @@ -230,8 +230,6 @@ int __cpu_disable(void)
594         flush_cache_louis();
595         local_flush_tlb_all();
597 -       clear_tasks_mm_cpumask(cpu);
599         return 0;
602 @@ -247,6 +245,9 @@ void __cpu_die(unsigned int cpu)
603                 pr_err("CPU%u: cpu didn't die\n", cpu);
604                 return;
605         }
607 +       clear_tasks_mm_cpumask(cpu);
609         pr_notice("CPU%u: shutdown\n", cpu);
611         /*
612 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
613 index 0bee233fef9a..314cfb232a63 100644
614 --- a/arch/arm/kernel/unwind.c
615 +++ b/arch/arm/kernel/unwind.c
616 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
617  static const struct unwind_idx *__origin_unwind_idx;
618  extern const struct unwind_idx __stop_unwind_idx[];
620 -static DEFINE_SPINLOCK(unwind_lock);
621 +static DEFINE_RAW_SPINLOCK(unwind_lock);
622  static LIST_HEAD(unwind_tables);
624  /* Convert a prel31 symbol to an absolute address */
625 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
626                 /* module unwind tables */
627                 struct unwind_table *table;
629 -               spin_lock_irqsave(&unwind_lock, flags);
630 +               raw_spin_lock_irqsave(&unwind_lock, flags);
631                 list_for_each_entry(table, &unwind_tables, list) {
632                         if (addr >= table->begin_addr &&
633                             addr < table->end_addr) {
634 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
635                                 break;
636                         }
637                 }
638 -               spin_unlock_irqrestore(&unwind_lock, flags);
639 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
640         }
642         pr_debug("%s: idx = %p\n", __func__, idx);
643 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
644         tab->begin_addr = text_addr;
645         tab->end_addr = text_addr + text_size;
647 -       spin_lock_irqsave(&unwind_lock, flags);
648 +       raw_spin_lock_irqsave(&unwind_lock, flags);
649         list_add_tail(&tab->list, &unwind_tables);
650 -       spin_unlock_irqrestore(&unwind_lock, flags);
651 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
653         return tab;
655 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
656         if (!tab)
657                 return;
659 -       spin_lock_irqsave(&unwind_lock, flags);
660 +       raw_spin_lock_irqsave(&unwind_lock, flags);
661         list_del(&tab->list);
662 -       spin_unlock_irqrestore(&unwind_lock, flags);
663 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
665         kfree(tab);
667 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
668 index d7bef2144760..36a3e51492f7 100644
669 --- a/arch/arm/kvm/arm.c
670 +++ b/arch/arm/kvm/arm.c
671 @@ -496,18 +496,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
672         struct kvm_vcpu *vcpu;
674         kvm_for_each_vcpu(i, vcpu, kvm) {
675 -               wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
676 +               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
678                 vcpu->arch.pause = false;
679 -               wake_up_interruptible(wq);
680 +               swake_up(wq);
681         }
684  static void vcpu_sleep(struct kvm_vcpu *vcpu)
686 -       wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
687 +       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
689 -       wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
690 +       swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
691                                        (!vcpu->arch.pause)));
694 @@ -566,7 +566,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
695                  * involves poking the GIC, which must be done in a
696                  * non-preemptible context.
697                  */
698 -               preempt_disable();
699 +               migrate_disable();
700                 kvm_timer_flush_hwstate(vcpu);
701                 kvm_vgic_flush_hwstate(vcpu);
703 @@ -585,7 +585,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
704                         local_irq_enable();
705                         kvm_timer_sync_hwstate(vcpu);
706                         kvm_vgic_sync_hwstate(vcpu);
707 -                       preempt_enable();
708 +                       migrate_enable();
709                         continue;
710                 }
712 @@ -639,7 +639,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
714                 kvm_vgic_sync_hwstate(vcpu);
716 -               preempt_enable();
717 +               migrate_enable();
719                 ret = handle_exit(vcpu, run, ret);
720         }
721 diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
722 index 443db0c43d7c..a08d7a93aebb 100644
723 --- a/arch/arm/kvm/psci.c
724 +++ b/arch/arm/kvm/psci.c
725 @@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
727         struct kvm *kvm = source_vcpu->kvm;
728         struct kvm_vcpu *vcpu = NULL;
729 -       wait_queue_head_t *wq;
730 +       struct swait_queue_head *wq;
731         unsigned long cpu_id;
732         unsigned long context_id;
733         phys_addr_t target_pc;
734 @@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
735         smp_mb();               /* Make sure the above is visible */
737         wq = kvm_arch_vcpu_wq(vcpu);
738 -       wake_up_interruptible(wq);
739 +       swake_up(wq);
741         return PSCI_RET_SUCCESS;
743 diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig
744 index 28656c2b54a0..3f501305ca26 100644
745 --- a/arch/arm/mach-at91/Kconfig
746 +++ b/arch/arm/mach-at91/Kconfig
747 @@ -99,6 +99,7 @@ config HAVE_AT91_USB_CLK
748  config COMMON_CLK_AT91
749         bool
750         select COMMON_CLK
751 +       select MFD_SYSCON
753  config HAVE_AT91_SMD
754         bool
755 diff --git a/arch/arm/mach-at91/at91rm9200.c b/arch/arm/mach-at91/at91rm9200.c
756 index c1a7c6cc00e1..63b4fa25b48a 100644
757 --- a/arch/arm/mach-at91/at91rm9200.c
758 +++ b/arch/arm/mach-at91/at91rm9200.c
759 @@ -12,7 +12,6 @@
760  #include <linux/of_platform.h>
762  #include <asm/mach/arch.h>
763 -#include <asm/system_misc.h>
765  #include "generic.h"
766  #include "soc.h"
767 @@ -33,7 +32,6 @@ static void __init at91rm9200_dt_device_init(void)
769         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
771 -       arm_pm_idle = at91rm9200_idle;
772         at91rm9200_pm_init();
775 diff --git a/arch/arm/mach-at91/at91sam9.c b/arch/arm/mach-at91/at91sam9.c
776 index 7eb64f763034..cada2a6412b3 100644
777 --- a/arch/arm/mach-at91/at91sam9.c
778 +++ b/arch/arm/mach-at91/at91sam9.c
779 @@ -62,8 +62,6 @@ static void __init at91sam9_common_init(void)
780                 soc_dev = soc_device_to_device(soc);
782         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
784 -       arm_pm_idle = at91sam9_idle;
787  static void __init at91sam9_dt_device_init(void)
788 diff --git a/arch/arm/mach-at91/generic.h b/arch/arm/mach-at91/generic.h
789 index b0fa7dc7286d..28ca57a2060f 100644
790 --- a/arch/arm/mach-at91/generic.h
791 +++ b/arch/arm/mach-at91/generic.h
792 @@ -11,27 +11,18 @@
793  #ifndef _AT91_GENERIC_H
794  #define _AT91_GENERIC_H
796 -#include <linux/of.h>
797 -#include <linux/reboot.h>
799 - /* Map io */
800 -extern void __init at91_map_io(void);
801 -extern void __init at91_alt_map_io(void);
803 -/* idle */
804 -extern void at91rm9200_idle(void);
805 -extern void at91sam9_idle(void);
807  #ifdef CONFIG_PM
808  extern void __init at91rm9200_pm_init(void);
809  extern void __init at91sam9260_pm_init(void);
810  extern void __init at91sam9g45_pm_init(void);
811  extern void __init at91sam9x5_pm_init(void);
812 +extern void __init sama5_pm_init(void);
813  #else
814  static inline void __init at91rm9200_pm_init(void) { }
815  static inline void __init at91sam9260_pm_init(void) { }
816  static inline void __init at91sam9g45_pm_init(void) { }
817  static inline void __init at91sam9x5_pm_init(void) { }
818 +static inline void __init sama5_pm_init(void) { }
819  #endif
821  #endif /* _AT91_GENERIC_H */
822 diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
823 index 84eefbc2b4f9..bff0d062bf68 100644
824 --- a/arch/arm/mach-at91/pm.c
825 +++ b/arch/arm/mach-at91/pm.c
826 @@ -31,10 +31,13 @@
827  #include <asm/mach/irq.h>
828  #include <asm/fncpy.h>
829  #include <asm/cacheflush.h>
830 +#include <asm/system_misc.h>
832  #include "generic.h"
833  #include "pm.h"
835 +static void __iomem *pmc;
837  /*
838   * FIXME: this is needed to communicate between the pinctrl driver and
839   * the PM implementation in the machine. Possibly part of the PM
840 @@ -87,7 +90,7 @@ static int at91_pm_verify_clocks(void)
841         unsigned long scsr;
842         int i;
844 -       scsr = at91_pmc_read(AT91_PMC_SCSR);
845 +       scsr = readl(pmc + AT91_PMC_SCSR);
847         /* USB must not be using PLLB */
848         if ((scsr & at91_pm_data.uhp_udp_mask) != 0) {
849 @@ -101,8 +104,7 @@ static int at91_pm_verify_clocks(void)
851                 if ((scsr & (AT91_PMC_PCK0 << i)) == 0)
852                         continue;
854 -               css = at91_pmc_read(AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
855 +               css = readl(pmc + AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
856                 if (css != AT91_PMC_CSS_SLOW) {
857                         pr_err("AT91: PM - Suspend-to-RAM with PCK%d src %d\n", i, css);
858                         return 0;
859 @@ -145,8 +147,8 @@ static void at91_pm_suspend(suspend_state_t state)
860         flush_cache_all();
861         outer_disable();
863 -       at91_suspend_sram_fn(at91_pmc_base, at91_ramc_base[0],
864 -                               at91_ramc_base[1], pm_data);
865 +       at91_suspend_sram_fn(pmc, at91_ramc_base[0],
866 +                            at91_ramc_base[1], pm_data);
868         outer_resume();
870 @@ -369,6 +371,21 @@ static __init void at91_dt_ramc(void)
871         at91_pm_set_standby(standby);
874 +void at91rm9200_idle(void)
876 +       /*
877 +        * Disable the processor clock.  The processor will be automatically
878 +        * re-enabled by an interrupt or by a reset.
879 +        */
880 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
883 +void at91sam9_idle(void)
885 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
886 +       cpu_do_idle();
889  static void __init at91_pm_sram_init(void)
891         struct gen_pool *sram_pool;
892 @@ -415,13 +432,36 @@ static void __init at91_pm_sram_init(void)
893                         &at91_pm_suspend_in_sram, at91_pm_suspend_in_sram_sz);
896 -static void __init at91_pm_init(void)
897 +static const struct of_device_id atmel_pmc_ids[] __initconst = {
898 +       { .compatible = "atmel,at91rm9200-pmc"  },
899 +       { .compatible = "atmel,at91sam9260-pmc" },
900 +       { .compatible = "atmel,at91sam9g45-pmc" },
901 +       { .compatible = "atmel,at91sam9n12-pmc" },
902 +       { .compatible = "atmel,at91sam9x5-pmc" },
903 +       { .compatible = "atmel,sama5d3-pmc" },
904 +       { .compatible = "atmel,sama5d2-pmc" },
905 +       { /* sentinel */ },
908 +static void __init at91_pm_init(void (*pm_idle)(void))
910 -       at91_pm_sram_init();
911 +       struct device_node *pmc_np;
913         if (at91_cpuidle_device.dev.platform_data)
914                 platform_device_register(&at91_cpuidle_device);
916 +       pmc_np = of_find_matching_node(NULL, atmel_pmc_ids);
917 +       pmc = of_iomap(pmc_np, 0);
918 +       if (!pmc) {
919 +               pr_err("AT91: PM not supported, PMC not found\n");
920 +               return;
921 +       }
923 +       if (pm_idle)
924 +               arm_pm_idle = pm_idle;
926 +       at91_pm_sram_init();
928         if (at91_suspend_sram_fn)
929                 suspend_set_ops(&at91_pm_ops);
930         else
931 @@ -440,7 +480,7 @@ void __init at91rm9200_pm_init(void)
932         at91_pm_data.uhp_udp_mask = AT91RM9200_PMC_UHP | AT91RM9200_PMC_UDP;
933         at91_pm_data.memctrl = AT91_MEMCTRL_MC;
935 -       at91_pm_init();
936 +       at91_pm_init(at91rm9200_idle);
939  void __init at91sam9260_pm_init(void)
940 @@ -448,7 +488,7 @@ void __init at91sam9260_pm_init(void)
941         at91_dt_ramc();
942         at91_pm_data.memctrl = AT91_MEMCTRL_SDRAMC;
943         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
944 -       return at91_pm_init();
945 +       at91_pm_init(at91sam9_idle);
948  void __init at91sam9g45_pm_init(void)
949 @@ -456,7 +496,7 @@ void __init at91sam9g45_pm_init(void)
950         at91_dt_ramc();
951         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP;
952         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
953 -       return at91_pm_init();
954 +       at91_pm_init(at91sam9_idle);
957  void __init at91sam9x5_pm_init(void)
958 @@ -464,5 +504,13 @@ void __init at91sam9x5_pm_init(void)
959         at91_dt_ramc();
960         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
961         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
962 -       return at91_pm_init();
963 +       at91_pm_init(at91sam9_idle);
966 +void __init sama5_pm_init(void)
968 +       at91_dt_ramc();
969 +       at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
970 +       at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
971 +       at91_pm_init(NULL);
973 diff --git a/arch/arm/mach-at91/sama5.c b/arch/arm/mach-at91/sama5.c
974 index d9cf6799aec0..df8fdf1cf66d 100644
975 --- a/arch/arm/mach-at91/sama5.c
976 +++ b/arch/arm/mach-at91/sama5.c
977 @@ -51,7 +51,7 @@ static void __init sama5_dt_device_init(void)
978                 soc_dev = soc_device_to_device(soc);
980         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
981 -       at91sam9x5_pm_init();
982 +       sama5_pm_init();
985  static const char *const sama5_dt_board_compat[] __initconst = {
986 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
987 index 98a2c0cbb833..310dce500d3e 100644
988 --- a/arch/arm/mach-exynos/platsmp.c
989 +++ b/arch/arm/mach-exynos/platsmp.c
990 @@ -230,7 +230,7 @@ static void __iomem *scu_base_addr(void)
991         return (void __iomem *)(S5P_VA_SCU);
994 -static DEFINE_SPINLOCK(boot_lock);
995 +static DEFINE_RAW_SPINLOCK(boot_lock);
997  static void exynos_secondary_init(unsigned int cpu)
999 @@ -243,8 +243,8 @@ static void exynos_secondary_init(unsigned int cpu)
1000         /*
1001          * Synchronise with the boot thread.
1002          */
1003 -       spin_lock(&boot_lock);
1004 -       spin_unlock(&boot_lock);
1005 +       raw_spin_lock(&boot_lock);
1006 +       raw_spin_unlock(&boot_lock);
1009  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
1010 @@ -308,7 +308,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
1011          * Set synchronisation state between this boot processor
1012          * and the secondary one
1013          */
1014 -       spin_lock(&boot_lock);
1015 +       raw_spin_lock(&boot_lock);
1017         /*
1018          * The secondary processor is waiting to be released from
1019 @@ -335,7 +335,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
1021                 if (timeout == 0) {
1022                         printk(KERN_ERR "cpu1 power enable failed");
1023 -                       spin_unlock(&boot_lock);
1024 +                       raw_spin_unlock(&boot_lock);
1025                         return -ETIMEDOUT;
1026                 }
1027         }
1028 @@ -381,7 +381,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
1029          * calibrations, then wait for it to finish
1030          */
1031  fail:
1032 -       spin_unlock(&boot_lock);
1033 +       raw_spin_unlock(&boot_lock);
1035         return pen_release != -1 ? ret : 0;
1037 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
1038 index b5f8f5ffda79..9753a84df9c4 100644
1039 --- a/arch/arm/mach-hisi/platmcpm.c
1040 +++ b/arch/arm/mach-hisi/platmcpm.c
1041 @@ -61,7 +61,7 @@
1043  static void __iomem *sysctrl, *fabric;
1044  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
1045 -static DEFINE_SPINLOCK(boot_lock);
1046 +static DEFINE_RAW_SPINLOCK(boot_lock);
1047  static u32 fabric_phys_addr;
1048  /*
1049   * [0]: bootwrapper physical address
1050 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1051         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
1052                 return -EINVAL;
1054 -       spin_lock_irq(&boot_lock);
1055 +       raw_spin_lock_irq(&boot_lock);
1057         if (hip04_cpu_table[cluster][cpu])
1058                 goto out;
1059 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1061  out:
1062         hip04_cpu_table[cluster][cpu]++;
1063 -       spin_unlock_irq(&boot_lock);
1064 +       raw_spin_unlock_irq(&boot_lock);
1066         return 0;
1068 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
1069         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
1070         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
1072 -       spin_lock(&boot_lock);
1073 +       raw_spin_lock(&boot_lock);
1074         hip04_cpu_table[cluster][cpu]--;
1075         if (hip04_cpu_table[cluster][cpu] == 1) {
1076                 /* A power_up request went ahead of us. */
1077 -               spin_unlock(&boot_lock);
1078 +               raw_spin_unlock(&boot_lock);
1079                 return;
1080         } else if (hip04_cpu_table[cluster][cpu] > 1) {
1081                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
1082 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
1083         }
1085         last_man = hip04_cluster_is_down(cluster);
1086 -       spin_unlock(&boot_lock);
1087 +       raw_spin_unlock(&boot_lock);
1088         if (last_man) {
1089                 /* Since it's Cortex A15, disable L2 prefetching. */
1090                 asm volatile(
1091 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1092                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
1094         count = TIMEOUT_MSEC / POLL_MSEC;
1095 -       spin_lock_irq(&boot_lock);
1096 +       raw_spin_lock_irq(&boot_lock);
1097         for (tries = 0; tries < count; tries++) {
1098                 if (hip04_cpu_table[cluster][cpu])
1099                         goto err;
1100 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1101                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
1102                 if (data & CORE_WFI_STATUS(cpu))
1103                         break;
1104 -               spin_unlock_irq(&boot_lock);
1105 +               raw_spin_unlock_irq(&boot_lock);
1106                 /* Wait for clean L2 when the whole cluster is down. */
1107                 msleep(POLL_MSEC);
1108 -               spin_lock_irq(&boot_lock);
1109 +               raw_spin_lock_irq(&boot_lock);
1110         }
1111         if (tries >= count)
1112                 goto err;
1113 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1114                 goto err;
1115         if (hip04_cluster_is_down(cluster))
1116                 hip04_set_snoop_filter(cluster, 0);
1117 -       spin_unlock_irq(&boot_lock);
1118 +       raw_spin_unlock_irq(&boot_lock);
1119         return 1;
1120  err:
1121 -       spin_unlock_irq(&boot_lock);
1122 +       raw_spin_unlock_irq(&boot_lock);
1123         return 0;
1125  #endif
1126 diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig
1127 index 8ceda2844c4f..08bcf8fb76f2 100644
1128 --- a/arch/arm/mach-imx/Kconfig
1129 +++ b/arch/arm/mach-imx/Kconfig
1130 @@ -524,7 +524,7 @@ config SOC_IMX6Q
1131         bool "i.MX6 Quad/DualLite support"
1132         select ARM_ERRATA_764369 if SMP
1133         select HAVE_ARM_SCU if SMP
1134 -       select HAVE_ARM_TWD if SMP
1135 +       select HAVE_ARM_TWD
1136         select PCI_DOMAINS if PCI
1137         select PINCTRL_IMX6Q
1138         select SOC_IMX6
1139 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
1140 index 79e1f876d1c9..7e625c17f78e 100644
1141 --- a/arch/arm/mach-omap2/omap-smp.c
1142 +++ b/arch/arm/mach-omap2/omap-smp.c
1143 @@ -43,7 +43,7 @@
1144  /* SCU base address */
1145  static void __iomem *scu_base;
1147 -static DEFINE_SPINLOCK(boot_lock);
1148 +static DEFINE_RAW_SPINLOCK(boot_lock);
1150  void __iomem *omap4_get_scu_base(void)
1152 @@ -74,8 +74,8 @@ static void omap4_secondary_init(unsigned int cpu)
1153         /*
1154          * Synchronise with the boot thread.
1155          */
1156 -       spin_lock(&boot_lock);
1157 -       spin_unlock(&boot_lock);
1158 +       raw_spin_lock(&boot_lock);
1159 +       raw_spin_unlock(&boot_lock);
1162  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1163 @@ -89,7 +89,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1164          * Set synchronisation state between this boot processor
1165          * and the secondary one
1166          */
1167 -       spin_lock(&boot_lock);
1168 +       raw_spin_lock(&boot_lock);
1170         /*
1171          * Update the AuxCoreBoot0 with boot state for secondary core.
1172 @@ -166,7 +166,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1173          * Now the secondary core is starting up let it run its
1174          * calibrations, then wait for it to finish
1175          */
1176 -       spin_unlock(&boot_lock);
1177 +       raw_spin_unlock(&boot_lock);
1179         return 0;
1181 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
1182 index e46c91094dde..dcb3ed0c26da 100644
1183 --- a/arch/arm/mach-prima2/platsmp.c
1184 +++ b/arch/arm/mach-prima2/platsmp.c
1185 @@ -22,7 +22,7 @@
1187  static void __iomem *clk_base;
1189 -static DEFINE_SPINLOCK(boot_lock);
1190 +static DEFINE_RAW_SPINLOCK(boot_lock);
1192  static void sirfsoc_secondary_init(unsigned int cpu)
1194 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
1195         /*
1196          * Synchronise with the boot thread.
1197          */
1198 -       spin_lock(&boot_lock);
1199 -       spin_unlock(&boot_lock);
1200 +       raw_spin_lock(&boot_lock);
1201 +       raw_spin_unlock(&boot_lock);
1204  static const struct of_device_id clk_ids[]  = {
1205 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1206         /* make sure write buffer is drained */
1207         mb();
1209 -       spin_lock(&boot_lock);
1210 +       raw_spin_lock(&boot_lock);
1212         /*
1213          * The secondary processor is waiting to be released from
1214 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1215          * now the secondary core is starting up let it run its
1216          * calibrations, then wait for it to finish
1217          */
1218 -       spin_unlock(&boot_lock);
1219 +       raw_spin_unlock(&boot_lock);
1221         return pen_release != -1 ? -ENOSYS : 0;
1223 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
1224 index 9b00123a315d..0a49fe1bc8cf 100644
1225 --- a/arch/arm/mach-qcom/platsmp.c
1226 +++ b/arch/arm/mach-qcom/platsmp.c
1227 @@ -46,7 +46,7 @@
1229  extern void secondary_startup_arm(void);
1231 -static DEFINE_SPINLOCK(boot_lock);
1232 +static DEFINE_RAW_SPINLOCK(boot_lock);
1234  #ifdef CONFIG_HOTPLUG_CPU
1235  static void qcom_cpu_die(unsigned int cpu)
1236 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
1237         /*
1238          * Synchronise with the boot thread.
1239          */
1240 -       spin_lock(&boot_lock);
1241 -       spin_unlock(&boot_lock);
1242 +       raw_spin_lock(&boot_lock);
1243 +       raw_spin_unlock(&boot_lock);
1246  static int scss_release_secondary(unsigned int cpu)
1247 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1248          * set synchronisation state between this boot processor
1249          * and the secondary one
1250          */
1251 -       spin_lock(&boot_lock);
1252 +       raw_spin_lock(&boot_lock);
1254         /*
1255          * Send the secondary CPU a soft interrupt, thereby causing
1256 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1257          * now the secondary core is starting up let it run its
1258          * calibrations, then wait for it to finish
1259          */
1260 -       spin_unlock(&boot_lock);
1261 +       raw_spin_unlock(&boot_lock);
1263         return ret;
1265 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
1266 index fd4297713d67..b0553b2c2d53 100644
1267 --- a/arch/arm/mach-spear/platsmp.c
1268 +++ b/arch/arm/mach-spear/platsmp.c
1269 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1270         sync_cache_w(&pen_release);
1273 -static DEFINE_SPINLOCK(boot_lock);
1274 +static DEFINE_RAW_SPINLOCK(boot_lock);
1276  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
1278 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
1279         /*
1280          * Synchronise with the boot thread.
1281          */
1282 -       spin_lock(&boot_lock);
1283 -       spin_unlock(&boot_lock);
1284 +       raw_spin_lock(&boot_lock);
1285 +       raw_spin_unlock(&boot_lock);
1288  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1289 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1290          * set synchronisation state between this boot processor
1291          * and the secondary one
1292          */
1293 -       spin_lock(&boot_lock);
1294 +       raw_spin_lock(&boot_lock);
1296         /*
1297          * The secondary processor is waiting to be released from
1298 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1299          * now the secondary core is starting up let it run its
1300          * calibrations, then wait for it to finish
1301          */
1302 -       spin_unlock(&boot_lock);
1303 +       raw_spin_unlock(&boot_lock);
1305         return pen_release != -1 ? -ENOSYS : 0;
1307 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
1308 index c4ad6eae67fa..e830b20b212f 100644
1309 --- a/arch/arm/mach-sti/platsmp.c
1310 +++ b/arch/arm/mach-sti/platsmp.c
1311 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
1312         sync_cache_w(&pen_release);
1315 -static DEFINE_SPINLOCK(boot_lock);
1316 +static DEFINE_RAW_SPINLOCK(boot_lock);
1318  static void sti_secondary_init(unsigned int cpu)
1320 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
1321         /*
1322          * Synchronise with the boot thread.
1323          */
1324 -       spin_lock(&boot_lock);
1325 -       spin_unlock(&boot_lock);
1326 +       raw_spin_lock(&boot_lock);
1327 +       raw_spin_unlock(&boot_lock);
1330  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1331 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1332          * set synchronisation state between this boot processor
1333          * and the secondary one
1334          */
1335 -       spin_lock(&boot_lock);
1336 +       raw_spin_lock(&boot_lock);
1338         /*
1339          * The secondary processor is waiting to be released from
1340 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1341          * now the secondary core is starting up let it run its
1342          * calibrations, then wait for it to finish
1343          */
1344 -       spin_unlock(&boot_lock);
1345 +       raw_spin_unlock(&boot_lock);
1347         return pen_release != -1 ? -ENOSYS : 0;
1349 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
1350 index 0d20cd594017..a11dc6d8ca02 100644
1351 --- a/arch/arm/mm/fault.c
1352 +++ b/arch/arm/mm/fault.c
1353 @@ -433,6 +433,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1354         if (addr < TASK_SIZE)
1355                 return do_page_fault(addr, fsr, regs);
1357 +       if (interrupts_enabled(regs))
1358 +               local_irq_enable();
1360         if (user_mode(regs))
1361                 goto bad_area;
1363 @@ -500,6 +503,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1364  static int
1365  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
1367 +       if (interrupts_enabled(regs))
1368 +               local_irq_enable();
1370         do_bad_area(addr, fsr, regs);
1371         return 0;
1373 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
1374 index d02f8187b1cc..542692dbd40a 100644
1375 --- a/arch/arm/mm/highmem.c
1376 +++ b/arch/arm/mm/highmem.c
1377 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
1378         return *ptep;
1381 +static unsigned int fixmap_idx(int type)
1383 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1386  void *kmap(struct page *page)
1388         might_sleep();
1389 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
1391  void *kmap_atomic(struct page *page)
1393 +       pte_t pte = mk_pte(page, kmap_prot);
1394         unsigned int idx;
1395         unsigned long vaddr;
1396         void *kmap;
1397         int type;
1399 -       preempt_disable();
1400 +       preempt_disable_nort();
1401         pagefault_disable();
1402         if (!PageHighMem(page))
1403                 return page_address(page);
1404 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1406         type = kmap_atomic_idx_push();
1408 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1409 +       idx = fixmap_idx(type);
1410         vaddr = __fix_to_virt(idx);
1411  #ifdef CONFIG_DEBUG_HIGHMEM
1412         /*
1413 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1414          * in place, so the contained TLB flush ensures the TLB is updated
1415          * with the new mapping.
1416          */
1417 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1418 +#ifdef CONFIG_PREEMPT_RT_FULL
1419 +       current->kmap_pte[type] = pte;
1420 +#endif
1421 +       set_fixmap_pte(idx, pte);
1423         return (void *)vaddr;
1425 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1427         if (kvaddr >= (void *)FIXADDR_START) {
1428                 type = kmap_atomic_idx();
1429 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1430 +               idx = fixmap_idx(type);
1432                 if (cache_is_vivt())
1433                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1434 +#ifdef CONFIG_PREEMPT_RT_FULL
1435 +               current->kmap_pte[type] = __pte(0);
1436 +#endif
1437  #ifdef CONFIG_DEBUG_HIGHMEM
1438                 BUG_ON(vaddr != __fix_to_virt(idx));
1439 -               set_fixmap_pte(idx, __pte(0));
1440  #else
1441                 (void) idx;  /* to kill a warning */
1442  #endif
1443 +               set_fixmap_pte(idx, __pte(0));
1444                 kmap_atomic_idx_pop();
1445         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1446                 /* this address was obtained through kmap_high_get() */
1447                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1448         }
1449         pagefault_enable();
1450 -       preempt_enable();
1451 +       preempt_enable_nort();
1453  EXPORT_SYMBOL(__kunmap_atomic);
1455  void *kmap_atomic_pfn(unsigned long pfn)
1457 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1458         unsigned long vaddr;
1459         int idx, type;
1460         struct page *page = pfn_to_page(pfn);
1462 -       preempt_disable();
1463 +       preempt_disable_nort();
1464         pagefault_disable();
1465         if (!PageHighMem(page))
1466                 return page_address(page);
1468         type = kmap_atomic_idx_push();
1469 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1470 +       idx = fixmap_idx(type);
1471         vaddr = __fix_to_virt(idx);
1472  #ifdef CONFIG_DEBUG_HIGHMEM
1473         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1474  #endif
1475 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1476 +#ifdef CONFIG_PREEMPT_RT_FULL
1477 +       current->kmap_pte[type] = pte;
1478 +#endif
1479 +       set_fixmap_pte(idx, pte);
1481         return (void *)vaddr;
1483 +#if defined CONFIG_PREEMPT_RT_FULL
1484 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1486 +       int i;
1488 +       /*
1489 +        * Clear @prev's kmap_atomic mappings
1490 +        */
1491 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1492 +               int idx = fixmap_idx(i);
1494 +               set_fixmap_pte(idx, __pte(0));
1495 +       }
1496 +       /*
1497 +        * Restore @next_p's kmap_atomic mappings
1498 +        */
1499 +       for (i = 0; i < next_p->kmap_idx; i++) {
1500 +               int idx = fixmap_idx(i);
1502 +               if (!pte_none(next_p->kmap_pte[i]))
1503 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1504 +       }
1506 +#endif
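The hunks above record each atomic kmap's pte in current->kmap_pte[] when CONFIG_PREEMPT_RT_FULL is set, and switch_kmaps() tears down the outgoing task's fixmap slots and re-installs the incoming task's saved ptes. The call site is not part of this excerpt; the sketch below assumes a context-switch hook purely for illustration:

        #include <linux/sched.h>

        /* defined in arch/arm/mm/highmem.c by the hunk above (RT only) */
        void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);

        /* assumed hook: where the scheduler would hand over the kmap slots */
        static inline void example_finish_switch(struct task_struct *prev,
                                                 struct task_struct *next)
        {
        #ifdef CONFIG_PREEMPT_RT_FULL
                /* drop @prev's fixmap entries, re-install @next's saved ptes */
                switch_kmaps(prev, next);
        #endif
        }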
1507 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1508 index 53feb90c840c..b4a8d54fc3f3 100644
1509 --- a/arch/arm/plat-versatile/platsmp.c
1510 +++ b/arch/arm/plat-versatile/platsmp.c
1511 @@ -30,7 +30,7 @@ static void write_pen_release(int val)
1512         sync_cache_w(&pen_release);
1515 -static DEFINE_SPINLOCK(boot_lock);
1516 +static DEFINE_RAW_SPINLOCK(boot_lock);
1518  void versatile_secondary_init(unsigned int cpu)
1520 @@ -43,8 +43,8 @@ void versatile_secondary_init(unsigned int cpu)
1521         /*
1522          * Synchronise with the boot thread.
1523          */
1524 -       spin_lock(&boot_lock);
1525 -       spin_unlock(&boot_lock);
1526 +       raw_spin_lock(&boot_lock);
1527 +       raw_spin_unlock(&boot_lock);
1530  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1531 @@ -55,7 +55,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1532          * Set synchronisation state between this boot processor
1533          * and the secondary one
1534          */
1535 -       spin_lock(&boot_lock);
1536 +       raw_spin_lock(&boot_lock);
1538         /*
1539          * This is really belt and braces; we hold unintended secondary
1540 @@ -85,7 +85,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1541          * now the secondary core is starting up let it run its
1542          * calibrations, then wait for it to finish
1543          */
1544 -       spin_unlock(&boot_lock);
1545 +       raw_spin_unlock(&boot_lock);
1547         return pen_release != -1 ? -ENOSYS : 0;
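The boot_lock changes repeated across the platsmp files above all have the same shape: under PREEMPT_RT_FULL an ordinary spinlock_t becomes a sleeping lock, which cannot be taken on the tightly constrained secondary-CPU bring-up path, so the lock is declared raw and the plain lock/unlock calls become their raw_ counterparts. A minimal sketch of that shape (illustrative only, not part of the patch):

        #include <linux/spinlock.h>

        /* was: static DEFINE_SPINLOCK(boot_lock); */
        static DEFINE_RAW_SPINLOCK(example_boot_lock);

        void example_secondary_init(void)
        {
                /* synchronise with the boot CPU: always a true spinning lock */
                raw_spin_lock(&example_boot_lock);
                raw_spin_unlock(&example_boot_lock);
        }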
1549 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1550 index 14cdc6dea493..9196cf82f7be 100644
1551 --- a/arch/arm64/Kconfig
1552 +++ b/arch/arm64/Kconfig
1553 @@ -76,6 +76,7 @@ config ARM64
1554         select HAVE_PERF_REGS
1555         select HAVE_PERF_USER_STACK_DUMP
1556         select HAVE_RCU_TABLE_FREE
1557 +       select HAVE_PREEMPT_LAZY
1558         select HAVE_SYSCALL_TRACEPOINTS
1559         select IOMMU_DMA if IOMMU_SUPPORT
1560         select IRQ_DOMAIN
1561 @@ -582,7 +583,7 @@ config XEN_DOM0
1563  config XEN
1564         bool "Xen guest support on ARM64"
1565 -       depends on ARM64 && OF
1566 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1567         select SWIOTLB_XEN
1568         help
1569           Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64.
1570 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1571 index 90c7ff233735..5f4e89fbc290 100644
1572 --- a/arch/arm64/include/asm/thread_info.h
1573 +++ b/arch/arm64/include/asm/thread_info.h
1574 @@ -49,6 +49,7 @@ struct thread_info {
1575         mm_segment_t            addr_limit;     /* address limit */
1576         struct task_struct      *task;          /* main task structure */
1577         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1578 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1579         int                     cpu;            /* cpu */
1580  };
1582 @@ -103,6 +104,7 @@ static inline struct thread_info *current_thread_info(void)
1583  #define TIF_NEED_RESCHED       1
1584  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1585  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1586 +#define TIF_NEED_RESCHED_LAZY  4
1587  #define TIF_NOHZ               7
1588  #define TIF_SYSCALL_TRACE      8
1589  #define TIF_SYSCALL_AUDIT      9
1590 @@ -118,6 +120,7 @@ static inline struct thread_info *current_thread_info(void)
1591  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1592  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1593  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1594 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1595  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1596  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1597  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1598 @@ -126,7 +129,8 @@ static inline struct thread_info *current_thread_info(void)
1599  #define _TIF_32BIT             (1 << TIF_32BIT)
1601  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1602 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1603 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1604 +                                _TIF_NEED_RESCHED_LAZY)
1606  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1607                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1608 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1609 index 087cf9a65359..d74475928399 100644
1610 --- a/arch/arm64/kernel/asm-offsets.c
1611 +++ b/arch/arm64/kernel/asm-offsets.c
1612 @@ -35,6 +35,7 @@ int main(void)
1613    BLANK();
1614    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1615    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1616 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1617    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1618    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1619    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1620 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1621 index dccd0c2e9023..a8d4d065fd81 100644
1622 --- a/arch/arm64/kernel/entry.S
1623 +++ b/arch/arm64/kernel/entry.S
1624 @@ -378,11 +378,16 @@ ENDPROC(el1_sync)
1625  #ifdef CONFIG_PREEMPT
1626         get_thread_info tsk
1627         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1628 -       cbnz    w24, 1f                         // preempt count != 0
1629 +       cbnz    w24, 2f                         // preempt count != 0
1630         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1631 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1632 -       bl      el1_preempt
1633 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1635 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1636 +       cbnz    w24, 2f                         // preempt lazy count != 0
1637 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1638  1:
1639 +       bl      el1_preempt
1641  #endif
1642  #ifdef CONFIG_TRACE_IRQFLAGS
1643         bl      trace_hardirqs_on
1644 @@ -396,6 +401,7 @@ ENDPROC(el1_irq)
1645  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1646         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1647         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1648 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1649         ret     x24
1650  #endif
1652 @@ -640,6 +646,7 @@ ENDPROC(cpu_switch_to)
1653   */
1654  work_pending:
1655         tbnz    x1, #TIF_NEED_RESCHED, work_resched
1656 +       tbnz    x1, #TIF_NEED_RESCHED_LAZY, work_resched
1657         /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
1658         ldr     x2, [sp, #S_PSTATE]
1659         mov     x0, sp                          // 'regs'
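The arm64 entry.S change above encodes the lazy-preemption rule: an ordinary TIF_NEED_RESCHED preempts whenever preempt_count is zero, while TIF_NEED_RESCHED_LAZY additionally requires preempt_lazy_count to be zero. A C rendering of that decision, using the thread_info fields introduced above (sketch only, not code from the patch):

        #include <linux/thread_info.h>

        static inline bool example_should_preempt_irq(struct thread_info *ti)
        {
                if (ti->preempt_count)
                        return false;   /* preemption disabled outright */
                if (test_ti_thread_flag(ti, TIF_NEED_RESCHED))
                        return true;    /* ordinary resched request */
                if (ti->preempt_lazy_count)
                        return false;   /* inside a lazy-preempt section */
                return test_ti_thread_flag(ti, TIF_NEED_RESCHED_LAZY);
        }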
1660 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1661 index 8b0424abc84c..5422d4c0bbdf 100644
1662 --- a/arch/mips/Kconfig
1663 +++ b/arch/mips/Kconfig
1664 @@ -2411,7 +2411,7 @@ config CPU_R4400_WORKAROUNDS
1666  config HIGHMEM
1667         bool "High Memory Support"
1668 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1669 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1671  config CPU_SUPPORTS_HIGHMEM
1672         bool
1673 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
1674 index a017b23ee4aa..8d4d9270140f 100644
1675 --- a/arch/mips/kvm/mips.c
1676 +++ b/arch/mips/kvm/mips.c
1677 @@ -454,8 +454,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1679         dvcpu->arch.wait = 0;
1681 -       if (waitqueue_active(&dvcpu->wq))
1682 -               wake_up_interruptible(&dvcpu->wq);
1683 +       if (swait_active(&dvcpu->wq))
1684 +               swake_up(&dvcpu->wq);
1686         return 0;
1688 @@ -1183,8 +1183,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
1689         kvm_mips_callbacks->queue_timer_int(vcpu);
1691         vcpu->arch.wait = 0;
1692 -       if (waitqueue_active(&vcpu->wq))
1693 -               wake_up_interruptible(&vcpu->wq);
1694 +       if (swait_active(&vcpu->wq))
1695 +               swake_up(&vcpu->wq);
1698  /* low level hrtimer wake routine */
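The wait-queue conversion above swaps the regular waitqueue for a simple waitqueue (swait), whose wake-up path does not take a sleeping lock under PREEMPT_RT_FULL and can therefore still be used from the timer and interrupt paths involved here. The waker-side pattern, as a stand-alone sketch (illustrative names, not from the patch):

        #include <linux/swait.h>

        static DECLARE_SWAIT_QUEUE_HEAD(example_wq);

        void example_wake(void)
        {
                if (swait_active(&example_wq))
                        swake_up(&example_wq);  /* replaces wake_up_interruptible() */
        }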
1699 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1700 index dfb1ee8c3e06..cdc3c20ef225 100644
1701 --- a/arch/powerpc/Kconfig
1702 +++ b/arch/powerpc/Kconfig
1703 @@ -60,10 +60,11 @@ config LOCKDEP_SUPPORT
1705  config RWSEM_GENERIC_SPINLOCK
1706         bool
1707 +       default y if PREEMPT_RT_FULL
1709  config RWSEM_XCHGADD_ALGORITHM
1710         bool
1711 -       default y
1712 +       default y if !PREEMPT_RT_FULL
1714  config GENERIC_LOCKBREAK
1715         bool
1716 @@ -141,6 +142,7 @@ config PPC
1717         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1718         select GENERIC_STRNCPY_FROM_USER
1719         select GENERIC_STRNLEN_USER
1720 +       select HAVE_PREEMPT_LAZY
1721         select HAVE_MOD_ARCH_SPECIFIC
1722         select MODULES_USE_ELF_RELA
1723         select CLONE_BACKWARDS
1724 @@ -319,7 +321,7 @@ menu "Kernel options"
1726  config HIGHMEM
1727         bool "High memory support"
1728 -       depends on PPC32
1729 +       depends on PPC32 && !PREEMPT_RT_FULL
1731  source kernel/Kconfig.hz
1732  source kernel/Kconfig.preempt
1733 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
1734 index a92d95aee42d..20376580583f 100644
1735 --- a/arch/powerpc/include/asm/kvm_host.h
1736 +++ b/arch/powerpc/include/asm/kvm_host.h
1737 @@ -286,7 +286,7 @@ struct kvmppc_vcore {
1738         struct list_head runnable_threads;
1739         struct list_head preempt_list;
1740         spinlock_t lock;
1741 -       wait_queue_head_t wq;
1742 +       struct swait_queue_head wq;
1743         spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
1744         u64 stolen_tb;
1745         u64 preempt_tb;
1746 @@ -627,7 +627,7 @@ struct kvm_vcpu_arch {
1747         u8 prodded;
1748         u32 last_inst;
1750 -       wait_queue_head_t *wqp;
1751 +       struct swait_queue_head *wqp;
1752         struct kvmppc_vcore *vcore;
1753         int ret;
1754         int trap;
1755 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1756 index 7efee4a3240b..40e6fa1b85b2 100644
1757 --- a/arch/powerpc/include/asm/thread_info.h
1758 +++ b/arch/powerpc/include/asm/thread_info.h
1759 @@ -42,6 +42,8 @@ struct thread_info {
1760         int             cpu;                    /* cpu we're on */
1761         int             preempt_count;          /* 0 => preemptable,
1762                                                    <0 => BUG */
1763 +       int             preempt_lazy_count;      /* 0 => preemptable,
1764 +                                                  <0 => BUG */
1765         unsigned long   local_flags;            /* private flags for thread */
1767         /* low level flags - has atomic operations done on it */
1768 @@ -82,8 +84,7 @@ static inline struct thread_info *current_thread_info(void)
1769  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1770  #define TIF_SIGPENDING         1       /* signal pending */
1771  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1772 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1773 -                                          TIF_NEED_RESCHED */
1774 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1775  #define TIF_32BIT              4       /* 32 bit binary */
1776  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1777  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1778 @@ -101,6 +102,8 @@ static inline struct thread_info *current_thread_info(void)
1779  #if defined(CONFIG_PPC64)
1780  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1781  #endif
1782 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1783 +                                          TIF_NEED_RESCHED */
1785  /* as above, but as bit values */
1786  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1787 @@ -119,14 +122,16 @@ static inline struct thread_info *current_thread_info(void)
1788  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1789  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1790  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1791 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1792  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1793                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1794                                  _TIF_NOHZ)
1796  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1797                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1798 -                                _TIF_RESTORE_TM)
1799 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1800  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1801 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1803  /* Bits in local_flags */
1804  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1805 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1806 index 40da69163d51..bd040815334b 100644
1807 --- a/arch/powerpc/kernel/asm-offsets.c
1808 +++ b/arch/powerpc/kernel/asm-offsets.c
1809 @@ -160,6 +160,7 @@ int main(void)
1810         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1811         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1812         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1813 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1814         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1815         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1817 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1818 index 2405631e91a2..c21b4b42eaa0 100644
1819 --- a/arch/powerpc/kernel/entry_32.S
1820 +++ b/arch/powerpc/kernel/entry_32.S
1821 @@ -818,7 +818,14 @@ user_exc_return:           /* r10 contains MSR_KERNEL here */
1822         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1823         bne     restore
1824         andi.   r8,r8,_TIF_NEED_RESCHED
1825 +       bne+    1f
1826 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1827 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1828 +       bne     restore
1829 +       lwz     r0,TI_FLAGS(r9)
1830 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1831         beq+    restore
1833         lwz     r3,_MSR(r1)
1834         andi.   r0,r3,MSR_EE    /* interrupts off? */
1835         beq     restore         /* don't schedule if so */
1836 @@ -829,11 +836,11 @@ user_exc_return:          /* r10 contains MSR_KERNEL here */
1837          */
1838         bl      trace_hardirqs_off
1839  #endif
1840 -1:     bl      preempt_schedule_irq
1841 +2:     bl      preempt_schedule_irq
1842         CURRENT_THREAD_INFO(r9, r1)
1843         lwz     r3,TI_FLAGS(r9)
1844 -       andi.   r0,r3,_TIF_NEED_RESCHED
1845 -       bne-    1b
1846 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1847 +       bne-    2b
1848  #ifdef CONFIG_TRACE_IRQFLAGS
1849         /* And now, to properly rebalance the above, we tell lockdep they
1850          * are being turned back on, which will happen when we return
1851 @@ -1154,7 +1161,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
1852  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1854  do_work:                       /* r10 contains MSR_KERNEL here */
1855 -       andi.   r0,r9,_TIF_NEED_RESCHED
1856 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1857         beq     do_user_signal
1859  do_resched:                    /* r10 contains MSR_KERNEL here */
1860 @@ -1175,7 +1182,7 @@ do_resched:                       /* r10 contains MSR_KERNEL here */
1861         MTMSRD(r10)             /* disable interrupts */
1862         CURRENT_THREAD_INFO(r9, r1)
1863         lwz     r9,TI_FLAGS(r9)
1864 -       andi.   r0,r9,_TIF_NEED_RESCHED
1865 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1866         bne-    do_resched
1867         andi.   r0,r9,_TIF_USER_WORK_MASK
1868         beq     restore_user
1869 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1870 index f6fd0332c3a2..96235fe0a581 100644
1871 --- a/arch/powerpc/kernel/entry_64.S
1872 +++ b/arch/powerpc/kernel/entry_64.S
1873 @@ -683,7 +683,7 @@ _GLOBAL(ret_from_except_lite)
1874  #else
1875         beq     restore
1876  #endif
1877 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1878 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1879         beq     2f
1880         bl      restore_interrupts
1881         SCHEDULE_USER
1882 @@ -745,10 +745,18 @@ _GLOBAL(ret_from_except_lite)
1884  #ifdef CONFIG_PREEMPT
1885         /* Check if we need to preempt */
1886 +       lwz     r8,TI_PREEMPT(r9)
1887 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1888 +       bne     restore
1889         andi.   r0,r4,_TIF_NEED_RESCHED
1890 +       bne+    check_count
1892 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1893         beq+    restore
1894 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1896         /* Check that preempt_count() == 0 and interrupts are enabled */
1897 -       lwz     r8,TI_PREEMPT(r9)
1898 +check_count:
1899         cmpwi   cr1,r8,0
1900         ld      r0,SOFTE(r1)
1901         cmpdi   r0,0
1902 @@ -765,7 +773,7 @@ _GLOBAL(ret_from_except_lite)
1903         /* Re-test flags and eventually loop */
1904         CURRENT_THREAD_INFO(r9, r1)
1905         ld      r4,TI_FLAGS(r9)
1906 -       andi.   r0,r4,_TIF_NEED_RESCHED
1907 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1908         bne     1b
1910         /*
1911 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1912 index 290559df1e8b..070afa6da35d 100644
1913 --- a/arch/powerpc/kernel/irq.c
1914 +++ b/arch/powerpc/kernel/irq.c
1915 @@ -614,6 +614,7 @@ void irq_ctx_init(void)
1916         }
1919 +#ifndef CONFIG_PREEMPT_RT_FULL
1920  void do_softirq_own_stack(void)
1922         struct thread_info *curtp, *irqtp;
1923 @@ -631,6 +632,7 @@ void do_softirq_own_stack(void)
1924         if (irqtp->flags)
1925                 set_bits(irqtp->flags, &curtp->flags);
1927 +#endif
1929  irq_hw_number_t virq_to_hw(unsigned int virq)
1931 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1932 index df4efa304b2c..9cb0c2f6e7ac 100644
1933 --- a/arch/powerpc/kernel/misc_32.S
1934 +++ b/arch/powerpc/kernel/misc_32.S
1935 @@ -40,6 +40,7 @@
1936   * We store the saved ksp_limit in the unused part
1937   * of the STACK_FRAME_OVERHEAD
1938   */
1939 +#ifndef CONFIG_PREEMPT_RT_FULL
1940  _GLOBAL(call_do_softirq)
1941         mflr    r0
1942         stw     r0,4(r1)
1943 @@ -56,6 +57,7 @@ _GLOBAL(call_do_softirq)
1944         stw     r10,THREAD+KSP_LIMIT(r2)
1945         mtlr    r0
1946         blr
1947 +#endif
1949  /*
1950   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1951 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1952 index db475d41b57a..96b7ef80e05d 100644
1953 --- a/arch/powerpc/kernel/misc_64.S
1954 +++ b/arch/powerpc/kernel/misc_64.S
1955 @@ -30,6 +30,7 @@
1957         .text
1959 +#ifndef CONFIG_PREEMPT_RT_FULL
1960  _GLOBAL(call_do_softirq)
1961         mflr    r0
1962         std     r0,16(r1)
1963 @@ -40,6 +41,7 @@ _GLOBAL(call_do_softirq)
1964         ld      r0,16(r1)
1965         mtlr    r0
1966         blr
1967 +#endif
1969  _GLOBAL(call_do_irq)
1970         mflr    r0
1971 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1972 index c2024ac9d4e8..2303788da7e1 100644
1973 --- a/arch/powerpc/kvm/Kconfig
1974 +++ b/arch/powerpc/kvm/Kconfig
1975 @@ -172,6 +172,7 @@ config KVM_E500MC
1976  config KVM_MPIC
1977         bool "KVM in-kernel MPIC emulation"
1978         depends on KVM && E500
1979 +       depends on !PREEMPT_RT_FULL
1980         select HAVE_KVM_IRQCHIP
1981         select HAVE_KVM_IRQFD
1982         select HAVE_KVM_IRQ_ROUTING
1983 diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
1984 index 428563b195c3..4c42c3935025 100644
1985 --- a/arch/powerpc/kvm/book3s_hv.c
1986 +++ b/arch/powerpc/kvm/book3s_hv.c
1987 @@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
1988  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
1990         int cpu;
1991 -       wait_queue_head_t *wqp;
1992 +       struct swait_queue_head *wqp;
1994         wqp = kvm_arch_vcpu_wq(vcpu);
1995 -       if (waitqueue_active(wqp)) {
1996 -               wake_up_interruptible(wqp);
1997 +       if (swait_active(wqp)) {
1998 +               swake_up(wqp);
1999                 ++vcpu->stat.halt_wakeup;
2000         }
2002 @@ -707,8 +707,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
2003                 tvcpu->arch.prodded = 1;
2004                 smp_mb();
2005                 if (vcpu->arch.ceded) {
2006 -                       if (waitqueue_active(&vcpu->wq)) {
2007 -                               wake_up_interruptible(&vcpu->wq);
2008 +                       if (swait_active(&vcpu->wq)) {
2009 +                               swake_up(&vcpu->wq);
2010                                 vcpu->stat.halt_wakeup++;
2011                         }
2012                 }
2013 @@ -1453,7 +1453,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
2014         INIT_LIST_HEAD(&vcore->runnable_threads);
2015         spin_lock_init(&vcore->lock);
2016         spin_lock_init(&vcore->stoltb_lock);
2017 -       init_waitqueue_head(&vcore->wq);
2018 +       init_swait_queue_head(&vcore->wq);
2019         vcore->preempt_tb = TB_NIL;
2020         vcore->lpcr = kvm->arch.lpcr;
2021         vcore->first_vcpuid = core * threads_per_subcore;
2022 @@ -2525,10 +2525,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2024         struct kvm_vcpu *vcpu;
2025         int do_sleep = 1;
2026 +       DECLARE_SWAITQUEUE(wait);
2028 -       DEFINE_WAIT(wait);
2030 -       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2031 +       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2033         /*
2034          * Check one last time for pending exceptions and ceded state after
2035 @@ -2542,7 +2541,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2036         }
2038         if (!do_sleep) {
2039 -               finish_wait(&vc->wq, &wait);
2040 +               finish_swait(&vc->wq, &wait);
2041                 return;
2042         }
2044 @@ -2550,7 +2549,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2045         trace_kvmppc_vcore_blocked(vc, 0);
2046         spin_unlock(&vc->lock);
2047         schedule();
2048 -       finish_wait(&vc->wq, &wait);
2049 +       finish_swait(&vc->wq, &wait);
2050         spin_lock(&vc->lock);
2051         vc->vcore_state = VCORE_INACTIVE;
2052         trace_kvmppc_vcore_blocked(vc, 1);
2053 @@ -2606,7 +2605,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2054                         kvmppc_start_thread(vcpu, vc);
2055                         trace_kvm_guest_enter(vcpu);
2056                 } else if (vc->vcore_state == VCORE_SLEEPING) {
2057 -                       wake_up(&vc->wq);
2058 +                       swake_up(&vc->wq);
2059                 }
2061         }
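The kvmppc_vcore_blocked() changes above are the matching sleeper side of the swait conversion. The general prepare/schedule/finish shape looks like the sketch below (illustrative only; real code, like the hunk above, re-checks its wake-up condition under the relevant lock before sleeping):

        #include <linux/swait.h>
        #include <linux/sched.h>

        void example_wait(struct swait_queue_head *wq, bool *condition)
        {
                DECLARE_SWAITQUEUE(wait);

                prepare_to_swait(wq, &wait, TASK_INTERRUPTIBLE);
                if (!*condition)        /* real code re-checks under its lock */
                        schedule();     /* woken by swake_up() on @wq */
                finish_swait(wq, &wait);
        }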
2062 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
2063 index 3f175e8aedb4..c4c02f91904c 100644
2064 --- a/arch/powerpc/platforms/ps3/device-init.c
2065 +++ b/arch/powerpc/platforms/ps3/device-init.c
2066 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
2067         }
2068         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
2070 -       res = wait_event_interruptible(dev->done.wait,
2071 +       res = swait_event_interruptible(dev->done.wait,
2072                                        dev->done.done || kthread_should_stop());
2073         if (kthread_should_stop())
2074                 res = -EINTR;
2075 diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
2076 index e9a983f40a24..bbdc539fb3c6 100644
2077 --- a/arch/s390/include/asm/kvm_host.h
2078 +++ b/arch/s390/include/asm/kvm_host.h
2079 @@ -427,7 +427,7 @@ struct kvm_s390_irq_payload {
2080  struct kvm_s390_local_interrupt {
2081         spinlock_t lock;
2082         struct kvm_s390_float_interrupt *float_int;
2083 -       wait_queue_head_t *wq;
2084 +       struct swait_queue_head *wq;
2085         atomic_t *cpuflags;
2086         DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
2087         struct kvm_s390_irq_payload irq;
2088 diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
2089 index 6a75352f453c..cc862c486002 100644
2090 --- a/arch/s390/kvm/interrupt.c
2091 +++ b/arch/s390/kvm/interrupt.c
2092 @@ -868,13 +868,13 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
2094  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
2096 -       if (waitqueue_active(&vcpu->wq)) {
2097 +       if (swait_active(&vcpu->wq)) {
2098                 /*
2099                  * The vcpu gave up the cpu voluntarily, mark it as a good
2100                  * yield-candidate.
2101                  */
2102                 vcpu->preempted = true;
2103 -               wake_up_interruptible(&vcpu->wq);
2104 +               swake_up(&vcpu->wq);
2105                 vcpu->stat.halt_wakeup++;
2106         }
2108 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
2109 index 6c0378c0b8b5..abd58b4dff97 100644
2110 --- a/arch/sh/kernel/irq.c
2111 +++ b/arch/sh/kernel/irq.c
2112 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
2113         hardirq_ctx[cpu] = NULL;
2116 +#ifndef CONFIG_PREEMPT_RT_FULL
2117  void do_softirq_own_stack(void)
2119         struct thread_info *curctx;
2120 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
2121                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
2122         );
2124 +#endif
2125  #else
2126  static inline void handle_one_irq(unsigned int irq)
2128 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
2129 index 94f4ac21761b..d7c369d061cf 100644
2130 --- a/arch/sparc/Kconfig
2131 +++ b/arch/sparc/Kconfig
2132 @@ -189,12 +189,10 @@ config NR_CPUS
2133  source kernel/Kconfig.hz
2135  config RWSEM_GENERIC_SPINLOCK
2136 -       bool
2137 -       default y if SPARC32
2138 +       def_bool PREEMPT_RT_FULL
2140  config RWSEM_XCHGADD_ALGORITHM
2141 -       bool
2142 -       default y if SPARC64
2143 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2145  config GENERIC_HWEIGHT
2146         bool
2147 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
2148 index bfbde8c4ffb2..62e7e06013c5 100644
2149 --- a/arch/sparc/kernel/irq_64.c
2150 +++ b/arch/sparc/kernel/irq_64.c
2151 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
2152         set_irq_regs(old_regs);
2155 +#ifndef CONFIG_PREEMPT_RT_FULL
2156  void do_softirq_own_stack(void)
2158         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
2159 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
2160         __asm__ __volatile__("mov %0, %%sp"
2161                              : : "r" (orig_sp));
2163 +#endif
2165  #ifdef CONFIG_HOTPLUG_CPU
2166  void fixup_irqs(void)
2167 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
2168 index 0ef2cdd11616..2eaae0f372fa 100644
2169 --- a/arch/x86/Kconfig
2170 +++ b/arch/x86/Kconfig
2171 @@ -17,6 +17,7 @@ config X86_64
2172  ### Arch settings
2173  config X86
2174         def_bool y
2175 +       select HAVE_PREEMPT_LAZY
2176         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
2177         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
2178         select ANON_INODES
2179 @@ -213,8 +214,11 @@ config ARCH_MAY_HAVE_PC_FDC
2180         def_bool y
2181         depends on ISA_DMA_API
2183 +config RWSEM_GENERIC_SPINLOCK
2184 +       def_bool PREEMPT_RT_FULL
2186  config RWSEM_XCHGADD_ALGORITHM
2187 -       def_bool y
2188 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2190  config GENERIC_CALIBRATE_DELAY
2191         def_bool y
2192 @@ -849,7 +853,7 @@ config IOMMU_HELPER
2193  config MAXSMP
2194         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
2195         depends on X86_64 && SMP && DEBUG_KERNEL
2196 -       select CPUMASK_OFFSTACK
2197 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
2198         ---help---
2199           Enable maximum number of CPUS and NUMA Nodes for this architecture.
2200           If unsure, say N.
2201 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
2202 index 3633ad6145c5..c6d5458ee7f9 100644
2203 --- a/arch/x86/crypto/aesni-intel_glue.c
2204 +++ b/arch/x86/crypto/aesni-intel_glue.c
2205 @@ -383,14 +383,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
2206         err = blkcipher_walk_virt(desc, &walk);
2207         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2209 -       kernel_fpu_begin();
2210         while ((nbytes = walk.nbytes)) {
2211 +               kernel_fpu_begin();
2212                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2213 -                             nbytes & AES_BLOCK_MASK);
2214 +                               nbytes & AES_BLOCK_MASK);
2215 +               kernel_fpu_end();
2216                 nbytes &= AES_BLOCK_SIZE - 1;
2217                 err = blkcipher_walk_done(desc, &walk, nbytes);
2218         }
2219 -       kernel_fpu_end();
2221         return err;
2223 @@ -407,14 +407,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
2224         err = blkcipher_walk_virt(desc, &walk);
2225         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2227 -       kernel_fpu_begin();
2228         while ((nbytes = walk.nbytes)) {
2229 +               kernel_fpu_begin();
2230                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2231                               nbytes & AES_BLOCK_MASK);
2232 +               kernel_fpu_end();
2233                 nbytes &= AES_BLOCK_SIZE - 1;
2234                 err = blkcipher_walk_done(desc, &walk, nbytes);
2235         }
2236 -       kernel_fpu_end();
2238         return err;
2240 @@ -431,14 +431,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
2241         err = blkcipher_walk_virt(desc, &walk);
2242         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2244 -       kernel_fpu_begin();
2245         while ((nbytes = walk.nbytes)) {
2246 +               kernel_fpu_begin();
2247                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2248                               nbytes & AES_BLOCK_MASK, walk.iv);
2249 +               kernel_fpu_end();
2250                 nbytes &= AES_BLOCK_SIZE - 1;
2251                 err = blkcipher_walk_done(desc, &walk, nbytes);
2252         }
2253 -       kernel_fpu_end();
2255         return err;
2257 @@ -455,14 +455,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
2258         err = blkcipher_walk_virt(desc, &walk);
2259         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2261 -       kernel_fpu_begin();
2262         while ((nbytes = walk.nbytes)) {
2263 +               kernel_fpu_begin();
2264                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2265                               nbytes & AES_BLOCK_MASK, walk.iv);
2266 +               kernel_fpu_end();
2267                 nbytes &= AES_BLOCK_SIZE - 1;
2268                 err = blkcipher_walk_done(desc, &walk, nbytes);
2269         }
2270 -       kernel_fpu_end();
2272         return err;
2274 @@ -514,18 +514,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
2275         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
2276         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2278 -       kernel_fpu_begin();
2279         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
2280 +               kernel_fpu_begin();
2281                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2282                                       nbytes & AES_BLOCK_MASK, walk.iv);
2283 +               kernel_fpu_end();
2284                 nbytes &= AES_BLOCK_SIZE - 1;
2285                 err = blkcipher_walk_done(desc, &walk, nbytes);
2286         }
2287         if (walk.nbytes) {
2288 +               kernel_fpu_begin();
2289                 ctr_crypt_final(ctx, &walk);
2290 +               kernel_fpu_end();
2291                 err = blkcipher_walk_done(desc, &walk, 0);
2292         }
2293 -       kernel_fpu_end();
2295         return err;
2297 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
2298 index 8648158f3916..d7699130ee36 100644
2299 --- a/arch/x86/crypto/cast5_avx_glue.c
2300 +++ b/arch/x86/crypto/cast5_avx_glue.c
2301 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
2302  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2303                      bool enc)
2305 -       bool fpu_enabled = false;
2306 +       bool fpu_enabled;
2307         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
2308         const unsigned int bsize = CAST5_BLOCK_SIZE;
2309         unsigned int nbytes;
2310 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2311                 u8 *wsrc = walk->src.virt.addr;
2312                 u8 *wdst = walk->dst.virt.addr;
2314 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2315 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2317                 /* Process multi-block batch */
2318                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
2319 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2320                 } while (nbytes >= bsize);
2322  done:
2323 +               cast5_fpu_end(fpu_enabled);
2324                 err = blkcipher_walk_done(desc, walk, nbytes);
2325         }
2327 -       cast5_fpu_end(fpu_enabled);
2328         return err;
2331 @@ -227,7 +226,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
2332  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2333                        struct scatterlist *src, unsigned int nbytes)
2335 -       bool fpu_enabled = false;
2336 +       bool fpu_enabled;
2337         struct blkcipher_walk walk;
2338         int err;
2340 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2341         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2343         while ((nbytes = walk.nbytes)) {
2344 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2345 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2346                 nbytes = __cbc_decrypt(desc, &walk);
2347 +               cast5_fpu_end(fpu_enabled);
2348                 err = blkcipher_walk_done(desc, &walk, nbytes);
2349         }
2351 -       cast5_fpu_end(fpu_enabled);
2352         return err;
2355 @@ -311,7 +309,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
2356  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2357                      struct scatterlist *src, unsigned int nbytes)
2359 -       bool fpu_enabled = false;
2360 +       bool fpu_enabled;
2361         struct blkcipher_walk walk;
2362         int err;
2364 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2365         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2367         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
2368 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2369 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2370                 nbytes = __ctr_crypt(desc, &walk);
2371 +               cast5_fpu_end(fpu_enabled);
2372                 err = blkcipher_walk_done(desc, &walk, nbytes);
2373         }
2375 -       cast5_fpu_end(fpu_enabled);
2377         if (walk.nbytes) {
2378                 ctr_crypt_final(desc, &walk);
2379                 err = blkcipher_walk_done(desc, &walk, 0);
2380 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
2381 index 6a85598931b5..3a506ce7ed93 100644
2382 --- a/arch/x86/crypto/glue_helper.c
2383 +++ b/arch/x86/crypto/glue_helper.c
2384 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2385         void *ctx = crypto_blkcipher_ctx(desc->tfm);
2386         const unsigned int bsize = 128 / 8;
2387         unsigned int nbytes, i, func_bytes;
2388 -       bool fpu_enabled = false;
2389 +       bool fpu_enabled;
2390         int err;
2392         err = blkcipher_walk_virt(desc, walk);
2393 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2394                 u8 *wdst = walk->dst.virt.addr;
2396                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2397 -                                            desc, fpu_enabled, nbytes);
2398 +                                            desc, false, nbytes);
2400                 for (i = 0; i < gctx->num_funcs; i++) {
2401                         func_bytes = bsize * gctx->funcs[i].num_blocks;
2402 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2403                 }
2405  done:
2406 +               glue_fpu_end(fpu_enabled);
2407                 err = blkcipher_walk_done(desc, walk, nbytes);
2408         }
2410 -       glue_fpu_end(fpu_enabled);
2411         return err;
2414 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2415                             struct scatterlist *src, unsigned int nbytes)
2417         const unsigned int bsize = 128 / 8;
2418 -       bool fpu_enabled = false;
2419 +       bool fpu_enabled;
2420         struct blkcipher_walk walk;
2421         int err;
2423 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2425         while ((nbytes = walk.nbytes)) {
2426                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2427 -                                            desc, fpu_enabled, nbytes);
2428 +                                            desc, false, nbytes);
2429                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
2430 +               glue_fpu_end(fpu_enabled);
2431                 err = blkcipher_walk_done(desc, &walk, nbytes);
2432         }
2434 -       glue_fpu_end(fpu_enabled);
2435         return err;
2437  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
2438 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2439                           struct scatterlist *src, unsigned int nbytes)
2441         const unsigned int bsize = 128 / 8;
2442 -       bool fpu_enabled = false;
2443 +       bool fpu_enabled;
2444         struct blkcipher_walk walk;
2445         int err;
2447 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2449         while ((nbytes = walk.nbytes) >= bsize) {
2450                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2451 -                                            desc, fpu_enabled, nbytes);
2452 +                                            desc, false, nbytes);
2453                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
2454 +               glue_fpu_end(fpu_enabled);
2455                 err = blkcipher_walk_done(desc, &walk, nbytes);
2456         }
2458 -       glue_fpu_end(fpu_enabled);
2460         if (walk.nbytes) {
2461                 glue_ctr_crypt_final_128bit(
2462                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
2463 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2464                           void *tweak_ctx, void *crypt_ctx)
2466         const unsigned int bsize = 128 / 8;
2467 -       bool fpu_enabled = false;
2468 +       bool fpu_enabled;
2469         struct blkcipher_walk walk;
2470         int err;
2472 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2474         /* set minimum length to bsize, for tweak_fn */
2475         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2476 -                                    desc, fpu_enabled,
2477 +                                    desc, false,
2478                                      nbytes < bsize ? bsize : nbytes);
2480         /* calculate first value of T */
2481         tweak_fn(tweak_ctx, walk.iv, walk.iv);
2482 +       glue_fpu_end(fpu_enabled);
2484         while (nbytes) {
2485 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2486 +                               desc, false, nbytes);
2487                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
2489 +               glue_fpu_end(fpu_enabled);
2490                 err = blkcipher_walk_done(desc, &walk, nbytes);
2491                 nbytes = walk.nbytes;
2492         }
2494 -       glue_fpu_end(fpu_enabled);
2496         return err;
2498  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
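The aesni, cast5 and glue_helper hunks above all move kernel_fpu_begin()/kernel_fpu_end() (and the corresponding fpu_begin helpers) inside the walk loop, so preemption is disabled only across one chunk of the request at a time instead of across the whole blkcipher walk. A sketch of the resulting loop shape, with a hypothetical process_one_chunk() standing in for the cipher call (not from the patch):

        #include <crypto/algapi.h>
        #include <asm/fpu/api.h>

        /* hypothetical helper standing in for the per-chunk cipher work */
        static unsigned int process_one_chunk(struct blkcipher_desc *desc,
                                              struct blkcipher_walk *walk,
                                              unsigned int nbytes);

        static int example_walk_loop(struct blkcipher_desc *desc,
                                     struct blkcipher_walk *walk)
        {
                unsigned int nbytes;
                int err = 0;

                while ((nbytes = walk->nbytes)) {
                        kernel_fpu_begin();     /* preempt-off only for this chunk */
                        nbytes = process_one_chunk(desc, walk, nbytes);
                        kernel_fpu_end();
                        err = blkcipher_walk_done(desc, walk, nbytes);
                }
                return err;
        }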
2499 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
2500 index 1a4477cedc49..75a301b6a5b6 100644
2501 --- a/arch/x86/entry/common.c
2502 +++ b/arch/x86/entry/common.c
2503 @@ -220,7 +220,7 @@ long syscall_trace_enter(struct pt_regs *regs)
2505  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
2506         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
2507 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
2508 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
2510  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2512 @@ -236,9 +236,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2513                 /* We have work to do. */
2514                 local_irq_enable();
2516 -               if (cached_flags & _TIF_NEED_RESCHED)
2517 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2518                         schedule();
2520 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2521 +               if (unlikely(current->forced_info.si_signo)) {
2522 +                       struct task_struct *t = current;
2523 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2524 +                       t->forced_info.si_signo = 0;
2525 +               }
2526 +#endif
2527                 if (cached_flags & _TIF_UPROBE)
2528                         uprobe_notify_resume(regs);
2530 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
2531 index ae678ad128a9..3bcef8bdb911 100644
2532 --- a/arch/x86/entry/entry_32.S
2533 +++ b/arch/x86/entry/entry_32.S
2534 @@ -278,8 +278,24 @@ END(ret_from_exception)
2535  ENTRY(resume_kernel)
2536         DISABLE_INTERRUPTS(CLBR_ANY)
2537  need_resched:
2538 +       # preempt count == 0 + NEED_RS set?
2539         cmpl    $0, PER_CPU_VAR(__preempt_count)
2540 +#ifndef CONFIG_PREEMPT_LAZY
2541         jnz     restore_all
2542 +#else
2543 +       jz test_int_off
2545 +       # at least preempt count == 0 ?
2546 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2547 +       jne restore_all
2549 +       cmpl $0,TI_preempt_lazy_count(%ebp)     # non-zero preempt_lazy_count ?
2550 +       jnz restore_all
2552 +       testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
2553 +       jz restore_all
2554 +test_int_off:
2555 +#endif
2556         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2557         jz      restore_all
2558         call    preempt_schedule_irq
2559 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2560 index 952b23b5d4e9..459a68cffcc2 100644
2561 --- a/arch/x86/entry/entry_64.S
2562 +++ b/arch/x86/entry/entry_64.S
2563 @@ -607,7 +607,23 @@ GLOBAL(retint_user)
2564         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2565         jnc     1f
2566  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2567 +#ifndef CONFIG_PREEMPT_LAZY
2568         jnz     1f
2569 +#else
2570 +       jz      do_preempt_schedule_irq
2572 +       # at least preempt count == 0 ?
2573 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2574 +       jnz     1f
2576 +       GET_THREAD_INFO(%rcx)
2577 +       cmpl    $0, TI_preempt_lazy_count(%rcx)
2578 +       jnz     1f
2580 +       bt      $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
2581 +       jnc     1f
2582 +do_preempt_schedule_irq:
2583 +#endif
2584         call    preempt_schedule_irq
2585         jmp     0b
2586  1:
2587 @@ -897,6 +913,7 @@ END(native_load_gs_index)
2588         jmp     2b
2589         .previous
2591 +#ifndef CONFIG_PREEMPT_RT_FULL
2592  /* Call softirq on interrupt stack. Interrupts are off. */
2593  ENTRY(do_softirq_own_stack)
2594         pushq   %rbp
2595 @@ -909,6 +926,7 @@ ENTRY(do_softirq_own_stack)
2596         decl    PER_CPU_VAR(irq_count)
2597         ret
2598  END(do_softirq_own_stack)
2599 +#endif
2601  #ifdef CONFIG_XEN
2602  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2603 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2604 index 01bcde84d3e4..6f432adc55cd 100644
2605 --- a/arch/x86/include/asm/preempt.h
2606 +++ b/arch/x86/include/asm/preempt.h
2607 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2608   * a decrement which hits zero means we have no preempt_count and should
2609   * reschedule.
2610   */
2611 -static __always_inline bool __preempt_count_dec_and_test(void)
2612 +static __always_inline bool ____preempt_count_dec_and_test(void)
2614         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
2617 +static __always_inline bool __preempt_count_dec_and_test(void)
2619 +       if (____preempt_count_dec_and_test())
2620 +               return true;
2621 +#ifdef CONFIG_PREEMPT_LAZY
2622 +       if (current_thread_info()->preempt_lazy_count)
2623 +               return false;
2624 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2625 +#else
2626 +       return false;
2627 +#endif
2630  /*
2631   * Returns true when we need to resched and can (barring IRQ state).
2632   */
2633  static __always_inline bool should_resched(int preempt_offset)
2635 +#ifdef CONFIG_PREEMPT_LAZY
2636 +       u32 tmp;
2638 +       tmp = raw_cpu_read_4(__preempt_count);
2639 +       if (tmp == preempt_offset)
2640 +               return true;
2642 +       /* preempt count == 0 ? */
2643 +       tmp &= ~PREEMPT_NEED_RESCHED;
2644 +       if (tmp)
2645 +               return false;
2646 +       if (current_thread_info()->preempt_lazy_count)
2647 +               return false;
2648 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2649 +#else
2650         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2651 +#endif
2654  #ifdef CONFIG_PREEMPT
2655 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2656 index 2138c9ae19ee..3f5b4ee2e2c1 100644
2657 --- a/arch/x86/include/asm/signal.h
2658 +++ b/arch/x86/include/asm/signal.h
2659 @@ -23,6 +23,19 @@ typedef struct {
2660         unsigned long sig[_NSIG_WORDS];
2661  } sigset_t;
2664 + * Because some traps use the IST stack, we must keep preemption
2665 + * disabled while calling do_trap(), but do_trap() may call
2666 + * force_sig_info() which will grab the signal spin_locks for the
2667 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2668 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2669 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2670 + * trap.
2671 + */
2672 +#if defined(CONFIG_PREEMPT_RT_FULL)
2673 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2674 +#endif
2676  #ifndef CONFIG_COMPAT
2677  typedef sigset_t compat_sigset_t;
2678  #endif
2679 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2680 index 58505f01962f..02fa39652cd6 100644
2681 --- a/arch/x86/include/asm/stackprotector.h
2682 +++ b/arch/x86/include/asm/stackprotector.h
2683 @@ -59,7 +59,7 @@
2684   */
2685  static __always_inline void boot_init_stack_canary(void)
2687 -       u64 canary;
2688 +       u64 uninitialized_var(canary);
2689         u64 tsc;
2691  #ifdef CONFIG_X86_64
2692 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2693          * of randomness. The TSC only matters for very early init,
2694          * there it already has some randomness on most systems. Later
2695          * on during the bootup the random pool has true entropy too.
2696 +        *
2697 +        * For preempt-rt we need to weaken the randomness a bit, as
2698 +        * we can't call into the random generator from atomic context
2699 +        * due to locking constraints. We just leave canary
2700 +        * uninitialized and use the TSC based randomness on top of it.
2701          */
2702 +#ifndef CONFIG_PREEMPT_RT_FULL
2703         get_random_bytes(&canary, sizeof(canary));
2704 +#endif
2705         tsc = rdtsc();
2706         canary += tsc + (tsc << 32UL);
2708 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2709 index c7b551028740..ddb63bd90e3c 100644
2710 --- a/arch/x86/include/asm/thread_info.h
2711 +++ b/arch/x86/include/asm/thread_info.h
2712 @@ -58,6 +58,8 @@ struct thread_info {
2713         __u32                   status;         /* thread synchronous flags */
2714         __u32                   cpu;            /* current CPU */
2715         mm_segment_t            addr_limit;
2716 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2717 +                                                         <0 => BUG */
2718         unsigned int            sig_on_uaccess_error:1;
2719         unsigned int            uaccess_err:1;  /* uaccess failed */
2720  };
2721 @@ -95,6 +97,7 @@ struct thread_info {
2722  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2723  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2724  #define TIF_SECCOMP            8       /* secure computing */
2725 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2726  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2727  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2728  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2729 @@ -119,6 +122,7 @@ struct thread_info {
2730  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2731  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2732  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2733 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2734  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2735  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2736  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2737 @@ -152,6 +156,8 @@ struct thread_info {
2738  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2739  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2741 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2743  #define STACK_WARN             (THREAD_SIZE/8)
2745  /*
2746 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2747 index fc808b83fccb..ebb40118abf5 100644
2748 --- a/arch/x86/include/asm/uv/uv_bau.h
2749 +++ b/arch/x86/include/asm/uv/uv_bau.h
2750 @@ -615,9 +615,9 @@ struct bau_control {
2751         cycles_t                send_message;
2752         cycles_t                period_end;
2753         cycles_t                period_time;
2754 -       spinlock_t              uvhub_lock;
2755 -       spinlock_t              queue_lock;
2756 -       spinlock_t              disable_lock;
2757 +       raw_spinlock_t          uvhub_lock;
2758 +       raw_spinlock_t          queue_lock;
2759 +       raw_spinlock_t          disable_lock;
2760         /* tunables */
2761         int                     max_concurr;
2762         int                     max_concurr_const;
2763 @@ -776,15 +776,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2764   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2765   * on equal.
2766   */
2767 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2768 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2770 -       spin_lock(lock);
2771 +       raw_spin_lock(lock);
2772         if (atomic_read(v) >= u) {
2773 -               spin_unlock(lock);
2774 +               raw_spin_unlock(lock);
2775                 return 0;
2776         }
2777         atomic_inc(v);
2778 -       spin_unlock(lock);
2779 +       raw_spin_unlock(lock);
2780         return 1;
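atomic_inc_unless_ge() only changes lock type here; the logic (take the hub lock, refuse the increment once the counter has reached the bound, otherwise bump it) is untouched. A stand-alone user-space model of that pattern, with a pthread mutex standing in for the raw spinlock and a plain int for the atomic_t; the names are illustrative, not kernel API:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* stand-in for raw_spinlock_t */

/* Increment *v unless it is already >= u; returns 1 on success, 0 otherwise. */
static int inc_unless_ge(int *v, int u)
{
	int done = 0;

	pthread_mutex_lock(&lock);
	if (*v < u) {
		(*v)++;
		done = 1;
	}
	pthread_mutex_unlock(&lock);
	return done;
}

int main(void)
{
	int active = 2;

	printf("%d\n", inc_unless_ge(&active, 3));	/* 1: 2 -> 3 */
	printf("%d\n", inc_unless_ge(&active, 3));	/* 0: already at the limit */
	return 0;
}

Build with: cc inc_unless_ge.c -pthread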
2783 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
2784 index ea7074784cc4..01ec643ce66e 100644
2785 --- a/arch/x86/include/asm/uv/uv_hub.h
2786 +++ b/arch/x86/include/asm/uv/uv_hub.h
2787 @@ -492,7 +492,7 @@ struct uv_blade_info {
2788         unsigned short  nr_online_cpus;
2789         unsigned short  pnode;
2790         short           memory_nid;
2791 -       spinlock_t      nmi_lock;       /* obsolete, see uv_hub_nmi */
2792 +       raw_spinlock_t  nmi_lock;       /* obsolete, see uv_hub_nmi */
2793         unsigned long   nmi_count;      /* obsolete, see uv_hub_nmi */
2794  };
2795  extern struct uv_blade_info *uv_blade_info;
2796 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2797 index a1e4a6c3f394..86adbf86f366 100644
2798 --- a/arch/x86/kernel/acpi/boot.c
2799 +++ b/arch/x86/kernel/acpi/boot.c
2800 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2801   *             ->ioapic_mutex
2802   *                     ->ioapic_lock
2803   */
2804 +#ifdef CONFIG_X86_IO_APIC
2805  static DEFINE_MUTEX(acpi_ioapic_lock);
2806 +#endif
2808  /* --------------------------------------------------------------------------
2809                                Boot-time Configuration
2810 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2811 index fc91c98bee01..29c4f1b078a1 100644
2812 --- a/arch/x86/kernel/apic/io_apic.c
2813 +++ b/arch/x86/kernel/apic/io_apic.c
2814 @@ -1711,7 +1711,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2815  static inline bool ioapic_irqd_mask(struct irq_data *data)
2817         /* If we are moving the irq we need to mask it */
2818 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2819 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2820 +                    !irqd_irq_inprogress(data))) {
2821                 mask_ioapic_irq(data);
2822                 return true;
2823         }
2824 diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
2825 index 4a139465f1d4..ad2afff02b36 100644
2826 --- a/arch/x86/kernel/apic/x2apic_uv_x.c
2827 +++ b/arch/x86/kernel/apic/x2apic_uv_x.c
2828 @@ -947,7 +947,7 @@ void __init uv_system_init(void)
2829                         uv_blade_info[blade].pnode = pnode;
2830                         uv_blade_info[blade].nr_possible_cpus = 0;
2831                         uv_blade_info[blade].nr_online_cpus = 0;
2832 -                       spin_lock_init(&uv_blade_info[blade].nmi_lock);
2833 +                       raw_spin_lock_init(&uv_blade_info[blade].nmi_lock);
2834                         min_pnode = min(pnode, min_pnode);
2835                         max_pnode = max(pnode, max_pnode);
2836                         blade++;
2837 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2838 index 439df975bc7a..b7954ddd6a0a 100644
2839 --- a/arch/x86/kernel/asm-offsets.c
2840 +++ b/arch/x86/kernel/asm-offsets.c
2841 @@ -32,6 +32,7 @@ void common(void) {
2842         OFFSET(TI_flags, thread_info, flags);
2843         OFFSET(TI_status, thread_info, status);
2844         OFFSET(TI_addr_limit, thread_info, addr_limit);
2845 +       OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
2847         BLANK();
2848         OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
2849 @@ -89,4 +90,5 @@ void common(void) {
2851         BLANK();
2852         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2853 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2855 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2856 index 7e8a736d09db..430a4ec07811 100644
2857 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2858 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2859 @@ -41,6 +41,8 @@
2860  #include <linux/debugfs.h>
2861  #include <linux/irq_work.h>
2862  #include <linux/export.h>
2863 +#include <linux/jiffies.h>
2864 +#include <linux/swork.h>
2866  #include <asm/processor.h>
2867  #include <asm/traps.h>
2868 @@ -1236,7 +1238,7 @@ void mce_log_therm_throt_event(__u64 status)
2869  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2871  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2872 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2873 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2875  static unsigned long mce_adjust_timer_default(unsigned long interval)
2877 @@ -1245,32 +1247,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2879  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2881 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2882 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2884 -       unsigned long when = jiffies + interval;
2885 -       unsigned long flags;
2887 -       local_irq_save(flags);
2889 -       if (timer_pending(t)) {
2890 -               if (time_before(when, t->expires))
2891 -                       mod_timer_pinned(t, when);
2892 -       } else {
2893 -               t->expires = round_jiffies(when);
2894 -               add_timer_on(t, smp_processor_id());
2895 -       }
2897 -       local_irq_restore(flags);
2898 +       if (!interval)
2899 +               return HRTIMER_NORESTART;
2900 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2901 +       return HRTIMER_RESTART;
2904 -static void mce_timer_fn(unsigned long data)
2905 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2907 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2908 -       int cpu = smp_processor_id();
2909         unsigned long iv;
2911 -       WARN_ON(cpu != data);
2913         iv = __this_cpu_read(mce_next_interval);
2915         if (mce_available(this_cpu_ptr(&cpu_info))) {
2916 @@ -1293,7 +1281,7 @@ static void mce_timer_fn(unsigned long data)
2918  done:
2919         __this_cpu_write(mce_next_interval, iv);
2920 -       __restart_timer(t, iv);
2921 +       return __restart_timer(timer, iv);
2924  /*
2925 @@ -1301,7 +1289,7 @@ static void mce_timer_fn(unsigned long data)
2926   */
2927  void mce_timer_kick(unsigned long interval)
2929 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2930 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2931         unsigned long iv = __this_cpu_read(mce_next_interval);
2933         __restart_timer(t, interval);
2934 @@ -1316,7 +1304,7 @@ static void mce_timer_delete_all(void)
2935         int cpu;
2937         for_each_online_cpu(cpu)
2938 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2939 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2942  static void mce_do_trigger(struct work_struct *work)
2943 @@ -1326,6 +1314,56 @@ static void mce_do_trigger(struct work_struct *work)
2945  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2947 +static void __mce_notify_work(struct swork_event *event)
2949 +       /* Not more than two messages every minute */
2950 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2952 +       /* wake processes polling /dev/mcelog */
2953 +       wake_up_interruptible(&mce_chrdev_wait);
2955 +       /*
2956 +        * There is no risk of missing notifications because
2957 +        * work_pending is always cleared before the function is
2958 +        * executed.
2959 +        */
2960 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2961 +               schedule_work(&mce_trigger_work);
2963 +       if (__ratelimit(&ratelimit))
2964 +               pr_info(HW_ERR "Machine check events logged\n");
2967 +#ifdef CONFIG_PREEMPT_RT_FULL
2968 +static bool notify_work_ready __read_mostly;
2969 +static struct swork_event notify_work;
2971 +static int mce_notify_work_init(void)
2973 +       int err;
2975 +       err = swork_get();
2976 +       if (err)
2977 +               return err;
2979 +       INIT_SWORK(&notify_work, __mce_notify_work);
2980 +       notify_work_ready = true;
2981 +       return 0;
2984 +static void mce_notify_work(void)
2986 +       if (notify_work_ready)
2987 +               swork_queue(&notify_work);
2989 +#else
2990 +static void mce_notify_work(void)
2992 +       __mce_notify_work(NULL);
2994 +static inline int mce_notify_work_init(void) { return 0; }
2995 +#endif
2997  /*
2998   * Notify the user(s) about new machine check events.
2999   * Can be called from interrupt context, but not from machine check/NMI
3000 @@ -1333,19 +1371,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
3001   */
3002  int mce_notify_irq(void)
3004 -       /* Not more than two messages every minute */
3005 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
3007         if (test_and_clear_bit(0, &mce_need_notify)) {
3008 -               /* wake processes polling /dev/mcelog */
3009 -               wake_up_interruptible(&mce_chrdev_wait);
3011 -               if (mce_helper[0])
3012 -                       schedule_work(&mce_trigger_work);
3014 -               if (__ratelimit(&ratelimit))
3015 -                       pr_info(HW_ERR "Machine check events logged\n");
3017 +               mce_notify_work();
3018                 return 1;
3019         }
3020         return 0;
3021 @@ -1639,7 +1666,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
3022         }
3025 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
3026 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
3028         unsigned long iv = check_interval * HZ;
3030 @@ -1648,16 +1675,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
3032         per_cpu(mce_next_interval, cpu) = iv;
3034 -       t->expires = round_jiffies(jiffies + iv);
3035 -       add_timer_on(t, cpu);
3036 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
3037 +                       0, HRTIMER_MODE_REL_PINNED);
3040  static void __mcheck_cpu_init_timer(void)
3042 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
3043 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
3044         unsigned int cpu = smp_processor_id();
3046 -       setup_timer(t, mce_timer_fn, cpu);
3047 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3048 +       t->function = mce_timer_fn;
3049         mce_start_timer(cpu, t);
3052 @@ -2376,6 +2404,8 @@ static void mce_disable_cpu(void *h)
3053         if (!mce_available(raw_cpu_ptr(&cpu_info)))
3054                 return;
3056 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
3058         if (!(action & CPU_TASKS_FROZEN))
3059                 cmci_clear();
3061 @@ -2398,6 +2428,7 @@ static void mce_reenable_cpu(void *h)
3062                 if (b->init)
3063                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
3064         }
3065 +       __mcheck_cpu_init_timer();
3068  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
3069 @@ -2405,7 +2436,6 @@ static int
3070  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3072         unsigned int cpu = (unsigned long)hcpu;
3073 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
3075         switch (action & ~CPU_TASKS_FROZEN) {
3076         case CPU_ONLINE:
3077 @@ -2425,11 +2455,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3078                 break;
3079         case CPU_DOWN_PREPARE:
3080                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
3081 -               del_timer_sync(t);
3082                 break;
3083         case CPU_DOWN_FAILED:
3084                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
3085 -               mce_start_timer(cpu, t);
3086                 break;
3087         }
3089 @@ -2468,6 +2496,10 @@ static __init int mcheck_init_device(void)
3090                 goto err_out;
3091         }
3093 +       err = mce_notify_work_init();
3094 +       if (err)
3095 +               goto err_out;
3097         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
3098                 err = -ENOMEM;
3099                 goto err_out;
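Because the MCE poller now runs from an hrtimer instead of a timer_list, its jiffies-based interval has to be expressed in nanoseconds; the hunk does this with ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL). A stand-alone sketch of just that arithmetic; HZ and jiffies_to_usecs() are re-implemented here for illustration, and HZ=250 is only an example configuration:

#include <stdio.h>
#include <stdint.h>

#define HZ 250U				/* example; the real value is a build-time config */

/* Simplified jiffies_to_usecs() for HZ values that divide 1000000 evenly. */
static uint64_t jiffies_to_usecs(uint64_t j)
{
	return j * (1000000U / HZ);
}

int main(void)
{
	uint64_t check_interval = 5 * 60;		/* seconds; INITIAL_CHECK_INTERVAL in mce.c */
	uint64_t iv = check_interval * HZ;		/* jiffies, as computed in mce_start_timer() */
	uint64_t ns = jiffies_to_usecs(iv) * 1000ULL;	/* value handed to hrtimer_start_range_ns() */

	printf("iv = %llu jiffies -> %llu ns (%.0f s)\n",
	       (unsigned long long)iv, (unsigned long long)ns, ns / 1e9);
	return 0;
}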
3100 diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3101 index ed446bdcbf31..d2ac364e2118 100644
3102 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3103 +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3104 @@ -117,7 +117,7 @@ static struct perf_pmu_events_attr event_attr_##v = {                       \
3105  };
3107  struct rapl_pmu {
3108 -       spinlock_t       lock;
3109 +       raw_spinlock_t   lock;
3110         int              n_active; /* number of active events */
3111         struct list_head active_list;
3112         struct pmu       *pmu; /* pointer to rapl_pmu_class */
3113 @@ -220,13 +220,13 @@ static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
3114         if (!pmu->n_active)
3115                 return HRTIMER_NORESTART;
3117 -       spin_lock_irqsave(&pmu->lock, flags);
3118 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3120         list_for_each_entry(event, &pmu->active_list, active_entry) {
3121                 rapl_event_update(event);
3122         }
3124 -       spin_unlock_irqrestore(&pmu->lock, flags);
3125 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3127         hrtimer_forward_now(hrtimer, pmu->timer_interval);
3129 @@ -263,9 +263,9 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode)
3130         struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
3131         unsigned long flags;
3133 -       spin_lock_irqsave(&pmu->lock, flags);
3134 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3135         __rapl_pmu_event_start(pmu, event);
3136 -       spin_unlock_irqrestore(&pmu->lock, flags);
3137 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3140  static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3141 @@ -274,7 +274,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3142         struct hw_perf_event *hwc = &event->hw;
3143         unsigned long flags;
3145 -       spin_lock_irqsave(&pmu->lock, flags);
3146 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3148         /* mark event as deactivated and stopped */
3149         if (!(hwc->state & PERF_HES_STOPPED)) {
3150 @@ -299,7 +299,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3151                 hwc->state |= PERF_HES_UPTODATE;
3152         }
3154 -       spin_unlock_irqrestore(&pmu->lock, flags);
3155 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3158  static int rapl_pmu_event_add(struct perf_event *event, int mode)
3159 @@ -308,14 +308,14 @@ static int rapl_pmu_event_add(struct perf_event *event, int mode)
3160         struct hw_perf_event *hwc = &event->hw;
3161         unsigned long flags;
3163 -       spin_lock_irqsave(&pmu->lock, flags);
3164 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3166         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
3168         if (mode & PERF_EF_START)
3169                 __rapl_pmu_event_start(pmu, event);
3171 -       spin_unlock_irqrestore(&pmu->lock, flags);
3172 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3174         return 0;
3176 @@ -603,7 +603,7 @@ static int rapl_cpu_prepare(int cpu)
3177         pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
3178         if (!pmu)
3179                 return -1;
3180 -       spin_lock_init(&pmu->lock);
3181 +       raw_spin_lock_init(&pmu->lock);
3183         INIT_LIST_HEAD(&pmu->active_list);
3185 diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
3186 index 464ffd69b92e..00db1aad1548 100644
3187 --- a/arch/x86/kernel/dumpstack_32.c
3188 +++ b/arch/x86/kernel/dumpstack_32.c
3189 @@ -42,7 +42,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3190                 unsigned long *stack, unsigned long bp,
3191                 const struct stacktrace_ops *ops, void *data)
3193 -       const unsigned cpu = get_cpu();
3194 +       const unsigned cpu = get_cpu_light();
3195         int graph = 0;
3196         u32 *prev_esp;
3198 @@ -86,7 +86,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3199                         break;
3200                 touch_nmi_watchdog();
3201         }
3202 -       put_cpu();
3203 +       put_cpu_light();
3205  EXPORT_SYMBOL(dump_trace);
3207 diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
3208 index 5f1c6266eb30..c331e3fef465 100644
3209 --- a/arch/x86/kernel/dumpstack_64.c
3210 +++ b/arch/x86/kernel/dumpstack_64.c
3211 @@ -152,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3212                 unsigned long *stack, unsigned long bp,
3213                 const struct stacktrace_ops *ops, void *data)
3215 -       const unsigned cpu = get_cpu();
3216 +       const unsigned cpu = get_cpu_light();
3217         struct thread_info *tinfo;
3218         unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
3219         unsigned long dummy;
3220 @@ -241,7 +241,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3221          * This handles the process stack:
3222          */
3223         bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
3224 -       put_cpu();
3225 +       put_cpu_light();
3227  EXPORT_SYMBOL(dump_trace);
3229 @@ -255,7 +255,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3230         int cpu;
3231         int i;
3233 -       preempt_disable();
3234 +       migrate_disable();
3235         cpu = smp_processor_id();
3237         irq_stack_end   = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
3238 @@ -291,7 +291,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3239                         pr_cont(" %016lx", *stack++);
3240                 touch_nmi_watchdog();
3241         }
3242 -       preempt_enable();
3243 +       migrate_enable();
3245         pr_cont("\n");
3246         show_trace_log_lvl(task, regs, sp, bp, log_lvl);
3247 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
3248 index 38da8f29a9c8..ce71f7098f15 100644
3249 --- a/arch/x86/kernel/irq_32.c
3250 +++ b/arch/x86/kernel/irq_32.c
3251 @@ -128,6 +128,7 @@ void irq_ctx_init(int cpu)
3252                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
3255 +#ifndef CONFIG_PREEMPT_RT_FULL
3256  void do_softirq_own_stack(void)
3258         struct thread_info *curstk;
3259 @@ -146,6 +147,7 @@ void do_softirq_own_stack(void)
3261         call_on_stack(__do_softirq, isp);
3263 +#endif
3265  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
3267 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
3268 index 32187f8a49b4..1f456f1e06f8 100644
3269 --- a/arch/x86/kernel/kvm.c
3270 +++ b/arch/x86/kernel/kvm.c
3271 @@ -36,6 +36,7 @@
3272  #include <linux/kprobes.h>
3273  #include <linux/debugfs.h>
3274  #include <linux/nmi.h>
3275 +#include <linux/swait.h>
3276  #include <asm/timer.h>
3277  #include <asm/cpu.h>
3278  #include <asm/traps.h>
3279 @@ -91,14 +92,14 @@ static void kvm_io_delay(void)
3281  struct kvm_task_sleep_node {
3282         struct hlist_node link;
3283 -       wait_queue_head_t wq;
3284 +       struct swait_queue_head wq;
3285         u32 token;
3286         int cpu;
3287         bool halted;
3288  };
3290  static struct kvm_task_sleep_head {
3291 -       spinlock_t lock;
3292 +       raw_spinlock_t lock;
3293         struct hlist_head list;
3294  } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
3296 @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token)
3297         u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
3298         struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
3299         struct kvm_task_sleep_node n, *e;
3300 -       DEFINE_WAIT(wait);
3301 +       DECLARE_SWAITQUEUE(wait);
3303         rcu_irq_enter();
3305 -       spin_lock(&b->lock);
3306 +       raw_spin_lock(&b->lock);
3307         e = _find_apf_task(b, token);
3308         if (e) {
3309                 /* dummy entry exist -> wake up was delivered ahead of PF */
3310                 hlist_del(&e->link);
3311                 kfree(e);
3312 -               spin_unlock(&b->lock);
3313 +               raw_spin_unlock(&b->lock);
3315                 rcu_irq_exit();
3316                 return;
3317 @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token)
3318         n.token = token;
3319         n.cpu = smp_processor_id();
3320         n.halted = is_idle_task(current) || preempt_count() > 1;
3321 -       init_waitqueue_head(&n.wq);
3322 +       init_swait_queue_head(&n.wq);
3323         hlist_add_head(&n.link, &b->list);
3324 -       spin_unlock(&b->lock);
3325 +       raw_spin_unlock(&b->lock);
3327         for (;;) {
3328                 if (!n.halted)
3329 -                       prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3330 +                       prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3331                 if (hlist_unhashed(&n.link))
3332                         break;
3334 @@ -168,7 +169,7 @@ void kvm_async_pf_task_wait(u32 token)
3335                 rcu_irq_enter();
3336         }
3337         if (!n.halted)
3338 -               finish_wait(&n.wq, &wait);
3339 +               finish_swait(&n.wq, &wait);
3341         rcu_irq_exit();
3342         return;
3343 @@ -180,8 +181,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
3344         hlist_del_init(&n->link);
3345         if (n->halted)
3346                 smp_send_reschedule(n->cpu);
3347 -       else if (waitqueue_active(&n->wq))
3348 -               wake_up(&n->wq);
3349 +       else if (swait_active(&n->wq))
3350 +               swake_up(&n->wq);
3353  static void apf_task_wake_all(void)
3354 @@ -191,14 +192,14 @@ static void apf_task_wake_all(void)
3355         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
3356                 struct hlist_node *p, *next;
3357                 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
3358 -               spin_lock(&b->lock);
3359 +               raw_spin_lock(&b->lock);
3360                 hlist_for_each_safe(p, next, &b->list) {
3361                         struct kvm_task_sleep_node *n =
3362                                 hlist_entry(p, typeof(*n), link);
3363                         if (n->cpu == smp_processor_id())
3364                                 apf_task_wake_one(n);
3365                 }
3366 -               spin_unlock(&b->lock);
3367 +               raw_spin_unlock(&b->lock);
3368         }
3371 @@ -214,7 +215,7 @@ void kvm_async_pf_task_wake(u32 token)
3372         }
3374  again:
3375 -       spin_lock(&b->lock);
3376 +       raw_spin_lock(&b->lock);
3377         n = _find_apf_task(b, token);
3378         if (!n) {
3379                 /*
3380 @@ -227,17 +228,17 @@ void kvm_async_pf_task_wake(u32 token)
3381                          * Allocation failed! Busy wait while other cpu
3382                          * handles async PF.
3383                          */
3384 -                       spin_unlock(&b->lock);
3385 +                       raw_spin_unlock(&b->lock);
3386                         cpu_relax();
3387                         goto again;
3388                 }
3389                 n->token = token;
3390                 n->cpu = smp_processor_id();
3391 -               init_waitqueue_head(&n->wq);
3392 +               init_swait_queue_head(&n->wq);
3393                 hlist_add_head(&n->link, &b->list);
3394         } else
3395                 apf_task_wake_one(n);
3396 -       spin_unlock(&b->lock);
3397 +       raw_spin_unlock(&b->lock);
3398         return;
3400  EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
3401 @@ -488,7 +489,7 @@ void __init kvm_guest_init(void)
3402         paravirt_ops_setup();
3403         register_reboot_notifier(&kvm_pv_reboot_nb);
3404         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
3405 -               spin_lock_init(&async_pf_sleepers[i].lock);
3406 +               raw_spin_lock_init(&async_pf_sleepers[i].lock);
3407         if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
3408                 x86_init.irqs.trap_init = kvm_apf_trap_init;
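The async page-fault sleepers move from wait_queue_head_t to the simple wait queues (swait), whose prepare/check/schedule/finish loop has the same shape as a classic condition-variable wait. A self-contained pthread model of that shape, for illustration only; swait has no user-space counterpart, so every name below is a stand-in:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;	/* stands in for struct swait_queue_head */
static int token_delivered;				/* stands in for "node unhashed" */

static void *waiter(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!token_delivered)		/* prepare_to_swait(); check; schedule() */
		pthread_cond_wait(&wq, &lock);
	pthread_mutex_unlock(&lock);		/* finish_swait() */
	printf("waiter: token delivered, continuing\n");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);				/* let the waiter block */

	pthread_mutex_lock(&lock);
	token_delivered = 1;
	pthread_cond_signal(&wq);		/* like swake_up(&n->wq) in apf_task_wake_one() */
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}

Build with: cc swait_model.c -pthread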
3410 diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
3411 index 697f90db0e37..424aec4a4c71 100644
3412 --- a/arch/x86/kernel/nmi.c
3413 +++ b/arch/x86/kernel/nmi.c
3414 @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
3415  #endif
3417         if (panic_on_unrecovered_nmi)
3418 -               panic("NMI: Not continuing");
3419 +               nmi_panic(regs, "NMI: Not continuing");
3421         pr_emerg("Dazed and confused, but trying to continue\n");
3423 @@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
3424                  reason, smp_processor_id());
3425         show_regs(regs);
3427 -       if (panic_on_io_nmi)
3428 -               panic("NMI IOCK error: Not continuing");
3429 +       if (panic_on_io_nmi) {
3430 +               nmi_panic(regs, "NMI IOCK error: Not continuing");
3432 +               /*
3433 +                * If we end up here, it means we have received an NMI while
3434 +                * processing panic(). Simply return without delaying and
3435 +                * re-enabling NMIs.
3436 +                */
3437 +               return;
3438 +       }
3440         /* Re-enable the IOCK line, wait for a few seconds */
3441         reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
3442 @@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
3444         pr_emerg("Do you have a strange power saving mode enabled?\n");
3445         if (unknown_nmi_panic || panic_on_unrecovered_nmi)
3446 -               panic("NMI: Not continuing");
3447 +               nmi_panic(regs, "NMI: Not continuing");
3449         pr_emerg("Dazed and confused, but trying to continue\n");
3451 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
3452 index 9f950917528b..4dd4beae917a 100644
3453 --- a/arch/x86/kernel/process_32.c
3454 +++ b/arch/x86/kernel/process_32.c
3455 @@ -35,6 +35,7 @@
3456  #include <linux/uaccess.h>
3457  #include <linux/io.h>
3458  #include <linux/kdebug.h>
3459 +#include <linux/highmem.h>
3461  #include <asm/pgtable.h>
3462  #include <asm/ldt.h>
3463 @@ -210,6 +211,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
3465  EXPORT_SYMBOL_GPL(start_thread);
3467 +#ifdef CONFIG_PREEMPT_RT_FULL
3468 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
3470 +       int i;
3472 +       /*
3473 +        * Clear @prev's kmap_atomic mappings
3474 +        */
3475 +       for (i = 0; i < prev_p->kmap_idx; i++) {
3476 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3477 +               pte_t *ptep = kmap_pte - idx;
3479 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
3480 +       }
3481 +       /*
3482 +        * Restore @next_p's kmap_atomic mappings
3483 +        */
3484 +       for (i = 0; i < next_p->kmap_idx; i++) {
3485 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3487 +               if (!pte_none(next_p->kmap_pte[i]))
3488 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
3489 +       }
3491 +#else
3492 +static inline void
3493 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3494 +#endif
3497  /*
3498   *     switch_to(x,y) should switch tasks from x to y.
3499 @@ -286,6 +316,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
3500                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
3501                 __switch_to_xtra(prev_p, next_p, tss);
3503 +       switch_kmaps(prev_p, next_p);
3505         /*
3506          * Leave lazy mode, flushing any hypercalls made here.
3507          * This must be done before restoring TLS segments so
3508 diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
3509 index 9a16932c7258..219ffb9ba3a9 100644
3510 --- a/arch/x86/kernel/reboot.c
3511 +++ b/arch/x86/kernel/reboot.c
3512 @@ -730,6 +730,7 @@ static int crashing_cpu;
3513  static nmi_shootdown_cb shootdown_callback;
3515  static atomic_t waiting_for_crash_ipi;
3516 +static int crash_ipi_issued;
3518  static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
3520 @@ -792,6 +793,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3522         smp_send_nmi_allbutself();
3524 +       /* Kick CPUs looping in NMI context. */
3525 +       WRITE_ONCE(crash_ipi_issued, 1);
3527         msecs = 1000; /* Wait at most a second for the other cpus to stop */
3528         while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
3529                 mdelay(1);
3530 @@ -800,6 +804,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3532         /* Leave the nmi callback set */
3535 +/* Override the weak function in kernel/panic.c */
3536 +void nmi_panic_self_stop(struct pt_regs *regs)
3538 +       while (1) {
3539 +               /*
3540 +                * Wait for the crash dumping IPI to be issued, and then
3541 +                * call its callback directly.
3542 +                */
3543 +               if (READ_ONCE(crash_ipi_issued))
3544 +                       crash_nmi_callback(0, regs); /* Don't return */
3546 +               cpu_relax();
3547 +       }
3550  #else /* !CONFIG_SMP */
3551  void nmi_shootdown_cpus(nmi_shootdown_cb callback)
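nmi_panic_self_stop() spins on crash_ipi_issued: the CPU running nmi_shootdown_cpus() publishes the flag with WRITE_ONCE(), and CPUs stuck in NMI context poll it with READ_ONCE() before running the crash callback themselves. A stand-alone C11 model of that publish/poll handshake; relaxed atomics stand in for READ_ONCE/WRITE_ONCE, and crash_callback() is a placeholder:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <sched.h>

static atomic_int crash_ipi_issued;		/* the flag polled with READ_ONCE() in the patch */

static void crash_callback(int cpu)		/* placeholder for crash_nmi_callback() */
{
	printf("cpu %d: running crash callback\n", cpu);
}

static void *stuck_in_nmi(void *arg)		/* models nmi_panic_self_stop() */
{
	int cpu = (int)(long)arg;

	while (!atomic_load_explicit(&crash_ipi_issued, memory_order_relaxed))
		sched_yield();			/* cpu_relax() in the kernel loop */

	crash_callback(cpu);			/* the real callback does not return */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, stuck_in_nmi, (void *)1L);
	sleep(1);

	/* models WRITE_ONCE(crash_ipi_issued, 1) in nmi_shootdown_cpus() */
	atomic_store_explicit(&crash_ipi_issued, 1, memory_order_relaxed);

	pthread_join(t, NULL);
	return 0;
}

Build with: cc crash_ipi_model.c -pthread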
3553 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
3554 index 1c96f09367ae..ffb6d9859122 100644
3555 --- a/arch/x86/kvm/lapic.c
3556 +++ b/arch/x86/kvm/lapic.c
3557 @@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
3558  static void apic_timer_expired(struct kvm_lapic *apic)
3560         struct kvm_vcpu *vcpu = apic->vcpu;
3561 -       wait_queue_head_t *q = &vcpu->wq;
3562 +       struct swait_queue_head *q = &vcpu->wq;
3563         struct kvm_timer *ktimer = &apic->lapic_timer;
3565         if (atomic_read(&apic->lapic_timer.pending))
3566 @@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
3567         atomic_inc(&apic->lapic_timer.pending);
3568         kvm_set_pending_timer(vcpu);
3570 -       if (waitqueue_active(q))
3571 -               wake_up_interruptible(q);
3572 +       if (swait_active(q))
3573 +               swake_up(q);
3575         if (apic_lvtt_tscdeadline(apic))
3576                 ktimer->expired_tscdeadline = ktimer->tscdeadline;
3577 @@ -1801,6 +1801,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
3578         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
3579                      HRTIMER_MODE_ABS);
3580         apic->lapic_timer.timer.function = apic_timer_fn;
3581 +       apic->lapic_timer.timer.irqsafe = 1;
3583         /*
3584          * APIC is created enabled. This will prevent kvm_lapic_set_base from
3585 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
3586 index f973cfa8ff4f..b63350721623 100644
3587 --- a/arch/x86/kvm/x86.c
3588 +++ b/arch/x86/kvm/x86.c
3589 @@ -5836,6 +5836,13 @@ int kvm_arch_init(void *opaque)
3590                 goto out;
3591         }
3593 +#ifdef CONFIG_PREEMPT_RT_FULL
3594 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3595 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
3596 +               return -EOPNOTSUPP;
3597 +       }
3598 +#endif
3600         r = kvm_mmu_module_init();
3601         if (r)
3602                 goto out_free_percpu;
3603 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
3604 index a6d739258137..bd24ba1c4a86 100644
3605 --- a/arch/x86/mm/highmem_32.c
3606 +++ b/arch/x86/mm/highmem_32.c
3607 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
3608   */
3609  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3611 +       pte_t pte = mk_pte(page, prot);
3612         unsigned long vaddr;
3613         int idx, type;
3615 -       preempt_disable();
3616 +       preempt_disable_nort();
3617         pagefault_disable();
3619         if (!PageHighMem(page))
3620 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3621         idx = type + KM_TYPE_NR*smp_processor_id();
3622         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3623         BUG_ON(!pte_none(*(kmap_pte-idx)));
3624 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
3625 +#ifdef CONFIG_PREEMPT_RT_FULL
3626 +       current->kmap_pte[type] = pte;
3627 +#endif
3628 +       set_pte(kmap_pte-idx, pte);
3629         arch_flush_lazy_mmu_mode();
3631         return (void *)vaddr;
3632 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
3633                  * is a bad idea also, in case the page changes cacheability
3634                  * attributes or becomes a protected page in a hypervisor.
3635                  */
3636 +#ifdef CONFIG_PREEMPT_RT_FULL
3637 +               current->kmap_pte[type] = __pte(0);
3638 +#endif
3639                 kpte_clear_flush(kmap_pte-idx, vaddr);
3640                 kmap_atomic_idx_pop();
3641                 arch_flush_lazy_mmu_mode();
3642 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
3643  #endif
3645         pagefault_enable();
3646 -       preempt_enable();
3647 +       preempt_enable_nort();
3649  EXPORT_SYMBOL(__kunmap_atomic);
3651 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
3652 index 9c0ff045fdd4..dd25dd1671b6 100644
3653 --- a/arch/x86/mm/iomap_32.c
3654 +++ b/arch/x86/mm/iomap_32.c
3655 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
3657  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3659 +       pte_t pte = pfn_pte(pfn, prot);
3660         unsigned long vaddr;
3661         int idx, type;
3663 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3664         type = kmap_atomic_idx_push();
3665         idx = type + KM_TYPE_NR * smp_processor_id();
3666         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3667 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
3668 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
3670 +#ifdef CONFIG_PREEMPT_RT_FULL
3671 +       current->kmap_pte[type] = pte;
3672 +#endif
3673 +       set_pte(kmap_pte - idx, pte);
3674         arch_flush_lazy_mmu_mode();
3676         return (void *)vaddr;
3677 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
3678                  * is a bad idea also, in case the page changes cacheability
3679                  * attributes or becomes a protected page in a hypervisor.
3680                  */
3681 +#ifdef CONFIG_PREEMPT_RT_FULL
3682 +               current->kmap_pte[type] = __pte(0);
3683 +#endif
3684                 kpte_clear_flush(kmap_pte-idx, vaddr);
3685                 kmap_atomic_idx_pop();
3686         }
3687 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
3688 index 79377e2a7bcd..dca36c1f1966 100644
3689 --- a/arch/x86/mm/pageattr.c
3690 +++ b/arch/x86/mm/pageattr.c
3691 @@ -209,7 +209,15 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
3692                             int in_flags, struct page **pages)
3694         unsigned int i, level;
3695 +#ifdef CONFIG_PREEMPT
3696 +       /*
3697 +        * Avoid wbinvd() because it causes latencies on all CPUs,
3698 +        * regardless of any CPU isolation that may be in effect.
3699 +        */
3700 +       unsigned long do_wbinvd = 0;
3701 +#else
3702         unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
3703 +#endif
3705         BUG_ON(irqs_disabled());
3707 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
3708 index 3b6ec42718e4..7871083de089 100644
3709 --- a/arch/x86/platform/uv/tlb_uv.c
3710 +++ b/arch/x86/platform/uv/tlb_uv.c
3711 @@ -714,9 +714,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
3713                 quiesce_local_uvhub(hmaster);
3715 -               spin_lock(&hmaster->queue_lock);
3716 +               raw_spin_lock(&hmaster->queue_lock);
3717                 reset_with_ipi(&bau_desc->distribution, bcp);
3718 -               spin_unlock(&hmaster->queue_lock);
3719 +               raw_spin_unlock(&hmaster->queue_lock);
3721                 end_uvhub_quiesce(hmaster);
3723 @@ -736,9 +736,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
3725                 quiesce_local_uvhub(hmaster);
3727 -               spin_lock(&hmaster->queue_lock);
3728 +               raw_spin_lock(&hmaster->queue_lock);
3729                 reset_with_ipi(&bau_desc->distribution, bcp);
3730 -               spin_unlock(&hmaster->queue_lock);
3731 +               raw_spin_unlock(&hmaster->queue_lock);
3733                 end_uvhub_quiesce(hmaster);
3735 @@ -759,7 +759,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3736         cycles_t tm1;
3738         hmaster = bcp->uvhub_master;
3739 -       spin_lock(&hmaster->disable_lock);
3740 +       raw_spin_lock(&hmaster->disable_lock);
3741         if (!bcp->baudisabled) {
3742                 stat->s_bau_disabled++;
3743                 tm1 = get_cycles();
3744 @@ -772,7 +772,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3745                         }
3746                 }
3747         }
3748 -       spin_unlock(&hmaster->disable_lock);
3749 +       raw_spin_unlock(&hmaster->disable_lock);
3752  static void count_max_concurr(int stat, struct bau_control *bcp,
3753 @@ -835,7 +835,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
3754   */
3755  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
3757 -       spinlock_t *lock = &hmaster->uvhub_lock;
3758 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
3759         atomic_t *v;
3761         v = &hmaster->active_descriptor_count;
3762 @@ -968,7 +968,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3763         struct bau_control *hmaster;
3765         hmaster = bcp->uvhub_master;
3766 -       spin_lock(&hmaster->disable_lock);
3767 +       raw_spin_lock(&hmaster->disable_lock);
3768         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
3769                 stat->s_bau_reenabled++;
3770                 for_each_present_cpu(tcpu) {
3771 @@ -980,10 +980,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3772                                 tbcp->period_giveups = 0;
3773                         }
3774                 }
3775 -               spin_unlock(&hmaster->disable_lock);
3776 +               raw_spin_unlock(&hmaster->disable_lock);
3777                 return 0;
3778         }
3779 -       spin_unlock(&hmaster->disable_lock);
3780 +       raw_spin_unlock(&hmaster->disable_lock);
3781         return -1;
3784 @@ -1901,9 +1901,9 @@ static void __init init_per_cpu_tunables(void)
3785                 bcp->cong_reps                  = congested_reps;
3786                 bcp->disabled_period =          sec_2_cycles(disabled_period);
3787                 bcp->giveup_limit =             giveup_limit;
3788 -               spin_lock_init(&bcp->queue_lock);
3789 -               spin_lock_init(&bcp->uvhub_lock);
3790 -               spin_lock_init(&bcp->disable_lock);
3791 +               raw_spin_lock_init(&bcp->queue_lock);
3792 +               raw_spin_lock_init(&bcp->uvhub_lock);
3793 +               raw_spin_lock_init(&bcp->disable_lock);
3794         }
3797 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
3798 index 2b158a9fa1d7..5e0b122620cb 100644
3799 --- a/arch/x86/platform/uv/uv_time.c
3800 +++ b/arch/x86/platform/uv/uv_time.c
3801 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
3803  /* There is one of these allocated per node */
3804  struct uv_rtc_timer_head {
3805 -       spinlock_t      lock;
3806 +       raw_spinlock_t  lock;
3807         /* next cpu waiting for timer, local node relative: */
3808         int             next_cpu;
3809         /* number of cpus on this node: */
3810 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
3811                                 uv_rtc_deallocate_timers();
3812                                 return -ENOMEM;
3813                         }
3814 -                       spin_lock_init(&head->lock);
3815 +                       raw_spin_lock_init(&head->lock);
3816                         head->ncpus = uv_blade_nr_possible_cpus(bid);
3817                         head->next_cpu = -1;
3818                         blade_info[bid] = head;
3819 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3820         unsigned long flags;
3821         int next_cpu;
3823 -       spin_lock_irqsave(&head->lock, flags);
3824 +       raw_spin_lock_irqsave(&head->lock, flags);
3826         next_cpu = head->next_cpu;
3827         *t = expires;
3828 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3829                 if (uv_setup_intr(cpu, expires)) {
3830                         *t = ULLONG_MAX;
3831                         uv_rtc_find_next_timer(head, pnode);
3832 -                       spin_unlock_irqrestore(&head->lock, flags);
3833 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
3834                         return -ETIME;
3835                 }
3836         }
3838 -       spin_unlock_irqrestore(&head->lock, flags);
3839 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3840         return 0;
3843 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3844         unsigned long flags;
3845         int rc = 0;
3847 -       spin_lock_irqsave(&head->lock, flags);
3848 +       raw_spin_lock_irqsave(&head->lock, flags);
3850         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
3851                 rc = 1;
3852 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3853                         uv_rtc_find_next_timer(head, pnode);
3854         }
3856 -       spin_unlock_irqrestore(&head->lock, flags);
3857 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3859         return rc;
3861 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
3862  static cycle_t uv_read_rtc(struct clocksource *cs)
3864         unsigned long offset;
3865 +       cycle_t cycles;
3867 +       preempt_disable();
3868         if (uv_get_min_hub_revision_id() == 1)
3869                 offset = 0;
3870         else
3871                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
3873 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3874 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3875 +       preempt_enable();
3877 +       return cycles;
3880  /*
3881 diff --git a/block/blk-core.c b/block/blk-core.c
3882 index f5f1a55703ae..75a76bcc7ac0 100644
3883 --- a/block/blk-core.c
3884 +++ b/block/blk-core.c
3885 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
3887         INIT_LIST_HEAD(&rq->queuelist);
3888         INIT_LIST_HEAD(&rq->timeout_list);
3889 +#ifdef CONFIG_PREEMPT_RT_FULL
3890 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3891 +#endif
3892         rq->cpu = -1;
3893         rq->q = q;
3894         rq->__sector = (sector_t) -1;
3895 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
3896   **/
3897  void blk_start_queue(struct request_queue *q)
3899 -       WARN_ON(!in_interrupt() && !irqs_disabled());
3900 +       WARN_ON_NONRT(!in_interrupt() && !irqs_disabled());
3902         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
3903         __blk_run_queue(q);
3904 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
3905                 if (!gfpflags_allow_blocking(gfp))
3906                         return -EBUSY;
3908 -               ret = wait_event_interruptible(q->mq_freeze_wq,
3909 +               ret = swait_event_interruptible(q->mq_freeze_wq,
3910                                 !atomic_read(&q->mq_freeze_depth) ||
3911                                 blk_queue_dying(q));
3912                 if (blk_queue_dying(q))
3913 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
3914         struct request_queue *q =
3915                 container_of(ref, struct request_queue, q_usage_counter);
3917 -       wake_up_all(&q->mq_freeze_wq);
3918 +       swake_up_all(&q->mq_freeze_wq);
3921  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3922 @@ -741,7 +744,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3923         q->bypass_depth = 1;
3924         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3926 -       init_waitqueue_head(&q->mq_freeze_wq);
3927 +       init_swait_queue_head(&q->mq_freeze_wq);
3929         /*
3930          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3931 @@ -3222,7 +3225,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
3932                 blk_run_queue_async(q);
3933         else
3934                 __blk_run_queue(q);
3935 -       spin_unlock(q->queue_lock);
3936 +       spin_unlock_irq(q->queue_lock);
3939  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3940 @@ -3270,7 +3273,6 @@ EXPORT_SYMBOL(blk_check_plugged);
3941  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3943         struct request_queue *q;
3944 -       unsigned long flags;
3945         struct request *rq;
3946         LIST_HEAD(list);
3947         unsigned int depth;
3948 @@ -3290,11 +3292,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3949         q = NULL;
3950         depth = 0;
3952 -       /*
3953 -        * Save and disable interrupts here, to avoid doing it for every
3954 -        * queue lock we have to take.
3955 -        */
3956 -       local_irq_save(flags);
3957         while (!list_empty(&list)) {
3958                 rq = list_entry_rq(list.next);
3959                 list_del_init(&rq->queuelist);
3960 @@ -3307,7 +3304,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3961                                 queue_unplugged(q, depth, from_schedule);
3962                         q = rq->q;
3963                         depth = 0;
3964 -                       spin_lock(q->queue_lock);
3965 +                       spin_lock_irq(q->queue_lock);
3966                 }
3968                 /*
3969 @@ -3334,8 +3331,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3970          */
3971         if (q)
3972                 queue_unplugged(q, depth, from_schedule);
3974 -       local_irq_restore(flags);
3977  void blk_finish_plug(struct blk_plug *plug)
3978 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3979 index 381cb50a673c..dc8785233d94 100644
3980 --- a/block/blk-ioc.c
3981 +++ b/block/blk-ioc.c
3982 @@ -7,6 +7,7 @@
3983  #include <linux/bio.h>
3984  #include <linux/blkdev.h>
3985  #include <linux/slab.h>
3986 +#include <linux/delay.h>
3988  #include "blk.h"
3990 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3991                         spin_unlock(q->queue_lock);
3992                 } else {
3993                         spin_unlock_irqrestore(&ioc->lock, flags);
3994 -                       cpu_relax();
3995 +                       cpu_chill();
3996                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3997                 }
3998         }
3999 @@ -187,7 +188,7 @@ void put_io_context_active(struct io_context *ioc)
4000                         spin_unlock(icq->q->queue_lock);
4001                 } else {
4002                         spin_unlock_irqrestore(&ioc->lock, flags);
4003 -                       cpu_relax();
4004 +                       cpu_chill();
4005                         goto retry;
4006                 }
4007         }
4008 diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
4009 index 0736729d6494..3e21e31d0d7e 100644
4010 --- a/block/blk-iopoll.c
4011 +++ b/block/blk-iopoll.c
4012 @@ -35,6 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
4013         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
4014         __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
4015         local_irq_restore(flags);
4016 +       preempt_check_resched_rt();
4018  EXPORT_SYMBOL(blk_iopoll_sched);
4020 @@ -132,6 +133,7 @@ static void blk_iopoll_softirq(struct softirq_action *h)
4021                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
4023         local_irq_enable();
4024 +       preempt_check_resched_rt();
4027  /**
4028 @@ -201,6 +203,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
4029                                  this_cpu_ptr(&blk_cpu_iopoll));
4030                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
4031                 local_irq_enable();
4032 +               preempt_check_resched_rt();
4033         }
4035         return NOTIFY_OK;
4036 diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
4037 index bb3ed488f7b5..628c6c13c482 100644
4038 --- a/block/blk-mq-cpu.c
4039 +++ b/block/blk-mq-cpu.c
4040 @@ -16,7 +16,7 @@
4041  #include "blk-mq.h"
4043  static LIST_HEAD(blk_mq_cpu_notify_list);
4044 -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
4045 +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
4047  static int blk_mq_main_cpu_notify(struct notifier_block *self,
4048                                   unsigned long action, void *hcpu)
4049 @@ -25,7 +25,10 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
4050         struct blk_mq_cpu_notifier *notify;
4051         int ret = NOTIFY_OK;
4053 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4054 +       if (action != CPU_POST_DEAD)
4055 +               return NOTIFY_OK;
4057 +       spin_lock(&blk_mq_cpu_notify_lock);
4059         list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
4060                 ret = notify->notify(notify->data, action, cpu);
4061 @@ -33,7 +36,7 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
4062                         break;
4063         }
4065 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4066 +       spin_unlock(&blk_mq_cpu_notify_lock);
4067         return ret;
4070 @@ -41,16 +44,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4072         BUG_ON(!notifier->notify);
4074 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4075 +       spin_lock(&blk_mq_cpu_notify_lock);
4076         list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
4077 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4078 +       spin_unlock(&blk_mq_cpu_notify_lock);
4081  void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4083 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4084 +       spin_lock(&blk_mq_cpu_notify_lock);
4085         list_del(&notifier->list);
4086 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4087 +       spin_unlock(&blk_mq_cpu_notify_lock);
4090  void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
4091 diff --git a/block/blk-mq.c b/block/blk-mq.c
4092 index 0d1af3e44efb..e4fc80184dd8 100644
4093 --- a/block/blk-mq.c
4094 +++ b/block/blk-mq.c
4095 @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
4097  static void blk_mq_freeze_queue_wait(struct request_queue *q)
4099 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4100 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4103  /*
4104 @@ -130,7 +130,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
4105         WARN_ON_ONCE(freeze_depth < 0);
4106         if (!freeze_depth) {
4107                 percpu_ref_reinit(&q->q_usage_counter);
4108 -               wake_up_all(&q->mq_freeze_wq);
4109 +               swake_up_all(&q->mq_freeze_wq);
4110         }
4112  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
4113 @@ -149,7 +149,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
4114          * dying, we need to ensure that processes currently waiting on
4115          * the queue are notified as well.
4116          */
4117 -       wake_up_all(&q->mq_freeze_wq);
4118 +       swake_up_all(&q->mq_freeze_wq);
4121  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
4122 @@ -196,6 +196,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
4123         rq->resid_len = 0;
4124         rq->sense = NULL;
4126 +#ifdef CONFIG_PREEMPT_RT_FULL
4127 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
4128 +#endif
4129         INIT_LIST_HEAD(&rq->timeout_list);
4130         rq->timeout = 0;
4132 @@ -325,6 +328,17 @@ void blk_mq_end_request(struct request *rq, int error)
4134  EXPORT_SYMBOL(blk_mq_end_request);
4136 +#ifdef CONFIG_PREEMPT_RT_FULL
4138 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
4140 +       struct request *rq = container_of(work, struct request, work);
4142 +       rq->q->softirq_done_fn(rq);
4145 +#else
4147  static void __blk_mq_complete_request_remote(void *data)
4149         struct request *rq = data;
4150 @@ -332,6 +346,8 @@ static void __blk_mq_complete_request_remote(void *data)
4151         rq->q->softirq_done_fn(rq);
4154 +#endif
4156  static void blk_mq_ipi_complete_request(struct request *rq)
4158         struct blk_mq_ctx *ctx = rq->mq_ctx;
4159 @@ -343,19 +359,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
4160                 return;
4161         }
4163 -       cpu = get_cpu();
4164 +       cpu = get_cpu_light();
4165         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
4166                 shared = cpus_share_cache(cpu, ctx->cpu);
4168         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
4169 +#ifdef CONFIG_PREEMPT_RT_FULL
4170 +               schedule_work_on(ctx->cpu, &rq->work);
4171 +#else
4172                 rq->csd.func = __blk_mq_complete_request_remote;
4173                 rq->csd.info = rq;
4174                 rq->csd.flags = 0;
4175                 smp_call_function_single_async(ctx->cpu, &rq->csd);
4176 +#endif
4177         } else {
4178                 rq->q->softirq_done_fn(rq);
4179         }
4180 -       put_cpu();
4181 +       put_cpu_light();
4184  static void __blk_mq_complete_request(struct request *rq)
4185 @@ -862,14 +882,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
4186                 return;
4188         if (!async) {
4189 -               int cpu = get_cpu();
4190 +               int cpu = get_cpu_light();
4191                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
4192                         __blk_mq_run_hw_queue(hctx);
4193 -                       put_cpu();
4194 +                       put_cpu_light();
4195                         return;
4196                 }
4198 -               put_cpu();
4199 +               put_cpu_light();
4200         }
4202         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
4203 @@ -1616,7 +1636,7 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
4205         struct blk_mq_hw_ctx *hctx = data;
4207 -       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
4208 +       if (action == CPU_POST_DEAD)
4209                 return blk_mq_hctx_cpu_offline(hctx, cpu);
4211         /*
4212 diff --git a/block/blk-mq.h b/block/blk-mq.h
4213 index 713820b47b31..3cb6feb4fe23 100644
4214 --- a/block/blk-mq.h
4215 +++ b/block/blk-mq.h
4216 @@ -74,7 +74,10 @@ struct blk_align_bitmap {
4217  static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4218                                            unsigned int cpu)
4220 -       return per_cpu_ptr(q->queue_ctx, cpu);
4221 +       struct blk_mq_ctx *ctx;
4223 +       ctx = per_cpu_ptr(q->queue_ctx, cpu);
4224 +       return ctx;
4227  /*
4228 @@ -85,12 +88,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4229   */
4230  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
4232 -       return __blk_mq_get_ctx(q, get_cpu());
4233 +       return __blk_mq_get_ctx(q, get_cpu_light());
4236  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
4238 -       put_cpu();
4239 +       put_cpu_light();
4242  struct blk_mq_alloc_data {
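Note: in both blk-mq.c and blk-mq.h above, get_cpu()/put_cpu() become get_cpu_light()/put_cpu_light(), RT helpers introduced elsewhere in this patch that only disable migration, so the section stays preemptible while the CPU number remains stable; the per-CPU data itself still needs its own lock, as with blk-mq's per-ctx locking. A minimal sketch under that assumption (my_ctx and my_queue_one are hypothetical; spin_lock_init() at probe time is omitted):

    /*
     * Sketch: touch this CPU's context with migration disabled but
     * preemption enabled (the RT semantics of get_cpu_light()).  The
     * data is still protected by a per-context spinlock, as in blk-mq.
     */
    #include <linux/percpu.h>
    #include <linux/smp.h>
    #include <linux/spinlock.h>

    struct my_ctx {
            spinlock_t lock;        /* init with spin_lock_init() elsewhere */
            unsigned long queued;
    };

    static DEFINE_PER_CPU(struct my_ctx, my_ctxs);

    static void my_queue_one(void)
    {
            struct my_ctx *ctx;
            int cpu;

            cpu = get_cpu_light();          /* pins the task to this CPU */
            ctx = &per_cpu(my_ctxs, cpu);
            spin_lock(&ctx->lock);
            ctx->queued++;
            spin_unlock(&ctx->lock);
            put_cpu_light();
    }
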
4243 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
4244 index 53b1737e978d..81c3c0a62edf 100644
4245 --- a/block/blk-softirq.c
4246 +++ b/block/blk-softirq.c
4247 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
4248                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4250         local_irq_restore(flags);
4251 +       preempt_check_resched_rt();
4254  /*
4255 @@ -93,6 +94,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
4256                                  this_cpu_ptr(&blk_cpu_done));
4257                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4258                 local_irq_enable();
4259 +               preempt_check_resched_rt();
4260         }
4262         return NOTIFY_OK;
4263 @@ -150,6 +152,7 @@ void __blk_complete_request(struct request *req)
4264                 goto do_local;
4266         local_irq_restore(flags);
4267 +       preempt_check_resched_rt();
4270  /**
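Note: the blk-softirq hunks only add preempt_check_resched_rt() after each irq-enable that follows raise_softirq_irqoff(); on RT the softirq runs in a thread that may have just been woken, and this call lets it be scheduled promptly, while on !RT it is a no-op. A minimal sketch of the pattern, assuming the preempt_check_resched_rt() helper this patch adds to preempt.h:

    /*
     * Sketch: raise a softirq with interrupts off, then give RT a chance
     * to reschedule the (threaded) softirq handler once irqs are back on.
     * preempt_check_resched_rt() comes from this patch and is a no-op on
     * !RT kernels.
     */
    #include <linux/interrupt.h>
    #include <linux/preempt.h>

    static void my_kick_block_softirq(void)
    {
            unsigned long flags;

            local_irq_save(flags);
            raise_softirq_irqoff(BLOCK_SOFTIRQ);
            local_irq_restore(flags);
            preempt_check_resched_rt();
    }
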
4271 diff --git a/block/bounce.c b/block/bounce.c
4272 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
4273 --- a/block/bounce.c
4274 +++ b/block/bounce.c
4275 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
4276         unsigned long flags;
4277         unsigned char *vto;
4279 -       local_irq_save(flags);
4280 +       local_irq_save_nort(flags);
4281         vto = kmap_atomic(to->bv_page);
4282         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
4283         kunmap_atomic(vto);
4284 -       local_irq_restore(flags);
4285 +       local_irq_restore_nort(flags);
4288  #else /* CONFIG_HIGHMEM */
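Note: bounce_copy_vec() switches to local_irq_save_nort()/local_irq_restore_nort(); on mainline these behave exactly like local_irq_save()/restore(), while on RT they leave interrupts enabled, since the kmap_atomic() copy does not need them disabled there. A minimal sketch of the same pattern (my_copy_to_page is a hypothetical helper):

    /*
     * Sketch: copy into a (possibly highmem) page using the _nort irq
     * helpers added by this patch.  On !RT this is exactly
     * local_irq_save()/restore(); on RT interrupts stay enabled and the
     * copy remains preemptible.
     */
    #include <linux/highmem.h>
    #include <linux/string.h>

    static void my_copy_to_page(struct page *page, const void *src, size_t len)
    {
            unsigned long flags;
            void *dst;

            local_irq_save_nort(flags);
            dst = kmap_atomic(page);
            memcpy(dst, src, len);
            kunmap_atomic(dst);
            local_irq_restore_nort(flags);
    }
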
4289 diff --git a/crypto/algapi.c b/crypto/algapi.c
4290 index eb58b73ca925..ee228adf2dac 100644
4291 --- a/crypto/algapi.c
4292 +++ b/crypto/algapi.c
4293 @@ -732,13 +732,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
4295  int crypto_register_notifier(struct notifier_block *nb)
4297 -       return blocking_notifier_chain_register(&crypto_chain, nb);
4298 +       return srcu_notifier_chain_register(&crypto_chain, nb);
4300  EXPORT_SYMBOL_GPL(crypto_register_notifier);
4302  int crypto_unregister_notifier(struct notifier_block *nb)
4304 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
4305 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
4307  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
4309 diff --git a/crypto/api.c b/crypto/api.c
4310 index bbc147cb5dec..bc1a848f02ec 100644
4311 --- a/crypto/api.c
4312 +++ b/crypto/api.c
4313 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
4314  DECLARE_RWSEM(crypto_alg_sem);
4315  EXPORT_SYMBOL_GPL(crypto_alg_sem);
4317 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
4318 +SRCU_NOTIFIER_HEAD(crypto_chain);
4319  EXPORT_SYMBOL_GPL(crypto_chain);
4321  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
4322 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
4324         int ok;
4326 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4327 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4328         if (ok == NOTIFY_DONE) {
4329                 request_module("cryptomgr");
4330 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4331 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4332         }
4334         return ok;
4335 diff --git a/crypto/internal.h b/crypto/internal.h
4336 index 00e42a3ed814..2e85551e235f 100644
4337 --- a/crypto/internal.h
4338 +++ b/crypto/internal.h
4339 @@ -47,7 +47,7 @@ struct crypto_larval {
4341  extern struct list_head crypto_alg_list;
4342  extern struct rw_semaphore crypto_alg_sem;
4343 -extern struct blocking_notifier_head crypto_chain;
4344 +extern struct srcu_notifier_head crypto_chain;
4346  #ifdef CONFIG_PROC_FS
4347  void __init crypto_init_proc(void);
4348 @@ -143,7 +143,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
4350  static inline void crypto_notify(unsigned long val, void *v)
4352 -       blocking_notifier_call_chain(&crypto_chain, val, v);
4353 +       srcu_notifier_call_chain(&crypto_chain, val, v);
4356  #endif /* _CRYPTO_INTERNAL_H */
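Note: the crypto notifier chain is converted from a blocking notifier (rwsem-based) to an SRCU notifier chain; registration, unregistration and the call chain keep the same notifier_block interface, only the head type and the chain helpers change. A minimal sketch with a hypothetical chain and callback (my_chain, my_event_cb):

    /*
     * Sketch: declare and use an SRCU notifier chain, mirroring the
     * crypto_chain conversion above.  my_chain and my_event_cb are
     * hypothetical; callers of the call chain may sleep.
     */
    #include <linux/notifier.h>

    SRCU_NOTIFIER_HEAD(my_chain);

    static int my_event_cb(struct notifier_block *nb, unsigned long val, void *data)
    {
            return NOTIFY_OK;
    }

    static struct notifier_block my_nb = {
            .notifier_call = my_event_cb,
    };

    static void my_use_chain(void)
    {
            srcu_notifier_chain_register(&my_chain, &my_nb);
            srcu_notifier_call_chain(&my_chain, 1, NULL);
            srcu_notifier_chain_unregister(&my_chain, &my_nb);
    }
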
4357 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
4358 index faa97604d878..941497f31cf0 100644
4359 --- a/drivers/acpi/acpica/acglobal.h
4360 +++ b/drivers/acpi/acpica/acglobal.h
4361 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
4362   * interrupt level
4363   */
4364  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
4365 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
4366 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
4367  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
4369  /* Mutex for _OSI support */
4370 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
4371 index 3cf77afd142c..dc32e72132f1 100644
4372 --- a/drivers/acpi/acpica/hwregs.c
4373 +++ b/drivers/acpi/acpica/hwregs.c
4374 @@ -269,14 +269,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
4375                           ACPI_BITMASK_ALL_FIXED_STATUS,
4376                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
4378 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4379 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4381         /* Clear the fixed events in PM1 A/B */
4383         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
4384                                         ACPI_BITMASK_ALL_FIXED_STATUS);
4386 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4387 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4389         if (ACPI_FAILURE(status)) {
4390                 goto exit;
4391 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
4392 index b2e50d8007fe..ff007084dc48 100644
4393 --- a/drivers/acpi/acpica/hwxface.c
4394 +++ b/drivers/acpi/acpica/hwxface.c
4395 @@ -374,7 +374,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4396                 return_ACPI_STATUS(AE_BAD_PARAMETER);
4397         }
4399 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4400 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4402         /*
4403          * At this point, we know that the parent register is one of the
4404 @@ -435,7 +435,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4406  unlock_and_exit:
4408 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4409 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4410         return_ACPI_STATUS(status);
4413 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
4414 index ce406e39b669..41a75eb3ae9d 100644
4415 --- a/drivers/acpi/acpica/utmutex.c
4416 +++ b/drivers/acpi/acpica/utmutex.c
4417 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
4418                 return_ACPI_STATUS (status);
4419         }
4421 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
4422 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
4423         if (ACPI_FAILURE (status)) {
4424                 return_ACPI_STATUS (status);
4425         }
4426 @@ -156,7 +156,7 @@ void acpi_ut_mutex_terminate(void)
4427         /* Delete the spinlocks */
4429         acpi_os_delete_lock(acpi_gbl_gpe_lock);
4430 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
4431 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
4432         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
4434         /* Delete the reader/writer lock */
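Note: acpi_gbl_hardware_lock becomes an acpi_raw_spinlock and its users switch to raw_spin_lock_irqsave(); on RT an ordinary spinlock_t turns into a sleeping rtmutex, which is not acceptable for the low-level ACPI register paths, whereas a raw spinlock keeps spinning with preemption disabled. A minimal sketch of a raw-spinlock-protected register shadow (my_hw_lock and my_shadow_reg are hypothetical):

    /*
     * Sketch: protect register-level state with a raw spinlock, as the
     * ACPI hardware lock does after this patch.  raw_spinlock_t stays a
     * real spinning lock on RT.
     */
    #include <linux/spinlock.h>
    #include <linux/types.h>

    static DEFINE_RAW_SPINLOCK(my_hw_lock);
    static u32 my_shadow_reg;

    static void my_write_reg(u32 val)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&my_hw_lock, flags);
            my_shadow_reg = val;            /* stand-in for the MMIO write */
            raw_spin_unlock_irqrestore(&my_hw_lock, flags);
    }
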
4435 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
4436 index 18de4c457068..dfecf14df732 100644
4437 --- a/drivers/ata/libata-sff.c
4438 +++ b/drivers/ata/libata-sff.c
4439 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
4440         unsigned long flags;
4441         unsigned int consumed;
4443 -       local_irq_save(flags);
4444 +       local_irq_save_nort(flags);
4445         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
4446 -       local_irq_restore(flags);
4447 +       local_irq_restore_nort(flags);
4449         return consumed;
4451 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4452                 unsigned long flags;
4454                 /* FIXME: use a bounce buffer */
4455 -               local_irq_save(flags);
4456 +               local_irq_save_nort(flags);
4457                 buf = kmap_atomic(page);
4459                 /* do the actual data transfer */
4460 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4461                                        do_write);
4463                 kunmap_atomic(buf);
4464 -               local_irq_restore(flags);
4465 +               local_irq_restore_nort(flags);
4466         } else {
4467                 buf = page_address(page);
4468                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
4469 @@ -864,7 +864,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
4470                 unsigned long flags;
4472                 /* FIXME: use bounce buffer */
4473 -               local_irq_save(flags);
4474 +               local_irq_save_nort(flags);
4475                 buf = kmap_atomic(page);
4477                 /* do the actual data transfer */
4478 @@ -872,7 +872,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
4479                                                                 count, rw);
4481                 kunmap_atomic(buf);
4482 -               local_irq_restore(flags);
4483 +               local_irq_restore_nort(flags);
4484         } else {
4485                 buf = page_address(page);
4486                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
4487 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
4488 index 502406c9e6e1..91d909148688 100644
4489 --- a/drivers/block/zram/zram_drv.c
4490 +++ b/drivers/block/zram/zram_drv.c
4491 @@ -520,6 +520,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
4492                 goto out_error;
4493         }
4495 +       zram_meta_init_table_locks(meta, disksize);
4497         return meta;
4499  out_error:
4500 @@ -568,12 +570,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4501         unsigned long handle;
4502         size_t size;
4504 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4505 +       zram_lock_table(&meta->table[index]);
4506         handle = meta->table[index].handle;
4507         size = zram_get_obj_size(meta, index);
4509         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
4510 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4511 +               zram_unlock_table(&meta->table[index]);
4512                 memset(mem, 0, PAGE_SIZE);
4513                 return 0;
4514         }
4515 @@ -584,7 +586,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4516         else
4517                 ret = zcomp_decompress(zram->comp, cmem, size, mem);
4518         zs_unmap_object(meta->mem_pool, handle);
4519 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4520 +       zram_unlock_table(&meta->table[index]);
4522         /* Should NEVER happen. Return bio error if it does. */
4523         if (unlikely(ret)) {
4524 @@ -604,14 +606,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
4525         struct zram_meta *meta = zram->meta;
4526         page = bvec->bv_page;
4528 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4529 +       zram_lock_table(&meta->table[index]);
4530         if (unlikely(!meta->table[index].handle) ||
4531                         zram_test_flag(meta, index, ZRAM_ZERO)) {
4532 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4533 +               zram_unlock_table(&meta->table[index]);
4534                 handle_zero_page(bvec);
4535                 return 0;
4536         }
4537 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4538 +       zram_unlock_table(&meta->table[index]);
4540         if (is_partial_io(bvec))
4541                 /* Use  a temporary buffer to decompress the page */
4542 @@ -689,10 +691,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4543                 if (user_mem)
4544                         kunmap_atomic(user_mem);
4545                 /* Free memory associated with this sector now. */
4546 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4547 +               zram_lock_table(&meta->table[index]);
4548                 zram_free_page(zram, index);
4549                 zram_set_flag(meta, index, ZRAM_ZERO);
4550 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4551 +               zram_unlock_table(&meta->table[index]);
4553                 atomic64_inc(&zram->stats.zero_pages);
4554                 ret = 0;
4555 @@ -752,12 +754,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4556          * Free memory associated with this sector
4557          * before overwriting unused sectors.
4558          */
4559 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4560 +       zram_lock_table(&meta->table[index]);
4561         zram_free_page(zram, index);
4563         meta->table[index].handle = handle;
4564         zram_set_obj_size(meta, index, clen);
4565 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4566 +       zram_unlock_table(&meta->table[index]);
4568         /* Update stats */
4569         atomic64_add(clen, &zram->stats.compr_data_size);
4570 @@ -800,9 +802,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
4571         }
4573         while (n >= PAGE_SIZE) {
4574 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4575 +               zram_lock_table(&meta->table[index]);
4576                 zram_free_page(zram, index);
4577 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4578 +               zram_unlock_table(&meta->table[index]);
4579                 atomic64_inc(&zram->stats.notify_free);
4580                 index++;
4581                 n -= PAGE_SIZE;
4582 @@ -928,9 +930,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
4583         zram = bdev->bd_disk->private_data;
4584         meta = zram->meta;
4586 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4587 +       zram_lock_table(&meta->table[index]);
4588         zram_free_page(zram, index);
4589 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4590 +       zram_unlock_table(&meta->table[index]);
4591         atomic64_inc(&zram->stats.notify_free);
4594 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
4595 index 8e92339686d7..9e3e953d680e 100644
4596 --- a/drivers/block/zram/zram_drv.h
4597 +++ b/drivers/block/zram/zram_drv.h
4598 @@ -72,6 +72,9 @@ enum zram_pageflags {
4599  struct zram_table_entry {
4600         unsigned long handle;
4601         unsigned long value;
4602 +#ifdef CONFIG_PREEMPT_RT_BASE
4603 +       spinlock_t lock;
4604 +#endif
4605  };
4607  struct zram_stats {
4608 @@ -119,4 +122,42 @@ struct zram {
4609          */
4610         bool claim; /* Protected by bdev->bd_mutex */
4611  };
4613 +#ifndef CONFIG_PREEMPT_RT_BASE
4614 +static inline void zram_lock_table(struct zram_table_entry *table)
4616 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
4619 +static inline void zram_unlock_table(struct zram_table_entry *table)
4621 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
4624 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
4625 +#else /* CONFIG_PREEMPT_RT_BASE */
4626 +static inline void zram_lock_table(struct zram_table_entry *table)
4628 +       spin_lock(&table->lock);
4629 +       __set_bit(ZRAM_ACCESS, &table->value);
4632 +static inline void zram_unlock_table(struct zram_table_entry *table)
4634 +       __clear_bit(ZRAM_ACCESS, &table->value);
4635 +       spin_unlock(&table->lock);
4638 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
4640 +        size_t num_pages = disksize >> PAGE_SHIFT;
4641 +        size_t index;
4643 +        for (index = 0; index < num_pages; index++) {
4644 +               spinlock_t *lock = &meta->table[index].lock;
4645 +               spin_lock_init(lock);
4646 +        }
4648 +#endif /* CONFIG_PREEMPT_RT_BASE */
4650  #endif
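Note: the zram changes replace bit_spin_lock() on the per-entry value word, which spins with preemption disabled, by a real spinlock_t embedded in each table entry when CONFIG_PREEMPT_RT_BASE is set; zram_lock_table()/zram_unlock_table() hide the difference from callers. A minimal sketch of that dual-path accessor pattern (my_entry and MY_LOCK_BIT are hypothetical stand-ins):

    /*
     * Sketch: one pair of inline helpers selecting bit_spin_lock() on !RT
     * and a per-entry spinlock_t on RT, as zram_lock_table() does above.
     * my_entry and MY_LOCK_BIT are hypothetical; spin_lock_init() for the
     * RT case happens at allocation time, as in zram_meta_init_table_locks().
     */
    #include <linux/bit_spinlock.h>
    #include <linux/bitops.h>
    #include <linux/spinlock.h>

    #define MY_LOCK_BIT     0

    struct my_entry {
            unsigned long flags;
    #ifdef CONFIG_PREEMPT_RT_BASE
            spinlock_t lock;
    #endif
    };

    #ifndef CONFIG_PREEMPT_RT_BASE
    static inline void my_entry_lock(struct my_entry *e)
    {
            bit_spin_lock(MY_LOCK_BIT, &e->flags);
    }

    static inline void my_entry_unlock(struct my_entry *e)
    {
            bit_spin_unlock(MY_LOCK_BIT, &e->flags);
    }
    #else
    static inline void my_entry_lock(struct my_entry *e)
    {
            spin_lock(&e->lock);
            __set_bit(MY_LOCK_BIT, &e->flags);
    }

    static inline void my_entry_unlock(struct my_entry *e)
    {
            __clear_bit(MY_LOCK_BIT, &e->flags);
            spin_unlock(&e->lock);
    }
    #endif
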
4651 diff --git a/drivers/char/random.c b/drivers/char/random.c
4652 index 1822472dffab..46c0e27cf27f 100644
4653 --- a/drivers/char/random.c
4654 +++ b/drivers/char/random.c
4655 @@ -260,6 +260,7 @@
4656  #include <linux/irq.h>
4657  #include <linux/syscalls.h>
4658  #include <linux/completion.h>
4659 +#include <linux/locallock.h>
4661  #include <asm/processor.h>
4662  #include <asm/uaccess.h>
4663 @@ -799,8 +800,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4664         } sample;
4665         long delta, delta2, delta3;
4667 -       preempt_disable();
4669         sample.jiffies = jiffies;
4670         sample.cycles = random_get_entropy();
4671         sample.num = num;
4672 @@ -841,7 +840,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4673                  */
4674                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
4675         }
4676 -       preempt_enable();
4679  void add_input_randomness(unsigned int type, unsigned int code,
4680 @@ -894,28 +892,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
4681         return *(ptr + f->reg_idx++);
4684 -void add_interrupt_randomness(int irq, int irq_flags)
4685 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
4687         struct entropy_store    *r;
4688         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
4689 -       struct pt_regs          *regs = get_irq_regs();
4690         unsigned long           now = jiffies;
4691         cycles_t                cycles = random_get_entropy();
4692         __u32                   c_high, j_high;
4693 -       __u64                   ip;
4694         unsigned long           seed;
4695         int                     credit = 0;
4697         if (cycles == 0)
4698 -               cycles = get_reg(fast_pool, regs);
4699 +               cycles = get_reg(fast_pool, NULL);
4700         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
4701         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
4702         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
4703         fast_pool->pool[1] ^= now ^ c_high;
4704 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
4705 +       if (!ip)
4706 +               ip = _RET_IP_;
4707         fast_pool->pool[2] ^= ip;
4708         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
4709 -               get_reg(fast_pool, regs);
4710 +               get_reg(fast_pool, NULL);
4712         fast_mix(fast_pool);
4713         add_interrupt_bench(cycles);
4714 @@ -1800,6 +1797,7 @@ int random_int_secret_init(void)
4716  static DEFINE_PER_CPU(__u32 [MD5_DIGEST_WORDS], get_random_int_hash)
4717                 __aligned(sizeof(unsigned long));
4718 +static DEFINE_LOCAL_IRQ_LOCK(hash_entropy_int_lock);
4720  /*
4721   * Get a random word for internal kernel use only. Similar to urandom but
4722 @@ -1815,12 +1813,12 @@ unsigned int get_random_int(void)
4723         if (arch_get_random_int(&ret))
4724                 return ret;
4726 -       hash = get_cpu_var(get_random_int_hash);
4727 +       hash = get_locked_var(hash_entropy_int_lock, get_random_int_hash);
4729         hash[0] += current->pid + jiffies + random_get_entropy();
4730         md5_transform(hash, random_int_secret);
4731         ret = hash[0];
4732 -       put_cpu_var(get_random_int_hash);
4733 +       put_locked_var(hash_entropy_int_lock, get_random_int_hash);
4735         return ret;
4737 @@ -1837,12 +1835,12 @@ unsigned long get_random_long(void)
4738         if (arch_get_random_long(&ret))
4739                 return ret;
4741 -       hash = get_cpu_var(get_random_int_hash);
4742 +       hash = get_locked_var(hash_entropy_int_lock, get_random_int_hash);
4744         hash[0] += current->pid + jiffies + random_get_entropy();
4745         md5_transform(hash, random_int_secret);
4746         ret = *(unsigned long *)hash;
4747 -       put_cpu_var(get_random_int_hash);
4748 +       put_locked_var(hash_entropy_int_lock, get_random_int_hash);
4750         return ret;
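Note: get_cpu_var()/put_cpu_var() around get_random_int_hash disable preemption; the patch protects the per-CPU hash with a local lock instead, which is a per-CPU spinlock on RT and plain preempt-disable on !RT, via get_locked_var()/put_locked_var(). A minimal sketch of the same idiom on a hypothetical per-CPU scratch buffer (my_scratch, my_scratch_lock), assuming the locallock.h helpers this patch introduces:

    /*
     * Sketch: serialize access to a per-CPU buffer with a local lock,
     * the pattern applied to get_random_int_hash above.  On RT the
     * section is preemptible but still per-CPU exclusive.
     */
    #include <linux/locallock.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(u32 [4], my_scratch);
    static DEFINE_LOCAL_IRQ_LOCK(my_scratch_lock);

    static u32 my_mix(u32 seed)
    {
            u32 *buf;

            buf = get_locked_var(my_scratch_lock, my_scratch);
            buf[0] += seed;
            seed = buf[0];
            put_locked_var(my_scratch_lock, my_scratch);

            return seed;
    }
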
4752 diff --git a/drivers/clk/at91/clk-generated.c b/drivers/clk/at91/clk-generated.c
4753 index abc80949e1dd..4ad3298eb372 100644
4754 --- a/drivers/clk/at91/clk-generated.c
4755 +++ b/drivers/clk/at91/clk-generated.c
4756 @@ -15,8 +15,8 @@
4757  #include <linux/clkdev.h>
4758  #include <linux/clk/at91_pmc.h>
4759  #include <linux/of.h>
4760 -#include <linux/of_address.h>
4761 -#include <linux/io.h>
4762 +#include <linux/mfd/syscon.h>
4763 +#include <linux/regmap.h>
4765  #include "pmc.h"
4767 @@ -28,8 +28,9 @@
4769  struct clk_generated {
4770         struct clk_hw hw;
4771 -       struct at91_pmc *pmc;
4772 +       struct regmap *regmap;
4773         struct clk_range range;
4774 +       spinlock_t *lock;
4775         u32 id;
4776         u32 gckdiv;
4777         u8 parent_id;
4778 @@ -41,49 +42,52 @@ struct clk_generated {
4779  static int clk_generated_enable(struct clk_hw *hw)
4781         struct clk_generated *gck = to_clk_generated(hw);
4782 -       struct at91_pmc *pmc = gck->pmc;
4783 -       u32 tmp;
4784 +       unsigned long flags;
4786         pr_debug("GCLK: %s, gckdiv = %d, parent id = %d\n",
4787                  __func__, gck->gckdiv, gck->parent_id);
4789 -       pmc_lock(pmc);
4790 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4791 -       tmp = pmc_read(pmc, AT91_PMC_PCR) &
4792 -                       ~(AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK);
4793 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_GCKCSS(gck->parent_id)
4794 -                                        | AT91_PMC_PCR_CMD
4795 -                                        | AT91_PMC_PCR_GCKDIV(gck->gckdiv)
4796 -                                        | AT91_PMC_PCR_GCKEN);
4797 -       pmc_unlock(pmc);
4798 +       spin_lock_irqsave(gck->lock, flags);
4799 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4800 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4801 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4802 +                          AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK |
4803 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4804 +                          AT91_PMC_PCR_GCKCSS(gck->parent_id) |
4805 +                          AT91_PMC_PCR_CMD |
4806 +                          AT91_PMC_PCR_GCKDIV(gck->gckdiv) |
4807 +                          AT91_PMC_PCR_GCKEN);
4808 +       spin_unlock_irqrestore(gck->lock, flags);
4809         return 0;
4812  static void clk_generated_disable(struct clk_hw *hw)
4814         struct clk_generated *gck = to_clk_generated(hw);
4815 -       struct at91_pmc *pmc = gck->pmc;
4816 -       u32 tmp;
4818 -       pmc_lock(pmc);
4819 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4820 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_GCKEN;
4821 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
4822 -       pmc_unlock(pmc);
4823 +       unsigned long flags;
4825 +       spin_lock_irqsave(gck->lock, flags);
4826 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4827 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4828 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4829 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4830 +                          AT91_PMC_PCR_CMD);
4831 +       spin_unlock_irqrestore(gck->lock, flags);
4834  static int clk_generated_is_enabled(struct clk_hw *hw)
4836         struct clk_generated *gck = to_clk_generated(hw);
4837 -       struct at91_pmc *pmc = gck->pmc;
4838 -       int ret;
4839 +       unsigned long flags;
4840 +       unsigned int status;
4842 -       pmc_lock(pmc);
4843 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4844 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_GCKEN);
4845 -       pmc_unlock(pmc);
4846 +       spin_lock_irqsave(gck->lock, flags);
4847 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4848 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4849 +       regmap_read(gck->regmap, AT91_PMC_PCR, &status);
4850 +       spin_unlock_irqrestore(gck->lock, flags);
4852 -       return ret;
4853 +       return status & AT91_PMC_PCR_GCKEN ? 1 : 0;
4856  static unsigned long
4857 @@ -214,13 +218,14 @@ static const struct clk_ops generated_ops = {
4858   */
4859  static void clk_generated_startup(struct clk_generated *gck)
4861 -       struct at91_pmc *pmc = gck->pmc;
4862         u32 tmp;
4863 +       unsigned long flags;
4865 -       pmc_lock(pmc);
4866 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4867 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
4868 -       pmc_unlock(pmc);
4869 +       spin_lock_irqsave(gck->lock, flags);
4870 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4871 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4872 +       regmap_read(gck->regmap, AT91_PMC_PCR, &tmp);
4873 +       spin_unlock_irqrestore(gck->lock, flags);
4875         gck->parent_id = (tmp & AT91_PMC_PCR_GCKCSS_MASK)
4876                                         >> AT91_PMC_PCR_GCKCSS_OFFSET;
4877 @@ -229,8 +234,8 @@ static void clk_generated_startup(struct clk_generated *gck)
4880  static struct clk * __init
4881 -at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4882 -                           const char **parent_names, u8 num_parents,
4883 +at91_clk_register_generated(struct regmap *regmap,  spinlock_t *lock, const char
4884 +                           *name, const char **parent_names, u8 num_parents,
4885                             u8 id, const struct clk_range *range)
4887         struct clk_generated *gck;
4888 @@ -249,7 +254,8 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4890         gck->id = id;
4891         gck->hw.init = &init;
4892 -       gck->pmc = pmc;
4893 +       gck->regmap = regmap;
4894 +       gck->lock = lock;
4895         gck->range = *range;
4897         clk = clk_register(NULL, &gck->hw);
4898 @@ -261,8 +267,7 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4899         return clk;
4902 -void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4903 -                                          struct at91_pmc *pmc)
4904 +void __init of_sama5d2_clk_generated_setup(struct device_node *np)
4906         int num;
4907         u32 id;
4908 @@ -272,6 +277,7 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4909         const char *parent_names[GENERATED_SOURCE_MAX];
4910         struct device_node *gcknp;
4911         struct clk_range range = CLK_RANGE(0, 0);
4912 +       struct regmap *regmap;
4914         num_parents = of_clk_get_parent_count(np);
4915         if (num_parents <= 0 || num_parents > GENERATED_SOURCE_MAX)
4916 @@ -283,6 +289,10 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4917         if (!num || num > PERIPHERAL_MAX)
4918                 return;
4920 +       regmap = syscon_node_to_regmap(of_get_parent(np));
4921 +       if (IS_ERR(regmap))
4922 +               return;
4924         for_each_child_of_node(np, gcknp) {
4925                 if (of_property_read_u32(gcknp, "reg", &id))
4926                         continue;
4927 @@ -296,11 +306,14 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4928                 of_at91_get_clk_range(gcknp, "atmel,clk-output-range",
4929                                       &range);
4931 -               clk = at91_clk_register_generated(pmc, name, parent_names,
4932 -                                                 num_parents, id, &range);
4933 +               clk = at91_clk_register_generated(regmap, &pmc_pcr_lock, name,
4934 +                                                 parent_names, num_parents,
4935 +                                                 id, &range);
4936                 if (IS_ERR(clk))
4937                         continue;
4939                 of_clk_add_provider(gcknp, of_clk_src_simple_get, clk);
4940         }
4942 +CLK_OF_DECLARE(of_sama5d2_clk_generated_setup, "atmel,sama5d2-clk-generated",
4943 +              of_sama5d2_clk_generated_setup);
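Note: clk-generated no longer pokes a private at91_pmc handle; it accesses the PMC through a syscon regmap and serializes read-modify-write of the shared PCR register with an explicit spinlock (pmc_pcr_lock), and the driver now registers itself with CLK_OF_DECLARE() instead of being called from the PMC core. A minimal sketch of the regmap-plus-spinlock access pattern (MY_PCR, MY_PCR_EN and my_regmap_lock are hypothetical; the real offsets live in at91_pmc.h):

    /*
     * Sketch: read-modify-write a shared PMC-style register through a
     * syscon regmap under a driver-level spinlock, as
     * clk_generated_enable() does above.  Register names and offsets
     * here are made up.
     */
    #include <linux/bitops.h>
    #include <linux/regmap.h>
    #include <linux/spinlock.h>

    #define MY_PCR          0x10c
    #define MY_PCR_EN       BIT(28)

    static DEFINE_SPINLOCK(my_regmap_lock);

    static void my_gate_enable(struct regmap *regmap, unsigned int id)
    {
            unsigned long flags;

            spin_lock_irqsave(&my_regmap_lock, flags);
            regmap_write(regmap, MY_PCR, id & 0x3f);
            regmap_update_bits(regmap, MY_PCR, MY_PCR_EN, MY_PCR_EN);
            spin_unlock_irqrestore(&my_regmap_lock, flags);
    }
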
4944 diff --git a/drivers/clk/at91/clk-h32mx.c b/drivers/clk/at91/clk-h32mx.c
4945 index a165230e7eda..8e20c8a76db7 100644
4946 --- a/drivers/clk/at91/clk-h32mx.c
4947 +++ b/drivers/clk/at91/clk-h32mx.c
4948 @@ -15,15 +15,9 @@
4949  #include <linux/clk-provider.h>
4950  #include <linux/clkdev.h>
4951  #include <linux/clk/at91_pmc.h>
4952 -#include <linux/delay.h>
4953  #include <linux/of.h>
4954 -#include <linux/of_address.h>
4955 -#include <linux/of_irq.h>
4956 -#include <linux/io.h>
4957 -#include <linux/interrupt.h>
4958 -#include <linux/irq.h>
4959 -#include <linux/sched.h>
4960 -#include <linux/wait.h>
4961 +#include <linux/regmap.h>
4962 +#include <linux/mfd/syscon.h>
4964  #include "pmc.h"
4966 @@ -31,7 +25,7 @@
4968  struct clk_sama5d4_h32mx {
4969         struct clk_hw hw;
4970 -       struct at91_pmc *pmc;
4971 +       struct regmap *regmap;
4972  };
4974  #define to_clk_sama5d4_h32mx(hw) container_of(hw, struct clk_sama5d4_h32mx, hw)
4975 @@ -40,8 +34,10 @@ static unsigned long clk_sama5d4_h32mx_recalc_rate(struct clk_hw *hw,
4976                                                  unsigned long parent_rate)
4978         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4979 +       unsigned int mckr;
4981 -       if (pmc_read(h32mxclk->pmc, AT91_PMC_MCKR) & AT91_PMC_H32MXDIV)
4982 +       regmap_read(h32mxclk->regmap, AT91_PMC_MCKR, &mckr);
4983 +       if (mckr & AT91_PMC_H32MXDIV)
4984                 return parent_rate / 2;
4986         if (parent_rate > H32MX_MAX_FREQ)
4987 @@ -70,18 +66,16 @@ static int clk_sama5d4_h32mx_set_rate(struct clk_hw *hw, unsigned long rate,
4988                                     unsigned long parent_rate)
4990         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4991 -       struct at91_pmc *pmc = h32mxclk->pmc;
4992 -       u32 tmp;
4993 +       u32 mckr = 0;
4995         if (parent_rate != rate && (parent_rate / 2) != rate)
4996                 return -EINVAL;
4998 -       pmc_lock(pmc);
4999 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_H32MXDIV;
5000         if ((parent_rate / 2) == rate)
5001 -               tmp |= AT91_PMC_H32MXDIV;
5002 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
5003 -       pmc_unlock(pmc);
5004 +               mckr = AT91_PMC_H32MXDIV;
5006 +       regmap_update_bits(h32mxclk->regmap, AT91_PMC_MCKR,
5007 +                          AT91_PMC_H32MXDIV, mckr);
5009         return 0;
5011 @@ -92,14 +86,18 @@ static const struct clk_ops h32mx_ops = {
5012         .set_rate = clk_sama5d4_h32mx_set_rate,
5013  };
5015 -void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
5016 -                                    struct at91_pmc *pmc)
5017 +static void __init of_sama5d4_clk_h32mx_setup(struct device_node *np)
5019         struct clk_sama5d4_h32mx *h32mxclk;
5020         struct clk_init_data init;
5021         const char *parent_name;
5022 +       struct regmap *regmap;
5023         struct clk *clk;
5025 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5026 +       if (IS_ERR(regmap))
5027 +               return;
5029         h32mxclk = kzalloc(sizeof(*h32mxclk), GFP_KERNEL);
5030         if (!h32mxclk)
5031                 return;
5032 @@ -113,7 +111,7 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
5033         init.flags = CLK_SET_RATE_GATE;
5035         h32mxclk->hw.init = &init;
5036 -       h32mxclk->pmc = pmc;
5037 +       h32mxclk->regmap = regmap;
5039         clk = clk_register(NULL, &h32mxclk->hw);
5040         if (IS_ERR(clk)) {
5041 @@ -123,3 +121,5 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
5043         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5045 +CLK_OF_DECLARE(of_sama5d4_clk_h32mx_setup, "atmel,sama5d4-clk-h32mx",
5046 +              of_sama5d4_clk_h32mx_setup);
5047 diff --git a/drivers/clk/at91/clk-main.c b/drivers/clk/at91/clk-main.c
5048 index fd7247deabdc..4bfc94d6c26e 100644
5049 --- a/drivers/clk/at91/clk-main.c
5050 +++ b/drivers/clk/at91/clk-main.c
5051 @@ -13,13 +13,8 @@
5052  #include <linux/clk/at91_pmc.h>
5053  #include <linux/delay.h>
5054  #include <linux/of.h>
5055 -#include <linux/of_address.h>
5056 -#include <linux/of_irq.h>
5057 -#include <linux/io.h>
5058 -#include <linux/interrupt.h>
5059 -#include <linux/irq.h>
5060 -#include <linux/sched.h>
5061 -#include <linux/wait.h>
5062 +#include <linux/mfd/syscon.h>
5063 +#include <linux/regmap.h>
5065  #include "pmc.h"
5067 @@ -34,18 +29,14 @@
5069  struct clk_main_osc {
5070         struct clk_hw hw;
5071 -       struct at91_pmc *pmc;
5072 -       unsigned int irq;
5073 -       wait_queue_head_t wait;
5074 +       struct regmap *regmap;
5075  };
5077  #define to_clk_main_osc(hw) container_of(hw, struct clk_main_osc, hw)
5079  struct clk_main_rc_osc {
5080         struct clk_hw hw;
5081 -       struct at91_pmc *pmc;
5082 -       unsigned int irq;
5083 -       wait_queue_head_t wait;
5084 +       struct regmap *regmap;
5085         unsigned long frequency;
5086         unsigned long accuracy;
5087  };
5088 @@ -54,51 +45,47 @@ struct clk_main_rc_osc {
5090  struct clk_rm9200_main {
5091         struct clk_hw hw;
5092 -       struct at91_pmc *pmc;
5093 +       struct regmap *regmap;
5094  };
5096  #define to_clk_rm9200_main(hw) container_of(hw, struct clk_rm9200_main, hw)
5098  struct clk_sam9x5_main {
5099         struct clk_hw hw;
5100 -       struct at91_pmc *pmc;
5101 -       unsigned int irq;
5102 -       wait_queue_head_t wait;
5103 +       struct regmap *regmap;
5104         u8 parent;
5105  };
5107  #define to_clk_sam9x5_main(hw) container_of(hw, struct clk_sam9x5_main, hw)
5109 -static irqreturn_t clk_main_osc_irq_handler(int irq, void *dev_id)
5110 +static inline bool clk_main_osc_ready(struct regmap *regmap)
5112 -       struct clk_main_osc *osc = dev_id;
5113 +       unsigned int status;
5115 -       wake_up(&osc->wait);
5116 -       disable_irq_nosync(osc->irq);
5117 +       regmap_read(regmap, AT91_PMC_SR, &status);
5119 -       return IRQ_HANDLED;
5120 +       return status & AT91_PMC_MOSCS;
5123  static int clk_main_osc_prepare(struct clk_hw *hw)
5125         struct clk_main_osc *osc = to_clk_main_osc(hw);
5126 -       struct at91_pmc *pmc = osc->pmc;
5127 +       struct regmap *regmap = osc->regmap;
5128         u32 tmp;
5130 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5131 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5132 +       tmp &= ~MOR_KEY_MASK;
5134         if (tmp & AT91_PMC_OSCBYPASS)
5135                 return 0;
5137         if (!(tmp & AT91_PMC_MOSCEN)) {
5138                 tmp |= AT91_PMC_MOSCEN | AT91_PMC_KEY;
5139 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5140 +               regmap_write(regmap, AT91_CKGR_MOR, tmp);
5141         }
5143 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS)) {
5144 -               enable_irq(osc->irq);
5145 -               wait_event(osc->wait,
5146 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS);
5147 -       }
5148 +       while (!clk_main_osc_ready(regmap))
5149 +               cpu_relax();
5151         return 0;
5153 @@ -106,9 +93,10 @@ static int clk_main_osc_prepare(struct clk_hw *hw)
5154  static void clk_main_osc_unprepare(struct clk_hw *hw)
5156         struct clk_main_osc *osc = to_clk_main_osc(hw);
5157 -       struct at91_pmc *pmc = osc->pmc;
5158 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5159 +       struct regmap *regmap = osc->regmap;
5160 +       u32 tmp;
5162 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5163         if (tmp & AT91_PMC_OSCBYPASS)
5164                 return;
5166 @@ -116,20 +104,22 @@ static void clk_main_osc_unprepare(struct clk_hw *hw)
5167                 return;
5169         tmp &= ~(AT91_PMC_KEY | AT91_PMC_MOSCEN);
5170 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5171 +       regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5174  static int clk_main_osc_is_prepared(struct clk_hw *hw)
5176         struct clk_main_osc *osc = to_clk_main_osc(hw);
5177 -       struct at91_pmc *pmc = osc->pmc;
5178 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5179 +       struct regmap *regmap = osc->regmap;
5180 +       u32 tmp, status;
5182 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5183         if (tmp & AT91_PMC_OSCBYPASS)
5184                 return 1;
5186 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS) &&
5187 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN));
5188 +       regmap_read(regmap, AT91_PMC_SR, &status);
5190 +       return (status & AT91_PMC_MOSCS) && (tmp & AT91_PMC_MOSCEN);
5193  static const struct clk_ops main_osc_ops = {
5194 @@ -139,18 +129,16 @@ static const struct clk_ops main_osc_ops = {
5195  };
5197  static struct clk * __init
5198 -at91_clk_register_main_osc(struct at91_pmc *pmc,
5199 -                          unsigned int irq,
5200 +at91_clk_register_main_osc(struct regmap *regmap,
5201                            const char *name,
5202                            const char *parent_name,
5203                            bool bypass)
5205 -       int ret;
5206         struct clk_main_osc *osc;
5207         struct clk *clk = NULL;
5208         struct clk_init_data init;
5210 -       if (!pmc || !irq || !name || !parent_name)
5211 +       if (!name || !parent_name)
5212                 return ERR_PTR(-EINVAL);
5214         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5215 @@ -164,85 +152,70 @@ at91_clk_register_main_osc(struct at91_pmc *pmc,
5216         init.flags = CLK_IGNORE_UNUSED;
5218         osc->hw.init = &init;
5219 -       osc->pmc = pmc;
5220 -       osc->irq = irq;
5222 -       init_waitqueue_head(&osc->wait);
5223 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5224 -       ret = request_irq(osc->irq, clk_main_osc_irq_handler,
5225 -                         IRQF_TRIGGER_HIGH, name, osc);
5226 -       if (ret) {
5227 -               kfree(osc);
5228 -               return ERR_PTR(ret);
5229 -       }
5230 +       osc->regmap = regmap;
5232         if (bypass)
5233 -               pmc_write(pmc, AT91_CKGR_MOR,
5234 -                         (pmc_read(pmc, AT91_CKGR_MOR) &
5235 -                          ~(MOR_KEY_MASK | AT91_PMC_MOSCEN)) |
5236 -                         AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5237 +               regmap_update_bits(regmap,
5238 +                                  AT91_CKGR_MOR, MOR_KEY_MASK |
5239 +                                  AT91_PMC_MOSCEN,
5240 +                                  AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5242         clk = clk_register(NULL, &osc->hw);
5243 -       if (IS_ERR(clk)) {
5244 -               free_irq(irq, osc);
5245 +       if (IS_ERR(clk))
5246                 kfree(osc);
5247 -       }
5249         return clk;
5252 -void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np,
5253 -                                            struct at91_pmc *pmc)
5254 +static void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np)
5256         struct clk *clk;
5257 -       unsigned int irq;
5258         const char *name = np->name;
5259         const char *parent_name;
5260 +       struct regmap *regmap;
5261         bool bypass;
5263         of_property_read_string(np, "clock-output-names", &name);
5264         bypass = of_property_read_bool(np, "atmel,osc-bypass");
5265         parent_name = of_clk_get_parent_name(np, 0);
5267 -       irq = irq_of_parse_and_map(np, 0);
5268 -       if (!irq)
5269 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5270 +       if (IS_ERR(regmap))
5271                 return;
5273 -       clk = at91_clk_register_main_osc(pmc, irq, name, parent_name, bypass);
5274 +       clk = at91_clk_register_main_osc(regmap, name, parent_name, bypass);
5275         if (IS_ERR(clk))
5276                 return;
5278         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5280 +CLK_OF_DECLARE(at91rm9200_clk_main_osc, "atmel,at91rm9200-clk-main-osc",
5281 +              of_at91rm9200_clk_main_osc_setup);
5283 -static irqreturn_t clk_main_rc_osc_irq_handler(int irq, void *dev_id)
5284 +static bool clk_main_rc_osc_ready(struct regmap *regmap)
5286 -       struct clk_main_rc_osc *osc = dev_id;
5287 +       unsigned int status;
5289 -       wake_up(&osc->wait);
5290 -       disable_irq_nosync(osc->irq);
5291 +       regmap_read(regmap, AT91_PMC_SR, &status);
5293 -       return IRQ_HANDLED;
5294 +       return status & AT91_PMC_MOSCRCS;
5297  static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5299         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5300 -       struct at91_pmc *pmc = osc->pmc;
5301 -       u32 tmp;
5302 +       struct regmap *regmap = osc->regmap;
5303 +       unsigned int mor;
5305 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5306 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5308 -       if (!(tmp & AT91_PMC_MOSCRCEN)) {
5309 -               tmp |= AT91_PMC_MOSCRCEN | AT91_PMC_KEY;
5310 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5311 -       }
5312 +       if (!(mor & AT91_PMC_MOSCRCEN))
5313 +               regmap_update_bits(regmap, AT91_CKGR_MOR,
5314 +                                  MOR_KEY_MASK | AT91_PMC_MOSCRCEN,
5315 +                                  AT91_PMC_MOSCRCEN | AT91_PMC_KEY);
5317 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS)) {
5318 -               enable_irq(osc->irq);
5319 -               wait_event(osc->wait,
5320 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS);
5321 -       }
5322 +       while (!clk_main_rc_osc_ready(regmap))
5323 +               cpu_relax();
5325         return 0;
5327 @@ -250,23 +223,28 @@ static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5328  static void clk_main_rc_osc_unprepare(struct clk_hw *hw)
5330         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5331 -       struct at91_pmc *pmc = osc->pmc;
5332 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5333 +       struct regmap *regmap = osc->regmap;
5334 +       unsigned int mor;
5336 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5338 -       if (!(tmp & AT91_PMC_MOSCRCEN))
5339 +       if (!(mor & AT91_PMC_MOSCRCEN))
5340                 return;
5342 -       tmp &= ~(MOR_KEY_MASK | AT91_PMC_MOSCRCEN);
5343 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5344 +       regmap_update_bits(regmap, AT91_CKGR_MOR,
5345 +                          MOR_KEY_MASK | AT91_PMC_MOSCRCEN, AT91_PMC_KEY);
5348  static int clk_main_rc_osc_is_prepared(struct clk_hw *hw)
5350         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5351 -       struct at91_pmc *pmc = osc->pmc;
5352 +       struct regmap *regmap = osc->regmap;
5353 +       unsigned int mor, status;
5355 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS) &&
5356 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCRCEN));
5357 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5358 +       regmap_read(regmap, AT91_PMC_SR, &status);
5360 +       return (mor & AT91_PMC_MOSCRCEN) && (status & AT91_PMC_MOSCRCS);
5363  static unsigned long clk_main_rc_osc_recalc_rate(struct clk_hw *hw,
5364 @@ -294,17 +272,15 @@ static const struct clk_ops main_rc_osc_ops = {
5365  };
5367  static struct clk * __init
5368 -at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5369 -                             unsigned int irq,
5370 +at91_clk_register_main_rc_osc(struct regmap *regmap,
5371                               const char *name,
5372                               u32 frequency, u32 accuracy)
5374 -       int ret;
5375         struct clk_main_rc_osc *osc;
5376         struct clk *clk = NULL;
5377         struct clk_init_data init;
5379 -       if (!pmc || !irq || !name || !frequency)
5380 +       if (!name || !frequency)
5381                 return ERR_PTR(-EINVAL);
5383         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5384 @@ -318,63 +294,53 @@ at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5385         init.flags = CLK_IS_ROOT | CLK_IGNORE_UNUSED;
5387         osc->hw.init = &init;
5388 -       osc->pmc = pmc;
5389 -       osc->irq = irq;
5390 +       osc->regmap = regmap;
5391         osc->frequency = frequency;
5392         osc->accuracy = accuracy;
5394 -       init_waitqueue_head(&osc->wait);
5395 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5396 -       ret = request_irq(osc->irq, clk_main_rc_osc_irq_handler,
5397 -                         IRQF_TRIGGER_HIGH, name, osc);
5398 -       if (ret)
5399 -               return ERR_PTR(ret);
5401         clk = clk_register(NULL, &osc->hw);
5402 -       if (IS_ERR(clk)) {
5403 -               free_irq(irq, osc);
5404 +       if (IS_ERR(clk))
5405                 kfree(osc);
5406 -       }
5408         return clk;
5411 -void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
5412 -                                               struct at91_pmc *pmc)
5413 +static void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np)
5415         struct clk *clk;
5416 -       unsigned int irq;
5417         u32 frequency = 0;
5418         u32 accuracy = 0;
5419         const char *name = np->name;
5420 +       struct regmap *regmap;
5422         of_property_read_string(np, "clock-output-names", &name);
5423         of_property_read_u32(np, "clock-frequency", &frequency);
5424         of_property_read_u32(np, "clock-accuracy", &accuracy);
5426 -       irq = irq_of_parse_and_map(np, 0);
5427 -       if (!irq)
5428 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5429 +       if (IS_ERR(regmap))
5430                 return;
5432 -       clk = at91_clk_register_main_rc_osc(pmc, irq, name, frequency,
5433 -                                           accuracy);
5434 +       clk = at91_clk_register_main_rc_osc(regmap, name, frequency, accuracy);
5435         if (IS_ERR(clk))
5436                 return;
5438         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5440 +CLK_OF_DECLARE(at91sam9x5_clk_main_rc_osc, "atmel,at91sam9x5-clk-main-rc-osc",
5441 +              of_at91sam9x5_clk_main_rc_osc_setup);
5444 -static int clk_main_probe_frequency(struct at91_pmc *pmc)
5445 +static int clk_main_probe_frequency(struct regmap *regmap)
5447         unsigned long prep_time, timeout;
5448 -       u32 tmp;
5449 +       unsigned int mcfr;
5451         timeout = jiffies + usecs_to_jiffies(MAINFRDY_TIMEOUT);
5452         do {
5453                 prep_time = jiffies;
5454 -               tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5455 -               if (tmp & AT91_PMC_MAINRDY)
5456 +               regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5457 +               if (mcfr & AT91_PMC_MAINRDY)
5458                         return 0;
5459                 usleep_range(MAINF_LOOP_MIN_WAIT, MAINF_LOOP_MAX_WAIT);
5460         } while (time_before(prep_time, timeout));
5461 @@ -382,34 +348,37 @@ static int clk_main_probe_frequency(struct at91_pmc *pmc)
5462         return -ETIMEDOUT;
5465 -static unsigned long clk_main_recalc_rate(struct at91_pmc *pmc,
5466 +static unsigned long clk_main_recalc_rate(struct regmap *regmap,
5467                                           unsigned long parent_rate)
5469 -       u32 tmp;
5470 +       unsigned int mcfr;
5472         if (parent_rate)
5473                 return parent_rate;
5475         pr_warn("Main crystal frequency not set, using approximate value\n");
5476 -       tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5477 -       if (!(tmp & AT91_PMC_MAINRDY))
5478 +       regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5479 +       if (!(mcfr & AT91_PMC_MAINRDY))
5480                 return 0;
5482 -       return ((tmp & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5483 +       return ((mcfr & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5486  static int clk_rm9200_main_prepare(struct clk_hw *hw)
5488         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5490 -       return clk_main_probe_frequency(clkmain->pmc);
5491 +       return clk_main_probe_frequency(clkmain->regmap);
5494  static int clk_rm9200_main_is_prepared(struct clk_hw *hw)
5496         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5497 +       unsigned int status;
5499 +       regmap_read(clkmain->regmap, AT91_CKGR_MCFR, &status);
5501 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MCFR) & AT91_PMC_MAINRDY);
5502 +       return status & AT91_PMC_MAINRDY ? 1 : 0;
5505  static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5506 @@ -417,7 +386,7 @@ static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5508         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5510 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5511 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5514  static const struct clk_ops rm9200_main_ops = {
5515 @@ -427,7 +396,7 @@ static const struct clk_ops rm9200_main_ops = {
5516  };
5518  static struct clk * __init
5519 -at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5520 +at91_clk_register_rm9200_main(struct regmap *regmap,
5521                               const char *name,
5522                               const char *parent_name)
5524 @@ -435,7 +404,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5525         struct clk *clk = NULL;
5526         struct clk_init_data init;
5528 -       if (!pmc || !name)
5529 +       if (!name)
5530                 return ERR_PTR(-EINVAL);
5532         if (!parent_name)
5533 @@ -452,7 +421,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5534         init.flags = 0;
5536         clkmain->hw.init = &init;
5537 -       clkmain->pmc = pmc;
5538 +       clkmain->regmap = regmap;
5540         clk = clk_register(NULL, &clkmain->hw);
5541         if (IS_ERR(clk))
5542 @@ -461,52 +430,54 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5543         return clk;
5546 -void __init of_at91rm9200_clk_main_setup(struct device_node *np,
5547 -                                        struct at91_pmc *pmc)
5548 +static void __init of_at91rm9200_clk_main_setup(struct device_node *np)
5550         struct clk *clk;
5551         const char *parent_name;
5552         const char *name = np->name;
5553 +       struct regmap *regmap;
5555         parent_name = of_clk_get_parent_name(np, 0);
5556         of_property_read_string(np, "clock-output-names", &name);
5558 -       clk = at91_clk_register_rm9200_main(pmc, name, parent_name);
5559 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5560 +       if (IS_ERR(regmap))
5561 +               return;
5563 +       clk = at91_clk_register_rm9200_main(regmap, name, parent_name);
5564         if (IS_ERR(clk))
5565                 return;
5567         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5569 +CLK_OF_DECLARE(at91rm9200_clk_main, "atmel,at91rm9200-clk-main",
5570 +              of_at91rm9200_clk_main_setup);
5572 -static irqreturn_t clk_sam9x5_main_irq_handler(int irq, void *dev_id)
5573 +static inline bool clk_sam9x5_main_ready(struct regmap *regmap)
5575 -       struct clk_sam9x5_main *clkmain = dev_id;
5576 +       unsigned int status;
5578 -       wake_up(&clkmain->wait);
5579 -       disable_irq_nosync(clkmain->irq);
5580 +       regmap_read(regmap, AT91_PMC_SR, &status);
5582 -       return IRQ_HANDLED;
5583 +       return status & AT91_PMC_MOSCSELS ? 1 : 0;
5586  static int clk_sam9x5_main_prepare(struct clk_hw *hw)
5588         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5589 -       struct at91_pmc *pmc = clkmain->pmc;
5590 +       struct regmap *regmap = clkmain->regmap;
5592 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5593 -               enable_irq(clkmain->irq);
5594 -               wait_event(clkmain->wait,
5595 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5596 -       }
5597 +       while (!clk_sam9x5_main_ready(regmap))
5598 +               cpu_relax();
5600 -       return clk_main_probe_frequency(pmc);
5601 +       return clk_main_probe_frequency(regmap);
5604  static int clk_sam9x5_main_is_prepared(struct clk_hw *hw)
5606         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5608 -       return !!(pmc_read(clkmain->pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5609 +       return clk_sam9x5_main_ready(clkmain->regmap);
5612  static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5613 @@ -514,30 +485,28 @@ static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5615         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5617 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5618 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5621  static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5623         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5624 -       struct at91_pmc *pmc = clkmain->pmc;
5625 -       u32 tmp;
5626 +       struct regmap *regmap = clkmain->regmap;
5627 +       unsigned int tmp;
5629         if (index > 1)
5630                 return -EINVAL;
5632 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5633 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5634 +       tmp &= ~MOR_KEY_MASK;
5636         if (index && !(tmp & AT91_PMC_MOSCSEL))
5637 -               pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5638 +               regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5639         else if (!index && (tmp & AT91_PMC_MOSCSEL))
5640 -               pmc_write(pmc, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5641 +               regmap_write(regmap, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5643 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5644 -               enable_irq(clkmain->irq);
5645 -               wait_event(clkmain->wait,
5646 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5647 -       }
5648 +       while (!clk_sam9x5_main_ready(regmap))
5649 +               cpu_relax();
5651         return 0;
5653 @@ -545,8 +514,11 @@ static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5654  static u8 clk_sam9x5_main_get_parent(struct clk_hw *hw)
5656         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5657 +       unsigned int status;
5659 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5661 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN);
5662 +       return status & AT91_PMC_MOSCEN ? 1 : 0;
5665  static const struct clk_ops sam9x5_main_ops = {
5666 @@ -558,18 +530,17 @@ static const struct clk_ops sam9x5_main_ops = {
5667  };
5669  static struct clk * __init
5670 -at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5671 -                             unsigned int irq,
5672 +at91_clk_register_sam9x5_main(struct regmap *regmap,
5673                               const char *name,
5674                               const char **parent_names,
5675                               int num_parents)
5677 -       int ret;
5678         struct clk_sam9x5_main *clkmain;
5679         struct clk *clk = NULL;
5680         struct clk_init_data init;
5681 +       unsigned int status;
5683 -       if (!pmc || !irq || !name)
5684 +       if (!name)
5685                 return ERR_PTR(-EINVAL);
5687         if (!parent_names || !num_parents)
5688 @@ -586,51 +557,42 @@ at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5689         init.flags = CLK_SET_PARENT_GATE;
5691         clkmain->hw.init = &init;
5692 -       clkmain->pmc = pmc;
5693 -       clkmain->irq = irq;
5694 -       clkmain->parent = !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) &
5695 -                            AT91_PMC_MOSCEN);
5696 -       init_waitqueue_head(&clkmain->wait);
5697 -       irq_set_status_flags(clkmain->irq, IRQ_NOAUTOEN);
5698 -       ret = request_irq(clkmain->irq, clk_sam9x5_main_irq_handler,
5699 -                         IRQF_TRIGGER_HIGH, name, clkmain);
5700 -       if (ret)
5701 -               return ERR_PTR(ret);
5702 +       clkmain->regmap = regmap;
5703 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5704 +       clkmain->parent = status & AT91_PMC_MOSCEN ? 1 : 0;
5706         clk = clk_register(NULL, &clkmain->hw);
5707 -       if (IS_ERR(clk)) {
5708 -               free_irq(clkmain->irq, clkmain);
5709 +       if (IS_ERR(clk))
5710                 kfree(clkmain);
5711 -       }
5713         return clk;
5716 -void __init of_at91sam9x5_clk_main_setup(struct device_node *np,
5717 -                                        struct at91_pmc *pmc)
5718 +static void __init of_at91sam9x5_clk_main_setup(struct device_node *np)
5720         struct clk *clk;
5721         const char *parent_names[2];
5722         int num_parents;
5723 -       unsigned int irq;
5724         const char *name = np->name;
5725 +       struct regmap *regmap;
5727         num_parents = of_clk_get_parent_count(np);
5728         if (num_parents <= 0 || num_parents > 2)
5729                 return;
5731         of_clk_parent_fill(np, parent_names, num_parents);
5732 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5733 +       if (IS_ERR(regmap))
5734 +               return;
5736         of_property_read_string(np, "clock-output-names", &name);
5738 -       irq = irq_of_parse_and_map(np, 0);
5739 -       if (!irq)
5740 -               return;
5742 -       clk = at91_clk_register_sam9x5_main(pmc, irq, name, parent_names,
5743 +       clk = at91_clk_register_sam9x5_main(regmap, name, parent_names,
5744                                             num_parents);
5745         if (IS_ERR(clk))
5746                 return;
5748         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5750 +CLK_OF_DECLARE(at91sam9x5_clk_main, "atmel,at91sam9x5-clk-main",
5751 +              of_at91sam9x5_clk_main_setup);
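The clk-main hunks above replace the MOSCSELS interrupt handshake with a poll of the PMC status register through the regmap. For reference, a minimal standalone sketch of that polling idiom, reusing the register and bit names the hunks already rely on; the helper names below are illustrative, not part of the patch:

#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>

static bool pmc_status_bit_set(struct regmap *regmap, unsigned int mask)
{
	unsigned int status;

	/* regmap_read() returns the current register value via its third argument. */
	regmap_read(regmap, AT91_PMC_SR, &status);

	return status & mask;
}

static void wait_for_moscsels(struct regmap *regmap)
{
	/* Same busy-wait the set_parent hunk performs after flipping AT91_PMC_MOSCSEL. */
	while (!pmc_status_bit_set(regmap, AT91_PMC_MOSCSELS))
		cpu_relax();
}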
5752 diff --git a/drivers/clk/at91/clk-master.c b/drivers/clk/at91/clk-master.c
5753 index 620ea323356b..7d4a1864ea7c 100644
5754 --- a/drivers/clk/at91/clk-master.c
5755 +++ b/drivers/clk/at91/clk-master.c
5756 @@ -12,13 +12,8 @@
5757  #include <linux/clkdev.h>
5758  #include <linux/clk/at91_pmc.h>
5759  #include <linux/of.h>
5760 -#include <linux/of_address.h>
5761 -#include <linux/of_irq.h>
5762 -#include <linux/io.h>
5763 -#include <linux/wait.h>
5764 -#include <linux/sched.h>
5765 -#include <linux/interrupt.h>
5766 -#include <linux/irq.h>
5767 +#include <linux/mfd/syscon.h>
5768 +#include <linux/regmap.h>
5770  #include "pmc.h"
5772 @@ -44,32 +39,26 @@ struct clk_master_layout {
5774  struct clk_master {
5775         struct clk_hw hw;
5776 -       struct at91_pmc *pmc;
5777 -       unsigned int irq;
5778 -       wait_queue_head_t wait;
5779 +       struct regmap *regmap;
5780         const struct clk_master_layout *layout;
5781         const struct clk_master_characteristics *characteristics;
5782  };
5784 -static irqreturn_t clk_master_irq_handler(int irq, void *dev_id)
5785 +static inline bool clk_master_ready(struct regmap *regmap)
5787 -       struct clk_master *master = (struct clk_master *)dev_id;
5788 +       unsigned int status;
5790 -       wake_up(&master->wait);
5791 -       disable_irq_nosync(master->irq);
5792 +       regmap_read(regmap, AT91_PMC_SR, &status);
5794 -       return IRQ_HANDLED;
5795 +       return status & AT91_PMC_MCKRDY ? 1 : 0;
5798  static int clk_master_prepare(struct clk_hw *hw)
5800         struct clk_master *master = to_clk_master(hw);
5801 -       struct at91_pmc *pmc = master->pmc;
5803 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY)) {
5804 -               enable_irq(master->irq);
5805 -               wait_event(master->wait,
5806 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5807 -       }
5808 +       while (!clk_master_ready(master->regmap))
5809 +               cpu_relax();
5811         return 0;
5813 @@ -78,7 +67,7 @@ static int clk_master_is_prepared(struct clk_hw *hw)
5815         struct clk_master *master = to_clk_master(hw);
5817 -       return !!(pmc_read(master->pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5818 +       return clk_master_ready(master->regmap);
5821  static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5822 @@ -88,18 +77,16 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5823         u8 div;
5824         unsigned long rate = parent_rate;
5825         struct clk_master *master = to_clk_master(hw);
5826 -       struct at91_pmc *pmc = master->pmc;
5827         const struct clk_master_layout *layout = master->layout;
5828         const struct clk_master_characteristics *characteristics =
5829                                                 master->characteristics;
5830 -       u32 tmp;
5831 +       unsigned int mckr;
5833 -       pmc_lock(pmc);
5834 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & layout->mask;
5835 -       pmc_unlock(pmc);
5836 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5837 +       mckr &= layout->mask;
5839 -       pres = (tmp >> layout->pres_shift) & MASTER_PRES_MASK;
5840 -       div = (tmp >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5841 +       pres = (mckr >> layout->pres_shift) & MASTER_PRES_MASK;
5842 +       div = (mckr >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5844         if (characteristics->have_div3_pres && pres == MASTER_PRES_MAX)
5845                 rate /= 3;
5846 @@ -119,9 +106,11 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5847  static u8 clk_master_get_parent(struct clk_hw *hw)
5849         struct clk_master *master = to_clk_master(hw);
5850 -       struct at91_pmc *pmc = master->pmc;
5851 +       unsigned int mckr;
5853 -       return pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_CSS;
5854 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5856 +       return mckr & AT91_PMC_CSS;
5859  static const struct clk_ops master_ops = {
5860 @@ -132,18 +121,17 @@ static const struct clk_ops master_ops = {
5861  };
5863  static struct clk * __init
5864 -at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5865 +at91_clk_register_master(struct regmap *regmap,
5866                 const char *name, int num_parents,
5867                 const char **parent_names,
5868                 const struct clk_master_layout *layout,
5869                 const struct clk_master_characteristics *characteristics)
5871 -       int ret;
5872         struct clk_master *master;
5873         struct clk *clk = NULL;
5874         struct clk_init_data init;
5876 -       if (!pmc || !irq || !name || !num_parents || !parent_names)
5877 +       if (!name || !num_parents || !parent_names)
5878                 return ERR_PTR(-EINVAL);
5880         master = kzalloc(sizeof(*master), GFP_KERNEL);
5881 @@ -159,20 +147,10 @@ at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5882         master->hw.init = &init;
5883         master->layout = layout;
5884         master->characteristics = characteristics;
5885 -       master->pmc = pmc;
5886 -       master->irq = irq;
5887 -       init_waitqueue_head(&master->wait);
5888 -       irq_set_status_flags(master->irq, IRQ_NOAUTOEN);
5889 -       ret = request_irq(master->irq, clk_master_irq_handler,
5890 -                         IRQF_TRIGGER_HIGH, "clk-master", master);
5891 -       if (ret) {
5892 -               kfree(master);
5893 -               return ERR_PTR(ret);
5894 -       }
5895 +       master->regmap = regmap;
5897         clk = clk_register(NULL, &master->hw);
5898         if (IS_ERR(clk)) {
5899 -               free_irq(master->irq, master);
5900                 kfree(master);
5901         }
5903 @@ -217,15 +195,15 @@ of_at91_clk_master_get_characteristics(struct device_node *np)
5906  static void __init
5907 -of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5908 +of_at91_clk_master_setup(struct device_node *np,
5909                          const struct clk_master_layout *layout)
5911         struct clk *clk;
5912         int num_parents;
5913 -       unsigned int irq;
5914         const char *parent_names[MASTER_SOURCE_MAX];
5915         const char *name = np->name;
5916         struct clk_master_characteristics *characteristics;
5917 +       struct regmap *regmap;
5919         num_parents = of_clk_get_parent_count(np);
5920         if (num_parents <= 0 || num_parents > MASTER_SOURCE_MAX)
5921 @@ -239,11 +217,11 @@ of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5922         if (!characteristics)
5923                 return;
5925 -       irq = irq_of_parse_and_map(np, 0);
5926 -       if (!irq)
5927 -               goto out_free_characteristics;
5928 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5929 +       if (IS_ERR(regmap))
5930 +               return;
5932 -       clk = at91_clk_register_master(pmc, irq, name, num_parents,
5933 +       clk = at91_clk_register_master(regmap, name, num_parents,
5934                                        parent_names, layout,
5935                                        characteristics);
5936         if (IS_ERR(clk))
5937 @@ -256,14 +234,16 @@ of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5938         kfree(characteristics);
5941 -void __init of_at91rm9200_clk_master_setup(struct device_node *np,
5942 -                                          struct at91_pmc *pmc)
5943 +static void __init of_at91rm9200_clk_master_setup(struct device_node *np)
5945 -       of_at91_clk_master_setup(np, pmc, &at91rm9200_master_layout);
5946 +       of_at91_clk_master_setup(np, &at91rm9200_master_layout);
5948 +CLK_OF_DECLARE(at91rm9200_clk_master, "atmel,at91rm9200-clk-master",
5949 +              of_at91rm9200_clk_master_setup);
5951 -void __init of_at91sam9x5_clk_master_setup(struct device_node *np,
5952 -                                          struct at91_pmc *pmc)
5953 +static void __init of_at91sam9x5_clk_master_setup(struct device_node *np)
5955 -       of_at91_clk_master_setup(np, pmc, &at91sam9x5_master_layout);
5956 +       of_at91_clk_master_setup(np, &at91sam9x5_master_layout);
5958 +CLK_OF_DECLARE(at91sam9x5_clk_master, "atmel,at91sam9x5-clk-master",
5959 +              of_at91sam9x5_clk_master_setup);
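Alongside the register-access change, each driver in this series drops its exported of_*_setup(np, pmc) entry point in favour of a CLK_OF_DECLARE() hook that fetches the regmap from the parent syscon node. A stripped-down sketch of that registration shape; the function name and compatible string here are placeholders, not values introduced by the patch:

#include <linux/clk-provider.h>
#include <linux/err.h>
#include <linux/mfd/syscon.h>
#include <linux/of.h>
#include <linux/regmap.h>

static void __init example_clk_setup(struct device_node *np)
{
	struct regmap *regmap;

	/* The PMC is now modelled as a syscon and the clock nodes sit under it
	 * in the device tree, so the regmap is looked up via the parent node. */
	regmap = syscon_node_to_regmap(of_get_parent(np));
	if (IS_ERR(regmap))
		return;

	/* ...register the clk against this regmap and add the OF provider... */
}
CLK_OF_DECLARE(example_clk, "vendor,example-clk", example_clk_setup);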
5960 diff --git a/drivers/clk/at91/clk-peripheral.c b/drivers/clk/at91/clk-peripheral.c
5961 index 58f3b568e9cb..d69cd2a121b1 100644
5962 --- a/drivers/clk/at91/clk-peripheral.c
5963 +++ b/drivers/clk/at91/clk-peripheral.c
5964 @@ -12,11 +12,13 @@
5965  #include <linux/clkdev.h>
5966  #include <linux/clk/at91_pmc.h>
5967  #include <linux/of.h>
5968 -#include <linux/of_address.h>
5969 -#include <linux/io.h>
5970 +#include <linux/mfd/syscon.h>
5971 +#include <linux/regmap.h>
5973  #include "pmc.h"
5975 +DEFINE_SPINLOCK(pmc_pcr_lock);
5977  #define PERIPHERAL_MAX         64
5979  #define PERIPHERAL_AT91RM9200  0
5980 @@ -33,7 +35,7 @@
5982  struct clk_peripheral {
5983         struct clk_hw hw;
5984 -       struct at91_pmc *pmc;
5985 +       struct regmap *regmap;
5986         u32 id;
5987  };
5989 @@ -41,8 +43,9 @@ struct clk_peripheral {
5991  struct clk_sam9x5_peripheral {
5992         struct clk_hw hw;
5993 -       struct at91_pmc *pmc;
5994 +       struct regmap *regmap;
5995         struct clk_range range;
5996 +       spinlock_t *lock;
5997         u32 id;
5998         u32 div;
5999         bool auto_div;
6000 @@ -54,7 +57,6 @@ struct clk_sam9x5_peripheral {
6001  static int clk_peripheral_enable(struct clk_hw *hw)
6003         struct clk_peripheral *periph = to_clk_peripheral(hw);
6004 -       struct at91_pmc *pmc = periph->pmc;
6005         int offset = AT91_PMC_PCER;
6006         u32 id = periph->id;
6008 @@ -62,14 +64,14 @@ static int clk_peripheral_enable(struct clk_hw *hw)
6009                 return 0;
6010         if (id > PERIPHERAL_ID_MAX)
6011                 offset = AT91_PMC_PCER1;
6012 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
6013 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
6015         return 0;
6018  static void clk_peripheral_disable(struct clk_hw *hw)
6020         struct clk_peripheral *periph = to_clk_peripheral(hw);
6021 -       struct at91_pmc *pmc = periph->pmc;
6022         int offset = AT91_PMC_PCDR;
6023         u32 id = periph->id;
6025 @@ -77,21 +79,23 @@ static void clk_peripheral_disable(struct clk_hw *hw)
6026                 return;
6027         if (id > PERIPHERAL_ID_MAX)
6028                 offset = AT91_PMC_PCDR1;
6029 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
6030 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
6033  static int clk_peripheral_is_enabled(struct clk_hw *hw)
6035         struct clk_peripheral *periph = to_clk_peripheral(hw);
6036 -       struct at91_pmc *pmc = periph->pmc;
6037         int offset = AT91_PMC_PCSR;
6038 +       unsigned int status;
6039         u32 id = periph->id;
6041         if (id < PERIPHERAL_ID_MIN)
6042                 return 1;
6043         if (id > PERIPHERAL_ID_MAX)
6044                 offset = AT91_PMC_PCSR1;
6045 -       return !!(pmc_read(pmc, offset) & PERIPHERAL_MASK(id));
6046 +       regmap_read(periph->regmap, offset, &status);
6048 +       return status & PERIPHERAL_MASK(id) ? 1 : 0;
6051  static const struct clk_ops peripheral_ops = {
6052 @@ -101,14 +105,14 @@ static const struct clk_ops peripheral_ops = {
6053  };
6055  static struct clk * __init
6056 -at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
6057 +at91_clk_register_peripheral(struct regmap *regmap, const char *name,
6058                              const char *parent_name, u32 id)
6060         struct clk_peripheral *periph;
6061         struct clk *clk = NULL;
6062         struct clk_init_data init;
6064 -       if (!pmc || !name || !parent_name || id > PERIPHERAL_ID_MAX)
6065 +       if (!name || !parent_name || id > PERIPHERAL_ID_MAX)
6066                 return ERR_PTR(-EINVAL);
6068         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
6069 @@ -123,7 +127,7 @@ at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
6071         periph->id = id;
6072         periph->hw.init = &init;
6073 -       periph->pmc = pmc;
6074 +       periph->regmap = regmap;
6076         clk = clk_register(NULL, &periph->hw);
6077         if (IS_ERR(clk))
6078 @@ -160,53 +164,58 @@ static void clk_sam9x5_peripheral_autodiv(struct clk_sam9x5_peripheral *periph)
6079  static int clk_sam9x5_peripheral_enable(struct clk_hw *hw)
6081         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6082 -       struct at91_pmc *pmc = periph->pmc;
6083 -       u32 tmp;
6084 +       unsigned long flags;
6086         if (periph->id < PERIPHERAL_ID_MIN)
6087                 return 0;
6089 -       pmc_lock(pmc);
6090 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6091 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_DIV_MASK;
6092 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_DIV(periph->div)
6093 -                                        | AT91_PMC_PCR_CMD
6094 -                                        | AT91_PMC_PCR_EN);
6095 -       pmc_unlock(pmc);
6096 +       spin_lock_irqsave(periph->lock, flags);
6097 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6098 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6099 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
6100 +                          AT91_PMC_PCR_DIV_MASK | AT91_PMC_PCR_CMD |
6101 +                          AT91_PMC_PCR_EN,
6102 +                          AT91_PMC_PCR_DIV(periph->div) |
6103 +                          AT91_PMC_PCR_CMD |
6104 +                          AT91_PMC_PCR_EN);
6105 +       spin_unlock_irqrestore(periph->lock, flags);
6107         return 0;
6110  static void clk_sam9x5_peripheral_disable(struct clk_hw *hw)
6112         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6113 -       struct at91_pmc *pmc = periph->pmc;
6114 -       u32 tmp;
6115 +       unsigned long flags;
6117         if (periph->id < PERIPHERAL_ID_MIN)
6118                 return;
6120 -       pmc_lock(pmc);
6121 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6122 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_EN;
6123 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
6124 -       pmc_unlock(pmc);
6125 +       spin_lock_irqsave(periph->lock, flags);
6126 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6127 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6128 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
6129 +                          AT91_PMC_PCR_EN | AT91_PMC_PCR_CMD,
6130 +                          AT91_PMC_PCR_CMD);
6131 +       spin_unlock_irqrestore(periph->lock, flags);
6134  static int clk_sam9x5_peripheral_is_enabled(struct clk_hw *hw)
6136         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6137 -       struct at91_pmc *pmc = periph->pmc;
6138 -       int ret;
6139 +       unsigned long flags;
6140 +       unsigned int status;
6142         if (periph->id < PERIPHERAL_ID_MIN)
6143                 return 1;
6145 -       pmc_lock(pmc);
6146 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6147 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_EN);
6148 -       pmc_unlock(pmc);
6149 +       spin_lock_irqsave(periph->lock, flags);
6150 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6151 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6152 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6153 +       spin_unlock_irqrestore(periph->lock, flags);
6155 -       return ret;
6156 +       return status & AT91_PMC_PCR_EN ? 1 : 0;
6159  static unsigned long
6160 @@ -214,19 +223,20 @@ clk_sam9x5_peripheral_recalc_rate(struct clk_hw *hw,
6161                                   unsigned long parent_rate)
6163         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6164 -       struct at91_pmc *pmc = periph->pmc;
6165 -       u32 tmp;
6166 +       unsigned long flags;
6167 +       unsigned int status;
6169         if (periph->id < PERIPHERAL_ID_MIN)
6170                 return parent_rate;
6172 -       pmc_lock(pmc);
6173 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6174 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
6175 -       pmc_unlock(pmc);
6176 +       spin_lock_irqsave(periph->lock, flags);
6177 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6178 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6179 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6180 +       spin_unlock_irqrestore(periph->lock, flags);
6182 -       if (tmp & AT91_PMC_PCR_EN) {
6183 -               periph->div = PERIPHERAL_RSHIFT(tmp);
6184 +       if (status & AT91_PMC_PCR_EN) {
6185 +               periph->div = PERIPHERAL_RSHIFT(status);
6186                 periph->auto_div = false;
6187         } else {
6188                 clk_sam9x5_peripheral_autodiv(periph);
6189 @@ -318,15 +328,15 @@ static const struct clk_ops sam9x5_peripheral_ops = {
6190  };
6192  static struct clk * __init
6193 -at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6194 -                                   const char *parent_name, u32 id,
6195 -                                   const struct clk_range *range)
6196 +at91_clk_register_sam9x5_peripheral(struct regmap *regmap, spinlock_t *lock,
6197 +                                   const char *name, const char *parent_name,
6198 +                                   u32 id, const struct clk_range *range)
6200         struct clk_sam9x5_peripheral *periph;
6201         struct clk *clk = NULL;
6202         struct clk_init_data init;
6204 -       if (!pmc || !name || !parent_name)
6205 +       if (!name || !parent_name)
6206                 return ERR_PTR(-EINVAL);
6208         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
6209 @@ -342,7 +352,8 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6210         periph->id = id;
6211         periph->hw.init = &init;
6212         periph->div = 0;
6213 -       periph->pmc = pmc;
6214 +       periph->regmap = regmap;
6215 +       periph->lock = lock;
6216         periph->auto_div = true;
6217         periph->range = *range;
6219 @@ -356,7 +367,7 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6222  static void __init
6223 -of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6224 +of_at91_clk_periph_setup(struct device_node *np, u8 type)
6226         int num;
6227         u32 id;
6228 @@ -364,6 +375,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6229         const char *parent_name;
6230         const char *name;
6231         struct device_node *periphclknp;
6232 +       struct regmap *regmap;
6234         parent_name = of_clk_get_parent_name(np, 0);
6235         if (!parent_name)
6236 @@ -373,6 +385,10 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6237         if (!num || num > PERIPHERAL_MAX)
6238                 return;
6240 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6241 +       if (IS_ERR(regmap))
6242 +               return;
6244         for_each_child_of_node(np, periphclknp) {
6245                 if (of_property_read_u32(periphclknp, "reg", &id))
6246                         continue;
6247 @@ -384,7 +400,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6248                         name = periphclknp->name;
6250                 if (type == PERIPHERAL_AT91RM9200) {
6251 -                       clk = at91_clk_register_peripheral(pmc, name,
6252 +                       clk = at91_clk_register_peripheral(regmap, name,
6253                                                            parent_name, id);
6254                 } else {
6255                         struct clk_range range = CLK_RANGE(0, 0);
6256 @@ -393,7 +409,9 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6257                                               "atmel,clk-output-range",
6258                                               &range);
6260 -                       clk = at91_clk_register_sam9x5_peripheral(pmc, name,
6261 +                       clk = at91_clk_register_sam9x5_peripheral(regmap,
6262 +                                                                 &pmc_pcr_lock,
6263 +                                                                 name,
6264                                                                   parent_name,
6265                                                                   id, &range);
6266                 }
6267 @@ -405,14 +423,16 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6268         }
6271 -void __init of_at91rm9200_clk_periph_setup(struct device_node *np,
6272 -                                          struct at91_pmc *pmc)
6273 +static void __init of_at91rm9200_clk_periph_setup(struct device_node *np)
6275 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91RM9200);
6276 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91RM9200);
6278 +CLK_OF_DECLARE(at91rm9200_clk_periph, "atmel,at91rm9200-clk-peripheral",
6279 +              of_at91rm9200_clk_periph_setup);
6281 -void __init of_at91sam9x5_clk_periph_setup(struct device_node *np,
6282 -                                          struct at91_pmc *pmc)
6283 +static void __init of_at91sam9x5_clk_periph_setup(struct device_node *np)
6285 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91SAM9X5);
6286 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91SAM9X5);
6288 +CLK_OF_DECLARE(at91sam9x5_clk_periph, "atmel,at91sam9x5-clk-peripheral",
6289 +              of_at91sam9x5_clk_periph_setup);
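The sam9x5 peripheral hunks serialize PCR accesses with the new pmc_pcr_lock because AT91_PMC_PCR is a windowed register: the PID written first selects which peripheral the following update targets, so the select-then-update sequence must not be interleaved with other PCR users. A compact sketch of that sequence, reusing the PCR field macros the hunks above already use; the function name is illustrative:

#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>
#include <linux/spinlock.h>

static void example_periph_enable(struct regmap *regmap, spinlock_t *lock,
				  u32 id, u32 div)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	/* Select the peripheral ID, then write back divider + command + enable. */
	regmap_write(regmap, AT91_PMC_PCR, id & AT91_PMC_PCR_PID_MASK);
	regmap_update_bits(regmap, AT91_PMC_PCR,
			   AT91_PMC_PCR_DIV_MASK | AT91_PMC_PCR_CMD |
			   AT91_PMC_PCR_EN,
			   AT91_PMC_PCR_DIV(div) | AT91_PMC_PCR_CMD |
			   AT91_PMC_PCR_EN);
	spin_unlock_irqrestore(lock, flags);
}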
6290 diff --git a/drivers/clk/at91/clk-pll.c b/drivers/clk/at91/clk-pll.c
6291 index 18b60f4895a6..fb2e0b56d4b7 100644
6292 --- a/drivers/clk/at91/clk-pll.c
6293 +++ b/drivers/clk/at91/clk-pll.c
6294 @@ -12,14 +12,8 @@
6295  #include <linux/clkdev.h>
6296  #include <linux/clk/at91_pmc.h>
6297  #include <linux/of.h>
6298 -#include <linux/of_address.h>
6299 -#include <linux/of_irq.h>
6300 -#include <linux/io.h>
6301 -#include <linux/kernel.h>
6302 -#include <linux/wait.h>
6303 -#include <linux/sched.h>
6304 -#include <linux/interrupt.h>
6305 -#include <linux/irq.h>
6306 +#include <linux/mfd/syscon.h>
6307 +#include <linux/regmap.h>
6309  #include "pmc.h"
6311 @@ -58,9 +52,7 @@ struct clk_pll_layout {
6313  struct clk_pll {
6314         struct clk_hw hw;
6315 -       struct at91_pmc *pmc;
6316 -       unsigned int irq;
6317 -       wait_queue_head_t wait;
6318 +       struct regmap *regmap;
6319         u8 id;
6320         u8 div;
6321         u8 range;
6322 @@ -69,20 +61,19 @@ struct clk_pll {
6323         const struct clk_pll_characteristics *characteristics;
6324  };
6326 -static irqreturn_t clk_pll_irq_handler(int irq, void *dev_id)
6327 +static inline bool clk_pll_ready(struct regmap *regmap, int id)
6329 -       struct clk_pll *pll = (struct clk_pll *)dev_id;
6330 +       unsigned int status;
6332 -       wake_up(&pll->wait);
6333 -       disable_irq_nosync(pll->irq);
6334 +       regmap_read(regmap, AT91_PMC_SR, &status);
6336 -       return IRQ_HANDLED;
6337 +       return status & PLL_STATUS_MASK(id) ? 1 : 0;
6340  static int clk_pll_prepare(struct clk_hw *hw)
6342         struct clk_pll *pll = to_clk_pll(hw);
6343 -       struct at91_pmc *pmc = pll->pmc;
6344 +       struct regmap *regmap = pll->regmap;
6345         const struct clk_pll_layout *layout = pll->layout;
6346         const struct clk_pll_characteristics *characteristics =
6347                                                         pll->characteristics;
6348 @@ -90,39 +81,34 @@ static int clk_pll_prepare(struct clk_hw *hw)
6349         u32 mask = PLL_STATUS_MASK(id);
6350         int offset = PLL_REG(id);
6351         u8 out = 0;
6352 -       u32 pllr, icpr;
6353 +       unsigned int pllr;
6354 +       unsigned int status;
6355         u8 div;
6356         u16 mul;
6358 -       pllr = pmc_read(pmc, offset);
6359 +       regmap_read(regmap, offset, &pllr);
6360         div = PLL_DIV(pllr);
6361         mul = PLL_MUL(pllr, layout);
6363 -       if ((pmc_read(pmc, AT91_PMC_SR) & mask) &&
6364 +       regmap_read(regmap, AT91_PMC_SR, &status);
6365 +       if ((status & mask) &&
6366             (div == pll->div && mul == pll->mul))
6367                 return 0;
6369         if (characteristics->out)
6370                 out = characteristics->out[pll->range];
6371 -       if (characteristics->icpll) {
6372 -               icpr = pmc_read(pmc, AT91_PMC_PLLICPR) & ~PLL_ICPR_MASK(id);
6373 -               icpr |= (characteristics->icpll[pll->range] <<
6374 -                       PLL_ICPR_SHIFT(id));
6375 -               pmc_write(pmc, AT91_PMC_PLLICPR, icpr);
6376 -       }
6378 -       pllr &= ~layout->pllr_mask;
6379 -       pllr |= layout->pllr_mask &
6380 -              (pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6381 -               (out << PLL_OUT_SHIFT) |
6382 -               ((pll->mul & layout->mul_mask) << layout->mul_shift));
6383 -       pmc_write(pmc, offset, pllr);
6385 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
6386 -               enable_irq(pll->irq);
6387 -               wait_event(pll->wait,
6388 -                          pmc_read(pmc, AT91_PMC_SR) & mask);
6389 -       }
6390 +       if (characteristics->icpll)
6391 +               regmap_update_bits(regmap, AT91_PMC_PLLICPR, PLL_ICPR_MASK(id),
6392 +                       characteristics->icpll[pll->range] << PLL_ICPR_SHIFT(id));
6394 +       regmap_update_bits(regmap, offset, layout->pllr_mask,
6395 +                       pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6396 +                       (out << PLL_OUT_SHIFT) |
6397 +                       ((pll->mul & layout->mul_mask) << layout->mul_shift));
6399 +       while (!clk_pll_ready(regmap, pll->id))
6400 +               cpu_relax();
6402         return 0;
6404 @@ -130,32 +116,35 @@ static int clk_pll_prepare(struct clk_hw *hw)
6405  static int clk_pll_is_prepared(struct clk_hw *hw)
6407         struct clk_pll *pll = to_clk_pll(hw);
6408 -       struct at91_pmc *pmc = pll->pmc;
6410 -       return !!(pmc_read(pmc, AT91_PMC_SR) &
6411 -                 PLL_STATUS_MASK(pll->id));
6412 +       return clk_pll_ready(pll->regmap, pll->id);
6415  static void clk_pll_unprepare(struct clk_hw *hw)
6417         struct clk_pll *pll = to_clk_pll(hw);
6418 -       struct at91_pmc *pmc = pll->pmc;
6419 -       const struct clk_pll_layout *layout = pll->layout;
6420 -       int offset = PLL_REG(pll->id);
6421 -       u32 tmp = pmc_read(pmc, offset) & ~(layout->pllr_mask);
6422 +       unsigned int mask = pll->layout->pllr_mask;
6424 -       pmc_write(pmc, offset, tmp);
6425 +       regmap_update_bits(pll->regmap, PLL_REG(pll->id), mask, ~mask);
6428  static unsigned long clk_pll_recalc_rate(struct clk_hw *hw,
6429                                          unsigned long parent_rate)
6431         struct clk_pll *pll = to_clk_pll(hw);
6432 +       unsigned int pllr;
6433 +       u16 mul;
6434 +       u8 div;
6436 -       if (!pll->div || !pll->mul)
6437 +       regmap_read(pll->regmap, PLL_REG(pll->id), &pllr);
6439 +       div = PLL_DIV(pllr);
6440 +       mul = PLL_MUL(pllr, pll->layout);
6442 +       if (!div || !mul)
6443                 return 0;
6445 -       return (parent_rate / pll->div) * (pll->mul + 1);
6446 +       return (parent_rate / div) * (mul + 1);
6449  static long clk_pll_get_best_div_mul(struct clk_pll *pll, unsigned long rate,
6450 @@ -308,7 +297,7 @@ static const struct clk_ops pll_ops = {
6451  };
6453  static struct clk * __init
6454 -at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6455 +at91_clk_register_pll(struct regmap *regmap, const char *name,
6456                       const char *parent_name, u8 id,
6457                       const struct clk_pll_layout *layout,
6458                       const struct clk_pll_characteristics *characteristics)
6459 @@ -316,9 +305,8 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6460         struct clk_pll *pll;
6461         struct clk *clk = NULL;
6462         struct clk_init_data init;
6463 -       int ret;
6464         int offset = PLL_REG(id);
6465 -       u32 tmp;
6466 +       unsigned int pllr;
6468         if (id > PLL_MAX_ID)
6469                 return ERR_PTR(-EINVAL);
6470 @@ -337,23 +325,13 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6471         pll->hw.init = &init;
6472         pll->layout = layout;
6473         pll->characteristics = characteristics;
6474 -       pll->pmc = pmc;
6475 -       pll->irq = irq;
6476 -       tmp = pmc_read(pmc, offset) & layout->pllr_mask;
6477 -       pll->div = PLL_DIV(tmp);
6478 -       pll->mul = PLL_MUL(tmp, layout);
6479 -       init_waitqueue_head(&pll->wait);
6480 -       irq_set_status_flags(pll->irq, IRQ_NOAUTOEN);
6481 -       ret = request_irq(pll->irq, clk_pll_irq_handler, IRQF_TRIGGER_HIGH,
6482 -                         id ? "clk-pllb" : "clk-plla", pll);
6483 -       if (ret) {
6484 -               kfree(pll);
6485 -               return ERR_PTR(ret);
6486 -       }
6487 +       pll->regmap = regmap;
6488 +       regmap_read(regmap, offset, &pllr);
6489 +       pll->div = PLL_DIV(pllr);
6490 +       pll->mul = PLL_MUL(pllr, layout);
6492         clk = clk_register(NULL, &pll->hw);
6493         if (IS_ERR(clk)) {
6494 -               free_irq(pll->irq, pll);
6495                 kfree(pll);
6496         }
6498 @@ -483,12 +461,12 @@ of_at91_clk_pll_get_characteristics(struct device_node *np)
6501  static void __init
6502 -of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6503 +of_at91_clk_pll_setup(struct device_node *np,
6504                       const struct clk_pll_layout *layout)
6506         u32 id;
6507 -       unsigned int irq;
6508         struct clk *clk;
6509 +       struct regmap *regmap;
6510         const char *parent_name;
6511         const char *name = np->name;
6512         struct clk_pll_characteristics *characteristics;
6513 @@ -500,15 +478,15 @@ of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6515         of_property_read_string(np, "clock-output-names", &name);
6517 -       characteristics = of_at91_clk_pll_get_characteristics(np);
6518 -       if (!characteristics)
6519 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6520 +       if (IS_ERR(regmap))
6521                 return;
6523 -       irq = irq_of_parse_and_map(np, 0);
6524 -       if (!irq)
6525 +       characteristics = of_at91_clk_pll_get_characteristics(np);
6526 +       if (!characteristics)
6527                 return;
6529 -       clk = at91_clk_register_pll(pmc, irq, name, parent_name, id, layout,
6530 +       clk = at91_clk_register_pll(regmap, name, parent_name, id, layout,
6531                                     characteristics);
6532         if (IS_ERR(clk))
6533                 goto out_free_characteristics;
6534 @@ -520,26 +498,30 @@ of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6535         kfree(characteristics);
6538 -void __init of_at91rm9200_clk_pll_setup(struct device_node *np,
6539 -                                              struct at91_pmc *pmc)
6540 +static void __init of_at91rm9200_clk_pll_setup(struct device_node *np)
6542 -       of_at91_clk_pll_setup(np, pmc, &at91rm9200_pll_layout);
6543 +       of_at91_clk_pll_setup(np, &at91rm9200_pll_layout);
6545 +CLK_OF_DECLARE(at91rm9200_clk_pll, "atmel,at91rm9200-clk-pll",
6546 +              of_at91rm9200_clk_pll_setup);
6548 -void __init of_at91sam9g45_clk_pll_setup(struct device_node *np,
6549 -                                               struct at91_pmc *pmc)
6550 +static void __init of_at91sam9g45_clk_pll_setup(struct device_node *np)
6552 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g45_pll_layout);
6553 +       of_at91_clk_pll_setup(np, &at91sam9g45_pll_layout);
6555 +CLK_OF_DECLARE(at91sam9g45_clk_pll, "atmel,at91sam9g45-clk-pll",
6556 +              of_at91sam9g45_clk_pll_setup);
6558 -void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np,
6559 -                                                struct at91_pmc *pmc)
6560 +static void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np)
6562 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g20_pllb_layout);
6563 +       of_at91_clk_pll_setup(np, &at91sam9g20_pllb_layout);
6565 +CLK_OF_DECLARE(at91sam9g20_clk_pllb, "atmel,at91sam9g20-clk-pllb",
6566 +              of_at91sam9g20_clk_pllb_setup);
6568 -void __init of_sama5d3_clk_pll_setup(struct device_node *np,
6569 -                                           struct at91_pmc *pmc)
6570 +static void __init of_sama5d3_clk_pll_setup(struct device_node *np)
6572 -       of_at91_clk_pll_setup(np, pmc, &sama5d3_pll_layout);
6573 +       of_at91_clk_pll_setup(np, &sama5d3_pll_layout);
6575 +CLK_OF_DECLARE(sama5d3_clk_pll, "atmel,sama5d3-clk-pll",
6576 +              of_sama5d3_clk_pll_setup);
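The PLL recalc_rate hunk now derives the rate from the live PLLR contents instead of the cached div/mul pair. A worked sketch of that arithmetic with purely illustrative numbers:

/* With parent_rate = 12 MHz and DIV = 3, MUL = 199 read back from PLLR,
 * the output is (12000000 / 3) * (199 + 1) = 800 MHz. */
static unsigned long example_pll_rate(unsigned long parent_rate,
				      unsigned int div, unsigned int mul)
{
	if (!div || !mul)
		return 0;

	return (parent_rate / div) * (mul + 1);
}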
6577 diff --git a/drivers/clk/at91/clk-plldiv.c b/drivers/clk/at91/clk-plldiv.c
6578 index ea226562bb40..2bed26481027 100644
6579 --- a/drivers/clk/at91/clk-plldiv.c
6580 +++ b/drivers/clk/at91/clk-plldiv.c
6581 @@ -12,8 +12,8 @@
6582  #include <linux/clkdev.h>
6583  #include <linux/clk/at91_pmc.h>
6584  #include <linux/of.h>
6585 -#include <linux/of_address.h>
6586 -#include <linux/io.h>
6587 +#include <linux/mfd/syscon.h>
6588 +#include <linux/regmap.h>
6590  #include "pmc.h"
6592 @@ -21,16 +21,18 @@
6594  struct clk_plldiv {
6595         struct clk_hw hw;
6596 -       struct at91_pmc *pmc;
6597 +       struct regmap *regmap;
6598  };
6600  static unsigned long clk_plldiv_recalc_rate(struct clk_hw *hw,
6601                                             unsigned long parent_rate)
6603         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6604 -       struct at91_pmc *pmc = plldiv->pmc;
6605 +       unsigned int mckr;
6607 -       if (pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_PLLADIV2)
6608 +       regmap_read(plldiv->regmap, AT91_PMC_MCKR, &mckr);
6610 +       if (mckr & AT91_PMC_PLLADIV2)
6611                 return parent_rate / 2;
6613         return parent_rate;
6614 @@ -57,18 +59,12 @@ static int clk_plldiv_set_rate(struct clk_hw *hw, unsigned long rate,
6615                                unsigned long parent_rate)
6617         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6618 -       struct at91_pmc *pmc = plldiv->pmc;
6619 -       u32 tmp;
6621 -       if (parent_rate != rate && (parent_rate / 2) != rate)
6622 +       if ((parent_rate != rate) && (parent_rate / 2 != rate))
6623                 return -EINVAL;
6625 -       pmc_lock(pmc);
6626 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_PLLADIV2;
6627 -       if ((parent_rate / 2) == rate)
6628 -               tmp |= AT91_PMC_PLLADIV2;
6629 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
6630 -       pmc_unlock(pmc);
6631 +       regmap_update_bits(plldiv->regmap, AT91_PMC_MCKR, AT91_PMC_PLLADIV2,
6632 +                          parent_rate != rate ? AT91_PMC_PLLADIV2 : 0);
6634         return 0;
6636 @@ -80,7 +76,7 @@ static const struct clk_ops plldiv_ops = {
6637  };
6639  static struct clk * __init
6640 -at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6641 +at91_clk_register_plldiv(struct regmap *regmap, const char *name,
6642                          const char *parent_name)
6644         struct clk_plldiv *plldiv;
6645 @@ -98,7 +94,7 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6646         init.flags = CLK_SET_RATE_GATE;
6648         plldiv->hw.init = &init;
6649 -       plldiv->pmc = pmc;
6650 +       plldiv->regmap = regmap;
6652         clk = clk_register(NULL, &plldiv->hw);
6654 @@ -109,27 +105,27 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6657  static void __init
6658 -of_at91_clk_plldiv_setup(struct device_node *np, struct at91_pmc *pmc)
6659 +of_at91sam9x5_clk_plldiv_setup(struct device_node *np)
6661         struct clk *clk;
6662         const char *parent_name;
6663         const char *name = np->name;
6664 +       struct regmap *regmap;
6666         parent_name = of_clk_get_parent_name(np, 0);
6668         of_property_read_string(np, "clock-output-names", &name);
6670 -       clk = at91_clk_register_plldiv(pmc, name, parent_name);
6671 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6672 +       if (IS_ERR(regmap))
6673 +               return;
6675 +       clk = at91_clk_register_plldiv(regmap, name, parent_name);
6676         if (IS_ERR(clk))
6677                 return;
6679         of_clk_add_provider(np, of_clk_src_simple_get, clk);
6680         return;
6683 -void __init of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
6684 -                                          struct at91_pmc *pmc)
6686 -       of_at91_clk_plldiv_setup(np, pmc);
6688 +CLK_OF_DECLARE(at91sam9x5_clk_plldiv, "atmel,at91sam9x5-clk-plldiv",
6689 +              of_at91sam9x5_clk_plldiv_setup);
6690 diff --git a/drivers/clk/at91/clk-programmable.c b/drivers/clk/at91/clk-programmable.c
6691 index 14b270b85fec..bc0be629671b 100644
6692 --- a/drivers/clk/at91/clk-programmable.c
6693 +++ b/drivers/clk/at91/clk-programmable.c
6694 @@ -12,10 +12,8 @@
6695  #include <linux/clkdev.h>
6696  #include <linux/clk/at91_pmc.h>
6697  #include <linux/of.h>
6698 -#include <linux/of_address.h>
6699 -#include <linux/io.h>
6700 -#include <linux/wait.h>
6701 -#include <linux/sched.h>
6702 +#include <linux/mfd/syscon.h>
6703 +#include <linux/regmap.h>
6705  #include "pmc.h"
6707 @@ -24,6 +22,7 @@
6709  #define PROG_STATUS_MASK(id)   (1 << ((id) + 8))
6710  #define PROG_PRES_MASK         0x7
6711 +#define PROG_PRES(layout, pckr)        ((pckr >> layout->pres_shift) & PROG_PRES_MASK)
6712  #define PROG_MAX_RM9200_CSS    3
6714  struct clk_programmable_layout {
6715 @@ -34,7 +33,7 @@ struct clk_programmable_layout {
6717  struct clk_programmable {
6718         struct clk_hw hw;
6719 -       struct at91_pmc *pmc;
6720 +       struct regmap *regmap;
6721         u8 id;
6722         const struct clk_programmable_layout *layout;
6723  };
6724 @@ -44,14 +43,12 @@ struct clk_programmable {
6725  static unsigned long clk_programmable_recalc_rate(struct clk_hw *hw,
6726                                                   unsigned long parent_rate)
6728 -       u32 pres;
6729         struct clk_programmable *prog = to_clk_programmable(hw);
6730 -       struct at91_pmc *pmc = prog->pmc;
6731 -       const struct clk_programmable_layout *layout = prog->layout;
6732 +       unsigned int pckr;
6734 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6736 -       pres = (pmc_read(pmc, AT91_PMC_PCKR(prog->id)) >> layout->pres_shift) &
6737 -              PROG_PRES_MASK;
6738 -       return parent_rate >> pres;
6739 +       return parent_rate >> PROG_PRES(prog->layout, pckr);
6742  static int clk_programmable_determine_rate(struct clk_hw *hw,
6743 @@ -101,36 +98,36 @@ static int clk_programmable_set_parent(struct clk_hw *hw, u8 index)
6745         struct clk_programmable *prog = to_clk_programmable(hw);
6746         const struct clk_programmable_layout *layout = prog->layout;
6747 -       struct at91_pmc *pmc = prog->pmc;
6748 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) & ~layout->css_mask;
6749 +       unsigned int mask = layout->css_mask;
6750 +       unsigned int pckr = 0;
6752         if (layout->have_slck_mck)
6753 -               tmp &= AT91_PMC_CSSMCK_MCK;
6754 +               mask |= AT91_PMC_CSSMCK_MCK;
6756         if (index > layout->css_mask) {
6757 -               if (index > PROG_MAX_RM9200_CSS && layout->have_slck_mck) {
6758 -                       tmp |= AT91_PMC_CSSMCK_MCK;
6759 -                       return 0;
6760 -               } else {
6761 +               if (index > PROG_MAX_RM9200_CSS && !layout->have_slck_mck)
6762                         return -EINVAL;
6763 -               }
6765 +               pckr |= AT91_PMC_CSSMCK_MCK;
6766         }
6768 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id), tmp | index);
6769 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id), mask, pckr);
6771         return 0;
6774  static u8 clk_programmable_get_parent(struct clk_hw *hw)
6776 -       u32 tmp;
6777 -       u8 ret;
6778         struct clk_programmable *prog = to_clk_programmable(hw);
6779 -       struct at91_pmc *pmc = prog->pmc;
6780         const struct clk_programmable_layout *layout = prog->layout;
6781 +       unsigned int pckr;
6782 +       u8 ret;
6784 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6786 +       ret = pckr & layout->css_mask;
6788 -       tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id));
6789 -       ret = tmp & layout->css_mask;
6790 -       if (layout->have_slck_mck && (tmp & AT91_PMC_CSSMCK_MCK) && !ret)
6791 +       if (layout->have_slck_mck && (pckr & AT91_PMC_CSSMCK_MCK) && !ret)
6792                 ret = PROG_MAX_RM9200_CSS + 1;
6794         return ret;
6795 @@ -140,26 +137,27 @@ static int clk_programmable_set_rate(struct clk_hw *hw, unsigned long rate,
6796                                      unsigned long parent_rate)
6798         struct clk_programmable *prog = to_clk_programmable(hw);
6799 -       struct at91_pmc *pmc = prog->pmc;
6800         const struct clk_programmable_layout *layout = prog->layout;
6801         unsigned long div = parent_rate / rate;
6802 +       unsigned int pckr;
6803         int shift = 0;
6804 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) &
6805 -                 ~(PROG_PRES_MASK << layout->pres_shift);
6807 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6809         if (!div)
6810                 return -EINVAL;
6812         shift = fls(div) - 1;
6814 -       if (div != (1<<shift))
6815 +       if (div != (1 << shift))
6816                 return -EINVAL;
6818         if (shift >= PROG_PRES_MASK)
6819                 return -EINVAL;
6821 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id),
6822 -                 tmp | (shift << layout->pres_shift));
6823 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id),
6824 +                          PROG_PRES_MASK << layout->pres_shift,
6825 +                          shift << layout->pres_shift);
6827         return 0;
6829 @@ -173,7 +171,7 @@ static const struct clk_ops programmable_ops = {
6830  };
6832  static struct clk * __init
6833 -at91_clk_register_programmable(struct at91_pmc *pmc,
6834 +at91_clk_register_programmable(struct regmap *regmap,
6835                                const char *name, const char **parent_names,
6836                                u8 num_parents, u8 id,
6837                                const struct clk_programmable_layout *layout)
6838 @@ -198,7 +196,7 @@ at91_clk_register_programmable(struct at91_pmc *pmc,
6839         prog->id = id;
6840         prog->layout = layout;
6841         prog->hw.init = &init;
6842 -       prog->pmc = pmc;
6843 +       prog->regmap = regmap;
6845         clk = clk_register(NULL, &prog->hw);
6846         if (IS_ERR(clk))
6847 @@ -226,7 +224,7 @@ static const struct clk_programmable_layout at91sam9x5_programmable_layout = {
6848  };
6850  static void __init
6851 -of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6852 +of_at91_clk_prog_setup(struct device_node *np,
6853                        const struct clk_programmable_layout *layout)
6855         int num;
6856 @@ -236,6 +234,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6857         const char *parent_names[PROG_SOURCE_MAX];
6858         const char *name;
6859         struct device_node *progclknp;
6860 +       struct regmap *regmap;
6862         num_parents = of_clk_get_parent_count(np);
6863         if (num_parents <= 0 || num_parents > PROG_SOURCE_MAX)
6864 @@ -247,6 +246,10 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6865         if (!num || num > (PROG_ID_MAX + 1))
6866                 return;
6868 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6869 +       if (IS_ERR(regmap))
6870 +               return;
6872         for_each_child_of_node(np, progclknp) {
6873                 if (of_property_read_u32(progclknp, "reg", &id))
6874                         continue;
6875 @@ -254,7 +257,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6876                 if (of_property_read_string(np, "clock-output-names", &name))
6877                         name = progclknp->name;
6879 -               clk = at91_clk_register_programmable(pmc, name,
6880 +               clk = at91_clk_register_programmable(regmap, name,
6881                                                      parent_names, num_parents,
6882                                                      id, layout);
6883                 if (IS_ERR(clk))
6884 @@ -265,20 +268,23 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6888 -void __init of_at91rm9200_clk_prog_setup(struct device_node *np,
6889 -                                        struct at91_pmc *pmc)
6890 +static void __init of_at91rm9200_clk_prog_setup(struct device_node *np)
6892 -       of_at91_clk_prog_setup(np, pmc, &at91rm9200_programmable_layout);
6893 +       of_at91_clk_prog_setup(np, &at91rm9200_programmable_layout);
6895 +CLK_OF_DECLARE(at91rm9200_clk_prog, "atmel,at91rm9200-clk-programmable",
6896 +              of_at91rm9200_clk_prog_setup);
6898 -void __init of_at91sam9g45_clk_prog_setup(struct device_node *np,
6899 -                                         struct at91_pmc *pmc)
6900 +static void __init of_at91sam9g45_clk_prog_setup(struct device_node *np)
6902 -       of_at91_clk_prog_setup(np, pmc, &at91sam9g45_programmable_layout);
6903 +       of_at91_clk_prog_setup(np, &at91sam9g45_programmable_layout);
6905 +CLK_OF_DECLARE(at91sam9g45_clk_prog, "atmel,at91sam9g45-clk-programmable",
6906 +              of_at91sam9g45_clk_prog_setup);
6908 -void __init of_at91sam9x5_clk_prog_setup(struct device_node *np,
6909 -                                        struct at91_pmc *pmc)
6910 +static void __init of_at91sam9x5_clk_prog_setup(struct device_node *np)
6912 -       of_at91_clk_prog_setup(np, pmc, &at91sam9x5_programmable_layout);
6913 +       of_at91_clk_prog_setup(np, &at91sam9x5_programmable_layout);
6915 +CLK_OF_DECLARE(at91sam9x5_clk_prog, "atmel,at91sam9x5-clk-programmable",
6916 +              of_at91sam9x5_clk_prog_setup);
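The programmable-clock set_rate hunk keeps the existing power-of-two prescaler check and now touches only the PRES field through regmap_update_bits(). A small sketch of the divider-to-prescaler calculation; the helper name is illustrative and PROG_PRES_MASK mirrors the definition visible in the hunks above:

#include <linux/bitops.h>
#include <linux/errno.h>

#define PROG_PRES_MASK	0x7	/* as in clk-programmable.c */

/* Returns the prescaler exponent to program at layout->pres_shift, or
 * -EINVAL if parent_rate/rate is not an exact power of two in range. */
static int example_prog_pres(unsigned long parent_rate, unsigned long rate)
{
	unsigned long div;
	int shift;

	if (!rate)
		return -EINVAL;

	div = parent_rate / rate;
	if (!div)
		return -EINVAL;

	shift = fls(div) - 1;
	if (div != (1UL << shift) || shift >= PROG_PRES_MASK)
		return -EINVAL;

	return shift;
}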
6917 diff --git a/drivers/clk/at91/clk-slow.c b/drivers/clk/at91/clk-slow.c
6918 index d0d5076a9b94..221c09684ba3 100644
6919 --- a/drivers/clk/at91/clk-slow.c
6920 +++ b/drivers/clk/at91/clk-slow.c
6921 @@ -13,17 +13,11 @@
6922  #include <linux/clk.h>
6923  #include <linux/clk-provider.h>
6924  #include <linux/clkdev.h>
6925 -#include <linux/slab.h>
6926  #include <linux/clk/at91_pmc.h>
6927  #include <linux/delay.h>
6928  #include <linux/of.h>
6929 -#include <linux/of_address.h>
6930 -#include <linux/of_irq.h>
6931 -#include <linux/io.h>
6932 -#include <linux/interrupt.h>
6933 -#include <linux/irq.h>
6934 -#include <linux/sched.h>
6935 -#include <linux/wait.h>
6936 +#include <linux/mfd/syscon.h>
6937 +#include <linux/regmap.h>
6939  #include "pmc.h"
6940  #include "sckc.h"
6941 @@ -59,7 +53,7 @@ struct clk_slow_rc_osc {
6943  struct clk_sam9260_slow {
6944         struct clk_hw hw;
6945 -       struct at91_pmc *pmc;
6946 +       struct regmap *regmap;
6947  };
6949  #define to_clk_sam9260_slow(hw) container_of(hw, struct clk_sam9260_slow, hw)
6950 @@ -393,8 +387,11 @@ void __init of_at91sam9x5_clk_slow_setup(struct device_node *np,
6951  static u8 clk_sam9260_slow_get_parent(struct clk_hw *hw)
6953         struct clk_sam9260_slow *slowck = to_clk_sam9260_slow(hw);
6954 +       unsigned int status;
6956 -       return !!(pmc_read(slowck->pmc, AT91_PMC_SR) & AT91_PMC_OSCSEL);
6957 +       regmap_read(slowck->regmap, AT91_PMC_SR, &status);
6959 +       return status & AT91_PMC_OSCSEL ? 1 : 0;
6962  static const struct clk_ops sam9260_slow_ops = {
6963 @@ -402,7 +399,7 @@ static const struct clk_ops sam9260_slow_ops = {
6964  };
6966  static struct clk * __init
6967 -at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6968 +at91_clk_register_sam9260_slow(struct regmap *regmap,
6969                                const char *name,
6970                                const char **parent_names,
6971                                int num_parents)
6972 @@ -411,7 +408,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6973         struct clk *clk = NULL;
6974         struct clk_init_data init;
6976 -       if (!pmc || !name)
6977 +       if (!name)
6978                 return ERR_PTR(-EINVAL);
6980         if (!parent_names || !num_parents)
6981 @@ -428,7 +425,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6982         init.flags = 0;
6984         slowck->hw.init = &init;
6985 -       slowck->pmc = pmc;
6986 +       slowck->regmap = regmap;
6988         clk = clk_register(NULL, &slowck->hw);
6989         if (IS_ERR(clk))
6990 @@ -439,29 +436,34 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6991         return clk;
6994 -void __init of_at91sam9260_clk_slow_setup(struct device_node *np,
6995 -                                         struct at91_pmc *pmc)
6996 +static void __init of_at91sam9260_clk_slow_setup(struct device_node *np)
6998         struct clk *clk;
6999         const char *parent_names[2];
7000         int num_parents;
7001         const char *name = np->name;
7002 +       struct regmap *regmap;
7004         num_parents = of_clk_get_parent_count(np);
7005         if (num_parents != 2)
7006                 return;
7008         of_clk_parent_fill(np, parent_names, num_parents);
7009 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7010 +       if (IS_ERR(regmap))
7011 +               return;
7013         of_property_read_string(np, "clock-output-names", &name);
7015 -       clk = at91_clk_register_sam9260_slow(pmc, name, parent_names,
7016 +       clk = at91_clk_register_sam9260_slow(regmap, name, parent_names,
7017                                              num_parents);
7018         if (IS_ERR(clk))
7019                 return;
7021         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7023 +CLK_OF_DECLARE(at91sam9260_clk_slow, "atmel,at91sam9260-clk-slow",
7024 +              of_at91sam9260_clk_slow_setup);
7026  /*
7027   * FIXME: All slow clk users are not properly claiming it (get + prepare +
7028 diff --git a/drivers/clk/at91/clk-smd.c b/drivers/clk/at91/clk-smd.c
7029 index a7f8501cfa05..e6948a52005a 100644
7030 --- a/drivers/clk/at91/clk-smd.c
7031 +++ b/drivers/clk/at91/clk-smd.c
7032 @@ -12,8 +12,8 @@
7033  #include <linux/clkdev.h>
7034  #include <linux/clk/at91_pmc.h>
7035  #include <linux/of.h>
7036 -#include <linux/of_address.h>
7037 -#include <linux/io.h>
7038 +#include <linux/mfd/syscon.h>
7039 +#include <linux/regmap.h>
7041  #include "pmc.h"
7043 @@ -24,7 +24,7 @@
7045  struct at91sam9x5_clk_smd {
7046         struct clk_hw hw;
7047 -       struct at91_pmc *pmc;
7048 +       struct regmap *regmap;
7049  };
7051  #define to_at91sam9x5_clk_smd(hw) \
7052 @@ -33,13 +33,13 @@ struct at91sam9x5_clk_smd {
7053  static unsigned long at91sam9x5_clk_smd_recalc_rate(struct clk_hw *hw,
7054                                                     unsigned long parent_rate)
7056 -       u32 tmp;
7057 -       u8 smddiv;
7058         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7059 -       struct at91_pmc *pmc = smd->pmc;
7060 +       unsigned int smdr;
7061 +       u8 smddiv;
7063 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
7064 +       smddiv = (smdr & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
7066 -       tmp = pmc_read(pmc, AT91_PMC_SMD);
7067 -       smddiv = (tmp & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
7068         return parent_rate / (smddiv + 1);
7071 @@ -67,40 +67,38 @@ static long at91sam9x5_clk_smd_round_rate(struct clk_hw *hw, unsigned long rate,
7073  static int at91sam9x5_clk_smd_set_parent(struct clk_hw *hw, u8 index)
7075 -       u32 tmp;
7076         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7077 -       struct at91_pmc *pmc = smd->pmc;
7079         if (index > 1)
7080                 return -EINVAL;
7081 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMDS;
7082 -       if (index)
7083 -               tmp |= AT91_PMC_SMDS;
7084 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
7086 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMDS,
7087 +                          index ? AT91_PMC_SMDS : 0);
7089         return 0;
7092  static u8 at91sam9x5_clk_smd_get_parent(struct clk_hw *hw)
7094         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7095 -       struct at91_pmc *pmc = smd->pmc;
7096 +       unsigned int smdr;
7098 -       return pmc_read(pmc, AT91_PMC_SMD) & AT91_PMC_SMDS;
7099 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
7101 +       return smdr & AT91_PMC_SMDS;
7104  static int at91sam9x5_clk_smd_set_rate(struct clk_hw *hw, unsigned long rate,
7105                                        unsigned long parent_rate)
7107 -       u32 tmp;
7108         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7109 -       struct at91_pmc *pmc = smd->pmc;
7110         unsigned long div = parent_rate / rate;
7112         if (parent_rate % rate || div < 1 || div > (SMD_MAX_DIV + 1))
7113                 return -EINVAL;
7114 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMD_DIV;
7115 -       tmp |= (div - 1) << SMD_DIV_SHIFT;
7116 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
7118 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMD_DIV,
7119 +                          (div - 1) << SMD_DIV_SHIFT);
7121         return 0;
7123 @@ -114,7 +112,7 @@ static const struct clk_ops at91sam9x5_smd_ops = {
7124  };
7126  static struct clk * __init
7127 -at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7128 +at91sam9x5_clk_register_smd(struct regmap *regmap, const char *name,
7129                             const char **parent_names, u8 num_parents)
7131         struct at91sam9x5_clk_smd *smd;
7132 @@ -132,7 +130,7 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7133         init.flags = CLK_SET_RATE_GATE | CLK_SET_PARENT_GATE;
7135         smd->hw.init = &init;
7136 -       smd->pmc = pmc;
7137 +       smd->regmap = regmap;
7139         clk = clk_register(NULL, &smd->hw);
7140         if (IS_ERR(clk))
7141 @@ -141,13 +139,13 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7142         return clk;
7145 -void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7146 -                                       struct at91_pmc *pmc)
7147 +static void __init of_at91sam9x5_clk_smd_setup(struct device_node *np)
7149         struct clk *clk;
7150         int num_parents;
7151         const char *parent_names[SMD_SOURCE_MAX];
7152         const char *name = np->name;
7153 +       struct regmap *regmap;
7155         num_parents = of_clk_get_parent_count(np);
7156         if (num_parents <= 0 || num_parents > SMD_SOURCE_MAX)
7157 @@ -157,10 +155,16 @@ void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7159         of_property_read_string(np, "clock-output-names", &name);
7161 -       clk = at91sam9x5_clk_register_smd(pmc, name, parent_names,
7162 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7163 +       if (IS_ERR(regmap))
7164 +               return;
7166 +       clk = at91sam9x5_clk_register_smd(regmap, name, parent_names,
7167                                           num_parents);
7168         if (IS_ERR(clk))
7169                 return;
7171         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7173 +CLK_OF_DECLARE(at91sam9x5_clk_smd, "atmel,at91sam9x5-clk-smd",
7174 +              of_at91sam9x5_clk_smd_setup);
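
The clk-smd hunks above replace the driver's private at91_pmc MMIO accessors (pmc_read/pmc_write) with accesses through a syscon-backed regmap. As a rough sketch of the resulting read path only — the EXAMPLE_* register offset and field names below are made up, not taken from the patch:

#include <linux/bits.h>
#include <linux/regmap.h>

#define EXAMPLE_REG		0x3c
#define EXAMPLE_DIV_MASK	GENMASK(11, 8)
#define EXAMPLE_DIV_SHIFT	8

static unsigned long example_recalc_rate(struct regmap *regmap,
					 unsigned long parent_rate)
{
	unsigned int val;
	u8 div;

	/* regmap_read() replaces readl() on a privately mapped PMC base */
	regmap_read(regmap, EXAMPLE_REG, &val);
	div = (val & EXAMPLE_DIV_MASK) >> EXAMPLE_DIV_SHIFT;

	return parent_rate / (div + 1);
}

The same recalc/get_parent/set_rate pattern repeats in the other converted clock drivers below.
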
7175 diff --git a/drivers/clk/at91/clk-system.c b/drivers/clk/at91/clk-system.c
7176 index 3f5314344286..8f35d8172909 100644
7177 --- a/drivers/clk/at91/clk-system.c
7178 +++ b/drivers/clk/at91/clk-system.c
7179 @@ -12,13 +12,8 @@
7180  #include <linux/clkdev.h>
7181  #include <linux/clk/at91_pmc.h>
7182  #include <linux/of.h>
7183 -#include <linux/of_address.h>
7184 -#include <linux/io.h>
7185 -#include <linux/irq.h>
7186 -#include <linux/of_irq.h>
7187 -#include <linux/interrupt.h>
7188 -#include <linux/wait.h>
7189 -#include <linux/sched.h>
7190 +#include <linux/mfd/syscon.h>
7191 +#include <linux/regmap.h>
7193  #include "pmc.h"
7195 @@ -29,9 +24,7 @@
7196  #define to_clk_system(hw) container_of(hw, struct clk_system, hw)
7197  struct clk_system {
7198         struct clk_hw hw;
7199 -       struct at91_pmc *pmc;
7200 -       unsigned int irq;
7201 -       wait_queue_head_t wait;
7202 +       struct regmap *regmap;
7203         u8 id;
7204  };
7206 @@ -39,58 +32,54 @@ static inline int is_pck(int id)
7208         return (id >= 8) && (id <= 15);
7210 -static irqreturn_t clk_system_irq_handler(int irq, void *dev_id)
7212 +static inline bool clk_system_ready(struct regmap *regmap, int id)
7214 -       struct clk_system *sys = (struct clk_system *)dev_id;
7215 +       unsigned int status;
7217 -       wake_up(&sys->wait);
7218 -       disable_irq_nosync(sys->irq);
7219 +       regmap_read(regmap, AT91_PMC_SR, &status);
7221 -       return IRQ_HANDLED;
7222 +       return status & (1 << id) ? 1 : 0;
7225  static int clk_system_prepare(struct clk_hw *hw)
7227         struct clk_system *sys = to_clk_system(hw);
7228 -       struct at91_pmc *pmc = sys->pmc;
7229 -       u32 mask = 1 << sys->id;
7231 -       pmc_write(pmc, AT91_PMC_SCER, mask);
7232 +       regmap_write(sys->regmap, AT91_PMC_SCER, 1 << sys->id);
7234         if (!is_pck(sys->id))
7235                 return 0;
7237 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
7238 -               if (sys->irq) {
7239 -                       enable_irq(sys->irq);
7240 -                       wait_event(sys->wait,
7241 -                                  pmc_read(pmc, AT91_PMC_SR) & mask);
7242 -               } else
7243 -                       cpu_relax();
7244 -       }
7245 +       while (!clk_system_ready(sys->regmap, sys->id))
7246 +               cpu_relax();
7248         return 0;
7251  static void clk_system_unprepare(struct clk_hw *hw)
7253         struct clk_system *sys = to_clk_system(hw);
7254 -       struct at91_pmc *pmc = sys->pmc;
7256 -       pmc_write(pmc, AT91_PMC_SCDR, 1 << sys->id);
7257 +       regmap_write(sys->regmap, AT91_PMC_SCDR, 1 << sys->id);
7260  static int clk_system_is_prepared(struct clk_hw *hw)
7262         struct clk_system *sys = to_clk_system(hw);
7263 -       struct at91_pmc *pmc = sys->pmc;
7264 +       unsigned int status;
7266 +       regmap_read(sys->regmap, AT91_PMC_SCSR, &status);
7268 -       if (!(pmc_read(pmc, AT91_PMC_SCSR) & (1 << sys->id)))
7269 +       if (!(status & (1 << sys->id)))
7270                 return 0;
7272         if (!is_pck(sys->id))
7273                 return 1;
7275 -       return !!(pmc_read(pmc, AT91_PMC_SR) & (1 << sys->id));
7276 +       regmap_read(sys->regmap, AT91_PMC_SR, &status);
7278 +       return status & (1 << sys->id) ? 1 : 0;
7281  static const struct clk_ops system_ops = {
7282 @@ -100,13 +89,12 @@ static const struct clk_ops system_ops = {
7283  };
7285  static struct clk * __init
7286 -at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7287 -                        const char *parent_name, u8 id, int irq)
7288 +at91_clk_register_system(struct regmap *regmap, const char *name,
7289 +                        const char *parent_name, u8 id)
7291         struct clk_system *sys;
7292         struct clk *clk = NULL;
7293         struct clk_init_data init;
7294 -       int ret;
7296         if (!parent_name || id > SYSTEM_MAX_ID)
7297                 return ERR_PTR(-EINVAL);
7298 @@ -123,44 +111,33 @@ at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7300         sys->id = id;
7301         sys->hw.init = &init;
7302 -       sys->pmc = pmc;
7303 -       sys->irq = irq;
7304 -       if (irq) {
7305 -               init_waitqueue_head(&sys->wait);
7306 -               irq_set_status_flags(sys->irq, IRQ_NOAUTOEN);
7307 -               ret = request_irq(sys->irq, clk_system_irq_handler,
7308 -                               IRQF_TRIGGER_HIGH, name, sys);
7309 -               if (ret) {
7310 -                       kfree(sys);
7311 -                       return ERR_PTR(ret);
7312 -               }
7313 -       }
7314 +       sys->regmap = regmap;
7316         clk = clk_register(NULL, &sys->hw);
7317 -       if (IS_ERR(clk)) {
7318 -               if (irq)
7319 -                       free_irq(sys->irq, sys);
7320 +       if (IS_ERR(clk))
7321                 kfree(sys);
7322 -       }
7324         return clk;
7327 -static void __init
7328 -of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7329 +static void __init of_at91rm9200_clk_sys_setup(struct device_node *np)
7331         int num;
7332 -       int irq = 0;
7333         u32 id;
7334         struct clk *clk;
7335         const char *name;
7336         struct device_node *sysclknp;
7337         const char *parent_name;
7338 +       struct regmap *regmap;
7340         num = of_get_child_count(np);
7341         if (num > (SYSTEM_MAX_ID + 1))
7342                 return;
7344 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7345 +       if (IS_ERR(regmap))
7346 +               return;
7348         for_each_child_of_node(np, sysclknp) {
7349                 if (of_property_read_u32(sysclknp, "reg", &id))
7350                         continue;
7351 @@ -168,21 +145,14 @@ of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7352                 if (of_property_read_string(np, "clock-output-names", &name))
7353                         name = sysclknp->name;
7355 -               if (is_pck(id))
7356 -                       irq = irq_of_parse_and_map(sysclknp, 0);
7358                 parent_name = of_clk_get_parent_name(sysclknp, 0);
7360 -               clk = at91_clk_register_system(pmc, name, parent_name, id, irq);
7361 +               clk = at91_clk_register_system(regmap, name, parent_name, id);
7362                 if (IS_ERR(clk))
7363                         continue;
7365                 of_clk_add_provider(sysclknp, of_clk_src_simple_get, clk);
7366         }
7369 -void __init of_at91rm9200_clk_sys_setup(struct device_node *np,
7370 -                                       struct at91_pmc *pmc)
7372 -       of_at91_clk_sys_setup(np, pmc);
7374 +CLK_OF_DECLARE(at91rm9200_clk_sys, "atmel,at91rm9200-clk-system",
7375 +              of_at91rm9200_clk_sys_setup);
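
clk-system drops the PMC interrupt plumbing entirely: rather than enabling an IRQ and sleeping on a waitqueue until a programmable clock becomes ready, it now busy-polls the status register through the regmap. A minimal sketch of that polling idiom — AT91_PMC_SR is the real status register from <linux/clk/at91_pmc.h>, the surrounding names are illustrative:

#include <linux/bits.h>
#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>
#include <asm/processor.h>	/* cpu_relax() */

static bool example_pck_ready(struct regmap *regmap, int id)
{
	unsigned int sr;

	regmap_read(regmap, AT91_PMC_SR, &sr);
	return sr & BIT(id);
}

static void example_wait_pck_ready(struct regmap *regmap, int id)
{
	/* spin until the ready bit for this clock appears in AT91_PMC_SR */
	while (!example_pck_ready(regmap, id))
		cpu_relax();
}
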
7376 diff --git a/drivers/clk/at91/clk-usb.c b/drivers/clk/at91/clk-usb.c
7377 index 8ab8502778a2..650ca45892c0 100644
7378 --- a/drivers/clk/at91/clk-usb.c
7379 +++ b/drivers/clk/at91/clk-usb.c
7380 @@ -12,8 +12,8 @@
7381  #include <linux/clkdev.h>
7382  #include <linux/clk/at91_pmc.h>
7383  #include <linux/of.h>
7384 -#include <linux/of_address.h>
7385 -#include <linux/io.h>
7386 +#include <linux/mfd/syscon.h>
7387 +#include <linux/regmap.h>
7389  #include "pmc.h"
7391 @@ -27,7 +27,7 @@
7393  struct at91sam9x5_clk_usb {
7394         struct clk_hw hw;
7395 -       struct at91_pmc *pmc;
7396 +       struct regmap *regmap;
7397  };
7399  #define to_at91sam9x5_clk_usb(hw) \
7400 @@ -35,7 +35,7 @@ struct at91sam9x5_clk_usb {
7402  struct at91rm9200_clk_usb {
7403         struct clk_hw hw;
7404 -       struct at91_pmc *pmc;
7405 +       struct regmap *regmap;
7406         u32 divisors[4];
7407  };
7409 @@ -45,13 +45,12 @@ struct at91rm9200_clk_usb {
7410  static unsigned long at91sam9x5_clk_usb_recalc_rate(struct clk_hw *hw,
7411                                                     unsigned long parent_rate)
7413 -       u32 tmp;
7414 -       u8 usbdiv;
7415         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7416 -       struct at91_pmc *pmc = usb->pmc;
7417 +       unsigned int usbr;
7418 +       u8 usbdiv;
7420 -       tmp = pmc_read(pmc, AT91_PMC_USB);
7421 -       usbdiv = (tmp & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7422 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7423 +       usbdiv = (usbr & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7425         return DIV_ROUND_CLOSEST(parent_rate, (usbdiv + 1));
7427 @@ -109,33 +108,31 @@ static int at91sam9x5_clk_usb_determine_rate(struct clk_hw *hw,
7429  static int at91sam9x5_clk_usb_set_parent(struct clk_hw *hw, u8 index)
7431 -       u32 tmp;
7432         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7433 -       struct at91_pmc *pmc = usb->pmc;
7435         if (index > 1)
7436                 return -EINVAL;
7437 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS;
7438 -       if (index)
7439 -               tmp |= AT91_PMC_USBS;
7440 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7442 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7443 +                          index ? AT91_PMC_USBS : 0);
7445         return 0;
7448  static u8 at91sam9x5_clk_usb_get_parent(struct clk_hw *hw)
7450         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7451 -       struct at91_pmc *pmc = usb->pmc;
7452 +       unsigned int usbr;
7454 -       return pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS;
7455 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7457 +       return usbr & AT91_PMC_USBS;
7460  static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7461                                        unsigned long parent_rate)
7463 -       u32 tmp;
7464         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7465 -       struct at91_pmc *pmc = usb->pmc;
7466         unsigned long div;
7468         if (!rate)
7469 @@ -145,9 +142,8 @@ static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7470         if (div > SAM9X5_USB_MAX_DIV + 1 || !div)
7471                 return -EINVAL;
7473 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_OHCIUSBDIV;
7474 -       tmp |= (div - 1) << SAM9X5_USB_DIV_SHIFT;
7475 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7476 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_OHCIUSBDIV,
7477 +                          (div - 1) << SAM9X5_USB_DIV_SHIFT);
7479         return 0;
7481 @@ -163,28 +159,28 @@ static const struct clk_ops at91sam9x5_usb_ops = {
7482  static int at91sam9n12_clk_usb_enable(struct clk_hw *hw)
7484         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7485 -       struct at91_pmc *pmc = usb->pmc;
7487 -       pmc_write(pmc, AT91_PMC_USB,
7488 -                 pmc_read(pmc, AT91_PMC_USB) | AT91_PMC_USBS);
7489 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7490 +                          AT91_PMC_USBS);
7492         return 0;
7495  static void at91sam9n12_clk_usb_disable(struct clk_hw *hw)
7497         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7498 -       struct at91_pmc *pmc = usb->pmc;
7500 -       pmc_write(pmc, AT91_PMC_USB,
7501 -                 pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS);
7502 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS, 0);
7505  static int at91sam9n12_clk_usb_is_enabled(struct clk_hw *hw)
7507         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7508 -       struct at91_pmc *pmc = usb->pmc;
7509 +       unsigned int usbr;
7511 -       return !!(pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS);
7512 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7514 +       return usbr & AT91_PMC_USBS;
7517  static const struct clk_ops at91sam9n12_usb_ops = {
7518 @@ -197,7 +193,7 @@ static const struct clk_ops at91sam9n12_usb_ops = {
7519  };
7521  static struct clk * __init
7522 -at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7523 +at91sam9x5_clk_register_usb(struct regmap *regmap, const char *name,
7524                             const char **parent_names, u8 num_parents)
7526         struct at91sam9x5_clk_usb *usb;
7527 @@ -216,7 +212,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7528                      CLK_SET_RATE_PARENT;
7530         usb->hw.init = &init;
7531 -       usb->pmc = pmc;
7532 +       usb->regmap = regmap;
7534         clk = clk_register(NULL, &usb->hw);
7535         if (IS_ERR(clk))
7536 @@ -226,7 +222,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7539  static struct clk * __init
7540 -at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7541 +at91sam9n12_clk_register_usb(struct regmap *regmap, const char *name,
7542                              const char *parent_name)
7544         struct at91sam9x5_clk_usb *usb;
7545 @@ -244,7 +240,7 @@ at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7546         init.flags = CLK_SET_RATE_GATE | CLK_SET_RATE_PARENT;
7548         usb->hw.init = &init;
7549 -       usb->pmc = pmc;
7550 +       usb->regmap = regmap;
7552         clk = clk_register(NULL, &usb->hw);
7553         if (IS_ERR(clk))
7554 @@ -257,12 +253,12 @@ static unsigned long at91rm9200_clk_usb_recalc_rate(struct clk_hw *hw,
7555                                                     unsigned long parent_rate)
7557         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7558 -       struct at91_pmc *pmc = usb->pmc;
7559 -       u32 tmp;
7560 +       unsigned int pllbr;
7561         u8 usbdiv;
7563 -       tmp = pmc_read(pmc, AT91_CKGR_PLLBR);
7564 -       usbdiv = (tmp & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7565 +       regmap_read(usb->regmap, AT91_CKGR_PLLBR, &pllbr);
7567 +       usbdiv = (pllbr & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7568         if (usb->divisors[usbdiv])
7569                 return parent_rate / usb->divisors[usbdiv];
7571 @@ -310,10 +306,8 @@ static long at91rm9200_clk_usb_round_rate(struct clk_hw *hw, unsigned long rate,
7572  static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7573                                        unsigned long parent_rate)
7575 -       u32 tmp;
7576         int i;
7577         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7578 -       struct at91_pmc *pmc = usb->pmc;
7579         unsigned long div;
7581         if (!rate)
7582 @@ -323,10 +317,10 @@ static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7584         for (i = 0; i < RM9200_USB_DIV_TAB_SIZE; i++) {
7585                 if (usb->divisors[i] == div) {
7586 -                       tmp = pmc_read(pmc, AT91_CKGR_PLLBR) &
7587 -                             ~AT91_PMC_USBDIV;
7588 -                       tmp |= i << RM9200_USB_DIV_SHIFT;
7589 -                       pmc_write(pmc, AT91_CKGR_PLLBR, tmp);
7590 +                       regmap_update_bits(usb->regmap, AT91_CKGR_PLLBR,
7591 +                                          AT91_PMC_USBDIV,
7592 +                                          i << RM9200_USB_DIV_SHIFT);
7594                         return 0;
7595                 }
7596         }
7597 @@ -341,7 +335,7 @@ static const struct clk_ops at91rm9200_usb_ops = {
7598  };
7600  static struct clk * __init
7601 -at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7602 +at91rm9200_clk_register_usb(struct regmap *regmap, const char *name,
7603                             const char *parent_name, const u32 *divisors)
7605         struct at91rm9200_clk_usb *usb;
7606 @@ -359,7 +353,7 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7607         init.flags = CLK_SET_RATE_PARENT;
7609         usb->hw.init = &init;
7610 -       usb->pmc = pmc;
7611 +       usb->regmap = regmap;
7612         memcpy(usb->divisors, divisors, sizeof(usb->divisors));
7614         clk = clk_register(NULL, &usb->hw);
7615 @@ -369,13 +363,13 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7616         return clk;
7619 -void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7620 -                                       struct at91_pmc *pmc)
7621 +static void __init of_at91sam9x5_clk_usb_setup(struct device_node *np)
7623         struct clk *clk;
7624         int num_parents;
7625         const char *parent_names[USB_SOURCE_MAX];
7626         const char *name = np->name;
7627 +       struct regmap *regmap;
7629         num_parents = of_clk_get_parent_count(np);
7630         if (num_parents <= 0 || num_parents > USB_SOURCE_MAX)
7631 @@ -385,19 +379,26 @@ void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7633         of_property_read_string(np, "clock-output-names", &name);
7635 -       clk = at91sam9x5_clk_register_usb(pmc, name, parent_names, num_parents);
7636 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7637 +       if (IS_ERR(regmap))
7638 +               return;
7640 +       clk = at91sam9x5_clk_register_usb(regmap, name, parent_names,
7641 +                                         num_parents);
7642         if (IS_ERR(clk))
7643                 return;
7645         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7647 +CLK_OF_DECLARE(at91sam9x5_clk_usb, "atmel,at91sam9x5-clk-usb",
7648 +              of_at91sam9x5_clk_usb_setup);
7650 -void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7651 -                                        struct at91_pmc *pmc)
7652 +static void __init of_at91sam9n12_clk_usb_setup(struct device_node *np)
7654         struct clk *clk;
7655         const char *parent_name;
7656         const char *name = np->name;
7657 +       struct regmap *regmap;
7659         parent_name = of_clk_get_parent_name(np, 0);
7660         if (!parent_name)
7661 @@ -405,20 +406,26 @@ void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7663         of_property_read_string(np, "clock-output-names", &name);
7665 -       clk = at91sam9n12_clk_register_usb(pmc, name, parent_name);
7666 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7667 +       if (IS_ERR(regmap))
7668 +               return;
7670 +       clk = at91sam9n12_clk_register_usb(regmap, name, parent_name);
7671         if (IS_ERR(clk))
7672                 return;
7674         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7676 +CLK_OF_DECLARE(at91sam9n12_clk_usb, "atmel,at91sam9n12-clk-usb",
7677 +              of_at91sam9n12_clk_usb_setup);
7679 -void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7680 -                                       struct at91_pmc *pmc)
7681 +static void __init of_at91rm9200_clk_usb_setup(struct device_node *np)
7683         struct clk *clk;
7684         const char *parent_name;
7685         const char *name = np->name;
7686         u32 divisors[4] = {0, 0, 0, 0};
7687 +       struct regmap *regmap;
7689         parent_name = of_clk_get_parent_name(np, 0);
7690         if (!parent_name)
7691 @@ -430,9 +437,15 @@ void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7693         of_property_read_string(np, "clock-output-names", &name);
7695 -       clk = at91rm9200_clk_register_usb(pmc, name, parent_name, divisors);
7696 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7697 +       if (IS_ERR(regmap))
7698 +               return;
7700 +       clk = at91rm9200_clk_register_usb(regmap, name, parent_name, divisors);
7701         if (IS_ERR(clk))
7702                 return;
7704         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7706 +CLK_OF_DECLARE(at91rm9200_clk_usb, "atmel,at91rm9200-clk-usb",
7707 +              of_at91rm9200_clk_usb_setup);
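
Throughout clk-usb the open-coded read-modify-write sequences collapse into regmap_update_bits(), which applies mask and value under the regmap's own locking. The before/after shape, with illustrative names:

#include <linux/io.h>
#include <linux/regmap.h>

/* old style: read, clear the field, OR in the new value, write back */
static void example_set_field_mmio(void __iomem *base, unsigned int off,
				   u32 mask, u32 val)
{
	u32 tmp;

	tmp = readl(base + off) & ~mask;
	writel(tmp | val, base + off);
}

/* new style: one call, serialized by the regmap's internal lock */
static void example_set_field_regmap(struct regmap *regmap, unsigned int off,
				     u32 mask, u32 val)
{
	regmap_update_bits(regmap, off, mask, val);
}
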
7708 diff --git a/drivers/clk/at91/clk-utmi.c b/drivers/clk/at91/clk-utmi.c
7709 index ca561e90a60f..61fcf399e58c 100644
7710 --- a/drivers/clk/at91/clk-utmi.c
7711 +++ b/drivers/clk/at91/clk-utmi.c
7712 @@ -11,14 +11,9 @@
7713  #include <linux/clk-provider.h>
7714  #include <linux/clkdev.h>
7715  #include <linux/clk/at91_pmc.h>
7716 -#include <linux/interrupt.h>
7717 -#include <linux/irq.h>
7718  #include <linux/of.h>
7719 -#include <linux/of_address.h>
7720 -#include <linux/of_irq.h>
7721 -#include <linux/io.h>
7722 -#include <linux/sched.h>
7723 -#include <linux/wait.h>
7724 +#include <linux/mfd/syscon.h>
7725 +#include <linux/regmap.h>
7727  #include "pmc.h"
7729 @@ -26,37 +21,30 @@
7731  struct clk_utmi {
7732         struct clk_hw hw;
7733 -       struct at91_pmc *pmc;
7734 -       unsigned int irq;
7735 -       wait_queue_head_t wait;
7736 +       struct regmap *regmap;
7737  };
7739  #define to_clk_utmi(hw) container_of(hw, struct clk_utmi, hw)
7741 -static irqreturn_t clk_utmi_irq_handler(int irq, void *dev_id)
7742 +static inline bool clk_utmi_ready(struct regmap *regmap)
7744 -       struct clk_utmi *utmi = (struct clk_utmi *)dev_id;
7745 +       unsigned int status;
7747 -       wake_up(&utmi->wait);
7748 -       disable_irq_nosync(utmi->irq);
7749 +       regmap_read(regmap, AT91_PMC_SR, &status);
7751 -       return IRQ_HANDLED;
7752 +       return status & AT91_PMC_LOCKU;
7755  static int clk_utmi_prepare(struct clk_hw *hw)
7757         struct clk_utmi *utmi = to_clk_utmi(hw);
7758 -       struct at91_pmc *pmc = utmi->pmc;
7759 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) | AT91_PMC_UPLLEN |
7760 -                 AT91_PMC_UPLLCOUNT | AT91_PMC_BIASEN;
7761 +       unsigned int uckr = AT91_PMC_UPLLEN | AT91_PMC_UPLLCOUNT |
7762 +                           AT91_PMC_BIASEN;
7764 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7765 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, uckr, uckr);
7767 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU)) {
7768 -               enable_irq(utmi->irq);
7769 -               wait_event(utmi->wait,
7770 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7771 -       }
7772 +       while (!clk_utmi_ready(utmi->regmap))
7773 +               cpu_relax();
7775         return 0;
7777 @@ -64,18 +52,15 @@ static int clk_utmi_prepare(struct clk_hw *hw)
7778  static int clk_utmi_is_prepared(struct clk_hw *hw)
7780         struct clk_utmi *utmi = to_clk_utmi(hw);
7781 -       struct at91_pmc *pmc = utmi->pmc;
7783 -       return !!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7784 +       return clk_utmi_ready(utmi->regmap);
7787  static void clk_utmi_unprepare(struct clk_hw *hw)
7789         struct clk_utmi *utmi = to_clk_utmi(hw);
7790 -       struct at91_pmc *pmc = utmi->pmc;
7791 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) & ~AT91_PMC_UPLLEN;
7793 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7794 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, AT91_PMC_UPLLEN, 0);
7797  static unsigned long clk_utmi_recalc_rate(struct clk_hw *hw,
7798 @@ -93,10 +78,9 @@ static const struct clk_ops utmi_ops = {
7799  };
7801  static struct clk * __init
7802 -at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7803 +at91_clk_register_utmi(struct regmap *regmap,
7804                        const char *name, const char *parent_name)
7806 -       int ret;
7807         struct clk_utmi *utmi;
7808         struct clk *clk = NULL;
7809         struct clk_init_data init;
7810 @@ -112,52 +96,36 @@ at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7811         init.flags = CLK_SET_RATE_GATE;
7813         utmi->hw.init = &init;
7814 -       utmi->pmc = pmc;
7815 -       utmi->irq = irq;
7816 -       init_waitqueue_head(&utmi->wait);
7817 -       irq_set_status_flags(utmi->irq, IRQ_NOAUTOEN);
7818 -       ret = request_irq(utmi->irq, clk_utmi_irq_handler,
7819 -                         IRQF_TRIGGER_HIGH, "clk-utmi", utmi);
7820 -       if (ret) {
7821 -               kfree(utmi);
7822 -               return ERR_PTR(ret);
7823 -       }
7824 +       utmi->regmap = regmap;
7826         clk = clk_register(NULL, &utmi->hw);
7827 -       if (IS_ERR(clk)) {
7828 -               free_irq(utmi->irq, utmi);
7829 +       if (IS_ERR(clk))
7830                 kfree(utmi);
7831 -       }
7833         return clk;
7836 -static void __init
7837 -of_at91_clk_utmi_setup(struct device_node *np, struct at91_pmc *pmc)
7838 +static void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np)
7840 -       unsigned int irq;
7841         struct clk *clk;
7842         const char *parent_name;
7843         const char *name = np->name;
7844 +       struct regmap *regmap;
7846         parent_name = of_clk_get_parent_name(np, 0);
7848         of_property_read_string(np, "clock-output-names", &name);
7850 -       irq = irq_of_parse_and_map(np, 0);
7851 -       if (!irq)
7852 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7853 +       if (IS_ERR(regmap))
7854                 return;
7856 -       clk = at91_clk_register_utmi(pmc, irq, name, parent_name);
7857 +       clk = at91_clk_register_utmi(regmap, name, parent_name);
7858         if (IS_ERR(clk))
7859                 return;
7861         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7862         return;
7865 -void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np,
7866 -                                        struct at91_pmc *pmc)
7868 -       of_at91_clk_utmi_setup(np, pmc);
7870 +CLK_OF_DECLARE(at91sam9x5_clk_utmi, "atmel,at91sam9x5-clk-utmi",
7871 +              of_at91sam9x5_clk_utmi_setup);
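
As with the other clock types, clk-utmi now registers its own OF match via CLK_OF_DECLARE and obtains the PMC regmap from its parent syscon node, so it no longer needs the at91_pmc pointer handed down from pmc.c. The general shape of such a setup routine, with placeholder names:

#include <linux/clk-provider.h>
#include <linux/err.h>
#include <linux/mfd/syscon.h>
#include <linux/of.h>
#include <linux/regmap.h>

static void __init example_clk_setup(struct device_node *np)
{
	struct regmap *regmap;

	/* the clock node is a child of the PMC syscon node in the DT */
	regmap = syscon_node_to_regmap(of_get_parent(np));
	if (IS_ERR(regmap))
		return;

	/* ...register the clock, storing the regmap instead of a pmc... */
}
CLK_OF_DECLARE(example_clk, "vendor,example-clk", example_clk_setup);

This is also why the pmc.c diff further down can delete the pmc_clk_ids dispatch table: of_clk_init() now calls each setup function directly.
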
7872 diff --git a/drivers/clk/at91/pmc.c b/drivers/clk/at91/pmc.c
7873 index 8476b570779b..526df5ba042d 100644
7874 --- a/drivers/clk/at91/pmc.c
7875 +++ b/drivers/clk/at91/pmc.c
7876 @@ -12,36 +12,13 @@
7877  #include <linux/clkdev.h>
7878  #include <linux/clk/at91_pmc.h>
7879  #include <linux/of.h>
7880 -#include <linux/of_address.h>
7881 -#include <linux/io.h>
7882 -#include <linux/interrupt.h>
7883 -#include <linux/irq.h>
7884 -#include <linux/irqchip/chained_irq.h>
7885 -#include <linux/irqdomain.h>
7886 -#include <linux/of_irq.h>
7887 +#include <linux/mfd/syscon.h>
7888 +#include <linux/regmap.h>
7890  #include <asm/proc-fns.h>
7892  #include "pmc.h"
7894 -void __iomem *at91_pmc_base;
7895 -EXPORT_SYMBOL_GPL(at91_pmc_base);
7897 -void at91rm9200_idle(void)
7899 -       /*
7900 -        * Disable the processor clock.  The processor will be automatically
7901 -        * re-enabled by an interrupt or by a reset.
7902 -        */
7903 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7906 -void at91sam9_idle(void)
7908 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7909 -       cpu_do_idle();
7912  int of_at91_get_clk_range(struct device_node *np, const char *propname,
7913                           struct clk_range *range)
7915 @@ -64,402 +41,3 @@ int of_at91_get_clk_range(struct device_node *np, const char *propname,
7916         return 0;
7918  EXPORT_SYMBOL_GPL(of_at91_get_clk_range);
7920 -static void pmc_irq_mask(struct irq_data *d)
7922 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7924 -       pmc_write(pmc, AT91_PMC_IDR, 1 << d->hwirq);
7927 -static void pmc_irq_unmask(struct irq_data *d)
7929 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7931 -       pmc_write(pmc, AT91_PMC_IER, 1 << d->hwirq);
7934 -static int pmc_irq_set_type(struct irq_data *d, unsigned type)
7936 -       if (type != IRQ_TYPE_LEVEL_HIGH) {
7937 -               pr_warn("PMC: type not supported (support only IRQ_TYPE_LEVEL_HIGH type)\n");
7938 -               return -EINVAL;
7939 -       }
7941 -       return 0;
7944 -static void pmc_irq_suspend(struct irq_data *d)
7946 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7948 -       pmc->imr = pmc_read(pmc, AT91_PMC_IMR);
7949 -       pmc_write(pmc, AT91_PMC_IDR, pmc->imr);
7952 -static void pmc_irq_resume(struct irq_data *d)
7954 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7956 -       pmc_write(pmc, AT91_PMC_IER, pmc->imr);
7959 -static struct irq_chip pmc_irq = {
7960 -       .name = "PMC",
7961 -       .irq_disable = pmc_irq_mask,
7962 -       .irq_mask = pmc_irq_mask,
7963 -       .irq_unmask = pmc_irq_unmask,
7964 -       .irq_set_type = pmc_irq_set_type,
7965 -       .irq_suspend = pmc_irq_suspend,
7966 -       .irq_resume = pmc_irq_resume,
7969 -static struct lock_class_key pmc_lock_class;
7971 -static int pmc_irq_map(struct irq_domain *h, unsigned int virq,
7972 -                      irq_hw_number_t hw)
7974 -       struct at91_pmc *pmc = h->host_data;
7976 -       irq_set_lockdep_class(virq, &pmc_lock_class);
7978 -       irq_set_chip_and_handler(virq, &pmc_irq,
7979 -                                handle_level_irq);
7980 -       irq_set_chip_data(virq, pmc);
7982 -       return 0;
7985 -static int pmc_irq_domain_xlate(struct irq_domain *d,
7986 -                               struct device_node *ctrlr,
7987 -                               const u32 *intspec, unsigned int intsize,
7988 -                               irq_hw_number_t *out_hwirq,
7989 -                               unsigned int *out_type)
7991 -       struct at91_pmc *pmc = d->host_data;
7992 -       const struct at91_pmc_caps *caps = pmc->caps;
7994 -       if (WARN_ON(intsize < 1))
7995 -               return -EINVAL;
7997 -       *out_hwirq = intspec[0];
7999 -       if (!(caps->available_irqs & (1 << *out_hwirq)))
8000 -               return -EINVAL;
8002 -       *out_type = IRQ_TYPE_LEVEL_HIGH;
8004 -       return 0;
8007 -static const struct irq_domain_ops pmc_irq_ops = {
8008 -       .map    = pmc_irq_map,
8009 -       .xlate  = pmc_irq_domain_xlate,
8012 -static irqreturn_t pmc_irq_handler(int irq, void *data)
8014 -       struct at91_pmc *pmc = (struct at91_pmc *)data;
8015 -       unsigned long sr;
8016 -       int n;
8018 -       sr = pmc_read(pmc, AT91_PMC_SR) & pmc_read(pmc, AT91_PMC_IMR);
8019 -       if (!sr)
8020 -               return IRQ_NONE;
8022 -       for_each_set_bit(n, &sr, BITS_PER_LONG)
8023 -               generic_handle_irq(irq_find_mapping(pmc->irqdomain, n));
8025 -       return IRQ_HANDLED;
8028 -static const struct at91_pmc_caps at91rm9200_caps = {
8029 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
8030 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
8031 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
8032 -                         AT91_PMC_PCK3RDY,
8035 -static const struct at91_pmc_caps at91sam9260_caps = {
8036 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
8037 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
8038 -                         AT91_PMC_PCK1RDY,
8041 -static const struct at91_pmc_caps at91sam9g45_caps = {
8042 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
8043 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
8044 -                         AT91_PMC_PCK1RDY,
8047 -static const struct at91_pmc_caps at91sam9n12_caps = {
8048 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
8049 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
8050 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
8051 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
8054 -static const struct at91_pmc_caps at91sam9x5_caps = {
8055 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
8056 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
8057 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
8058 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
8061 -static const struct at91_pmc_caps sama5d2_caps = {
8062 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
8063 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
8064 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
8065 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
8066 -                         AT91_PMC_CFDEV | AT91_PMC_GCKRDY,
8069 -static const struct at91_pmc_caps sama5d3_caps = {
8070 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
8071 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
8072 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
8073 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
8074 -                         AT91_PMC_CFDEV,
8077 -static struct at91_pmc *__init at91_pmc_init(struct device_node *np,
8078 -                                            void __iomem *regbase, int virq,
8079 -                                            const struct at91_pmc_caps *caps)
8081 -       struct at91_pmc *pmc;
8083 -       if (!regbase || !virq ||  !caps)
8084 -               return NULL;
8086 -       at91_pmc_base = regbase;
8088 -       pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
8089 -       if (!pmc)
8090 -               return NULL;
8092 -       spin_lock_init(&pmc->lock);
8093 -       pmc->regbase = regbase;
8094 -       pmc->virq = virq;
8095 -       pmc->caps = caps;
8097 -       pmc->irqdomain = irq_domain_add_linear(np, 32, &pmc_irq_ops, pmc);
8099 -       if (!pmc->irqdomain)
8100 -               goto out_free_pmc;
8102 -       pmc_write(pmc, AT91_PMC_IDR, 0xffffffff);
8103 -       if (request_irq(pmc->virq, pmc_irq_handler,
8104 -                       IRQF_SHARED | IRQF_COND_SUSPEND, "pmc", pmc))
8105 -               goto out_remove_irqdomain;
8107 -       return pmc;
8109 -out_remove_irqdomain:
8110 -       irq_domain_remove(pmc->irqdomain);
8111 -out_free_pmc:
8112 -       kfree(pmc);
8114 -       return NULL;
8117 -static const struct of_device_id pmc_clk_ids[] __initconst = {
8118 -       /* Slow oscillator */
8119 -       {
8120 -               .compatible = "atmel,at91sam9260-clk-slow",
8121 -               .data = of_at91sam9260_clk_slow_setup,
8122 -       },
8123 -       /* Main clock */
8124 -       {
8125 -               .compatible = "atmel,at91rm9200-clk-main-osc",
8126 -               .data = of_at91rm9200_clk_main_osc_setup,
8127 -       },
8128 -       {
8129 -               .compatible = "atmel,at91sam9x5-clk-main-rc-osc",
8130 -               .data = of_at91sam9x5_clk_main_rc_osc_setup,
8131 -       },
8132 -       {
8133 -               .compatible = "atmel,at91rm9200-clk-main",
8134 -               .data = of_at91rm9200_clk_main_setup,
8135 -       },
8136 -       {
8137 -               .compatible = "atmel,at91sam9x5-clk-main",
8138 -               .data = of_at91sam9x5_clk_main_setup,
8139 -       },
8140 -       /* PLL clocks */
8141 -       {
8142 -               .compatible = "atmel,at91rm9200-clk-pll",
8143 -               .data = of_at91rm9200_clk_pll_setup,
8144 -       },
8145 -       {
8146 -               .compatible = "atmel,at91sam9g45-clk-pll",
8147 -               .data = of_at91sam9g45_clk_pll_setup,
8148 -       },
8149 -       {
8150 -               .compatible = "atmel,at91sam9g20-clk-pllb",
8151 -               .data = of_at91sam9g20_clk_pllb_setup,
8152 -       },
8153 -       {
8154 -               .compatible = "atmel,sama5d3-clk-pll",
8155 -               .data = of_sama5d3_clk_pll_setup,
8156 -       },
8157 -       {
8158 -               .compatible = "atmel,at91sam9x5-clk-plldiv",
8159 -               .data = of_at91sam9x5_clk_plldiv_setup,
8160 -       },
8161 -       /* Master clock */
8162 -       {
8163 -               .compatible = "atmel,at91rm9200-clk-master",
8164 -               .data = of_at91rm9200_clk_master_setup,
8165 -       },
8166 -       {
8167 -               .compatible = "atmel,at91sam9x5-clk-master",
8168 -               .data = of_at91sam9x5_clk_master_setup,
8169 -       },
8170 -       /* System clocks */
8171 -       {
8172 -               .compatible = "atmel,at91rm9200-clk-system",
8173 -               .data = of_at91rm9200_clk_sys_setup,
8174 -       },
8175 -       /* Peripheral clocks */
8176 -       {
8177 -               .compatible = "atmel,at91rm9200-clk-peripheral",
8178 -               .data = of_at91rm9200_clk_periph_setup,
8179 -       },
8180 -       {
8181 -               .compatible = "atmel,at91sam9x5-clk-peripheral",
8182 -               .data = of_at91sam9x5_clk_periph_setup,
8183 -       },
8184 -       /* Programmable clocks */
8185 -       {
8186 -               .compatible = "atmel,at91rm9200-clk-programmable",
8187 -               .data = of_at91rm9200_clk_prog_setup,
8188 -       },
8189 -       {
8190 -               .compatible = "atmel,at91sam9g45-clk-programmable",
8191 -               .data = of_at91sam9g45_clk_prog_setup,
8192 -       },
8193 -       {
8194 -               .compatible = "atmel,at91sam9x5-clk-programmable",
8195 -               .data = of_at91sam9x5_clk_prog_setup,
8196 -       },
8197 -       /* UTMI clock */
8198 -#if defined(CONFIG_HAVE_AT91_UTMI)
8199 -       {
8200 -               .compatible = "atmel,at91sam9x5-clk-utmi",
8201 -               .data = of_at91sam9x5_clk_utmi_setup,
8202 -       },
8203 -#endif
8204 -       /* USB clock */
8205 -#if defined(CONFIG_HAVE_AT91_USB_CLK)
8206 -       {
8207 -               .compatible = "atmel,at91rm9200-clk-usb",
8208 -               .data = of_at91rm9200_clk_usb_setup,
8209 -       },
8210 -       {
8211 -               .compatible = "atmel,at91sam9x5-clk-usb",
8212 -               .data = of_at91sam9x5_clk_usb_setup,
8213 -       },
8214 -       {
8215 -               .compatible = "atmel,at91sam9n12-clk-usb",
8216 -               .data = of_at91sam9n12_clk_usb_setup,
8217 -       },
8218 -#endif
8219 -       /* SMD clock */
8220 -#if defined(CONFIG_HAVE_AT91_SMD)
8221 -       {
8222 -               .compatible = "atmel,at91sam9x5-clk-smd",
8223 -               .data = of_at91sam9x5_clk_smd_setup,
8224 -       },
8225 -#endif
8226 -#if defined(CONFIG_HAVE_AT91_H32MX)
8227 -       {
8228 -               .compatible = "atmel,sama5d4-clk-h32mx",
8229 -               .data = of_sama5d4_clk_h32mx_setup,
8230 -       },
8231 -#endif
8232 -#if defined(CONFIG_HAVE_AT91_GENERATED_CLK)
8233 -       {
8234 -               .compatible = "atmel,sama5d2-clk-generated",
8235 -               .data = of_sama5d2_clk_generated_setup,
8236 -       },
8237 -#endif
8238 -       { /*sentinel*/ }
8241 -static void __init of_at91_pmc_setup(struct device_node *np,
8242 -                                    const struct at91_pmc_caps *caps)
8244 -       struct at91_pmc *pmc;
8245 -       struct device_node *childnp;
8246 -       void (*clk_setup)(struct device_node *, struct at91_pmc *);
8247 -       const struct of_device_id *clk_id;
8248 -       void __iomem *regbase = of_iomap(np, 0);
8249 -       int virq;
8251 -       if (!regbase)
8252 -               return;
8254 -       virq = irq_of_parse_and_map(np, 0);
8255 -       if (!virq)
8256 -               return;
8258 -       pmc = at91_pmc_init(np, regbase, virq, caps);
8259 -       if (!pmc)
8260 -               return;
8261 -       for_each_child_of_node(np, childnp) {
8262 -               clk_id = of_match_node(pmc_clk_ids, childnp);
8263 -               if (!clk_id)
8264 -                       continue;
8265 -               clk_setup = clk_id->data;
8266 -               clk_setup(childnp, pmc);
8267 -       }
8270 -static void __init of_at91rm9200_pmc_setup(struct device_node *np)
8272 -       of_at91_pmc_setup(np, &at91rm9200_caps);
8274 -CLK_OF_DECLARE(at91rm9200_clk_pmc, "atmel,at91rm9200-pmc",
8275 -              of_at91rm9200_pmc_setup);
8277 -static void __init of_at91sam9260_pmc_setup(struct device_node *np)
8279 -       of_at91_pmc_setup(np, &at91sam9260_caps);
8281 -CLK_OF_DECLARE(at91sam9260_clk_pmc, "atmel,at91sam9260-pmc",
8282 -              of_at91sam9260_pmc_setup);
8284 -static void __init of_at91sam9g45_pmc_setup(struct device_node *np)
8286 -       of_at91_pmc_setup(np, &at91sam9g45_caps);
8288 -CLK_OF_DECLARE(at91sam9g45_clk_pmc, "atmel,at91sam9g45-pmc",
8289 -              of_at91sam9g45_pmc_setup);
8291 -static void __init of_at91sam9n12_pmc_setup(struct device_node *np)
8293 -       of_at91_pmc_setup(np, &at91sam9n12_caps);
8295 -CLK_OF_DECLARE(at91sam9n12_clk_pmc, "atmel,at91sam9n12-pmc",
8296 -              of_at91sam9n12_pmc_setup);
8298 -static void __init of_at91sam9x5_pmc_setup(struct device_node *np)
8300 -       of_at91_pmc_setup(np, &at91sam9x5_caps);
8302 -CLK_OF_DECLARE(at91sam9x5_clk_pmc, "atmel,at91sam9x5-pmc",
8303 -              of_at91sam9x5_pmc_setup);
8305 -static void __init of_sama5d2_pmc_setup(struct device_node *np)
8307 -       of_at91_pmc_setup(np, &sama5d2_caps);
8309 -CLK_OF_DECLARE(sama5d2_clk_pmc, "atmel,sama5d2-pmc",
8310 -              of_sama5d2_pmc_setup);
8312 -static void __init of_sama5d3_pmc_setup(struct device_node *np)
8314 -       of_at91_pmc_setup(np, &sama5d3_caps);
8316 -CLK_OF_DECLARE(sama5d3_clk_pmc, "atmel,sama5d3-pmc",
8317 -              of_sama5d3_pmc_setup);
8318 diff --git a/drivers/clk/at91/pmc.h b/drivers/clk/at91/pmc.h
8319 index f65739272779..5771fff0ee3f 100644
8320 --- a/drivers/clk/at91/pmc.h
8321 +++ b/drivers/clk/at91/pmc.h
8322 @@ -14,8 +14,11 @@
8324  #include <linux/io.h>
8325  #include <linux/irqdomain.h>
8326 +#include <linux/regmap.h>
8327  #include <linux/spinlock.h>
8329 +extern spinlock_t pmc_pcr_lock;
8331  struct clk_range {
8332         unsigned long min;
8333         unsigned long max;
8334 @@ -23,102 +26,7 @@ struct clk_range {
8336  #define CLK_RANGE(MIN, MAX) {.min = MIN, .max = MAX,}
8338 -struct at91_pmc_caps {
8339 -       u32 available_irqs;
8342 -struct at91_pmc {
8343 -       void __iomem *regbase;
8344 -       int virq;
8345 -       spinlock_t lock;
8346 -       const struct at91_pmc_caps *caps;
8347 -       struct irq_domain *irqdomain;
8348 -       u32 imr;
8351 -static inline void pmc_lock(struct at91_pmc *pmc)
8353 -       spin_lock(&pmc->lock);
8356 -static inline void pmc_unlock(struct at91_pmc *pmc)
8358 -       spin_unlock(&pmc->lock);
8361 -static inline u32 pmc_read(struct at91_pmc *pmc, int offset)
8363 -       return readl(pmc->regbase + offset);
8366 -static inline void pmc_write(struct at91_pmc *pmc, int offset, u32 value)
8368 -       writel(value, pmc->regbase + offset);
8371  int of_at91_get_clk_range(struct device_node *np, const char *propname,
8372                           struct clk_range *range);
8374 -void of_at91sam9260_clk_slow_setup(struct device_node *np,
8375 -                                  struct at91_pmc *pmc);
8377 -void of_at91rm9200_clk_main_osc_setup(struct device_node *np,
8378 -                                     struct at91_pmc *pmc);
8379 -void of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
8380 -                                        struct at91_pmc *pmc);
8381 -void of_at91rm9200_clk_main_setup(struct device_node *np,
8382 -                                 struct at91_pmc *pmc);
8383 -void of_at91sam9x5_clk_main_setup(struct device_node *np,
8384 -                                 struct at91_pmc *pmc);
8386 -void of_at91rm9200_clk_pll_setup(struct device_node *np,
8387 -                                struct at91_pmc *pmc);
8388 -void of_at91sam9g45_clk_pll_setup(struct device_node *np,
8389 -                                 struct at91_pmc *pmc);
8390 -void of_at91sam9g20_clk_pllb_setup(struct device_node *np,
8391 -                                  struct at91_pmc *pmc);
8392 -void of_sama5d3_clk_pll_setup(struct device_node *np,
8393 -                             struct at91_pmc *pmc);
8394 -void of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
8395 -                                   struct at91_pmc *pmc);
8397 -void of_at91rm9200_clk_master_setup(struct device_node *np,
8398 -                                   struct at91_pmc *pmc);
8399 -void of_at91sam9x5_clk_master_setup(struct device_node *np,
8400 -                                   struct at91_pmc *pmc);
8402 -void of_at91rm9200_clk_sys_setup(struct device_node *np,
8403 -                                struct at91_pmc *pmc);
8405 -void of_at91rm9200_clk_periph_setup(struct device_node *np,
8406 -                                   struct at91_pmc *pmc);
8407 -void of_at91sam9x5_clk_periph_setup(struct device_node *np,
8408 -                                   struct at91_pmc *pmc);
8410 -void of_at91rm9200_clk_prog_setup(struct device_node *np,
8411 -                                 struct at91_pmc *pmc);
8412 -void of_at91sam9g45_clk_prog_setup(struct device_node *np,
8413 -                                  struct at91_pmc *pmc);
8414 -void of_at91sam9x5_clk_prog_setup(struct device_node *np,
8415 -                                 struct at91_pmc *pmc);
8417 -void of_at91sam9x5_clk_utmi_setup(struct device_node *np,
8418 -                                 struct at91_pmc *pmc);
8420 -void of_at91rm9200_clk_usb_setup(struct device_node *np,
8421 -                                struct at91_pmc *pmc);
8422 -void of_at91sam9x5_clk_usb_setup(struct device_node *np,
8423 -                                struct at91_pmc *pmc);
8424 -void of_at91sam9n12_clk_usb_setup(struct device_node *np,
8425 -                                 struct at91_pmc *pmc);
8427 -void of_at91sam9x5_clk_smd_setup(struct device_node *np,
8428 -                                struct at91_pmc *pmc);
8430 -void of_sama5d4_clk_h32mx_setup(struct device_node *np,
8431 -                               struct at91_pmc *pmc);
8433 -void of_sama5d2_clk_generated_setup(struct device_node *np,
8434 -                                   struct at91_pmc *pmc);
8436  #endif /* __PMC_H_ */
8437 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
8438 index 4da2af9694a2..5b6f57f500b8 100644
8439 --- a/drivers/clocksource/tcb_clksrc.c
8440 +++ b/drivers/clocksource/tcb_clksrc.c
8441 @@ -23,8 +23,7 @@
8442   *     this 32 bit free-running counter. the second channel is not used.
8443   *
8444   *   - The third channel may be used to provide a 16-bit clockevent
8445 - *     source, used in either periodic or oneshot mode.  This runs
8446 - *     at 32 KiHZ, and can handle delays of up to two seconds.
8447 + *     source, used in either periodic or oneshot mode.
8448   *
8449   * A boot clocksource and clockevent source are also currently needed,
8450   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
8451 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
8452  struct tc_clkevt_device {
8453         struct clock_event_device       clkevt;
8454         struct clk                      *clk;
8455 +       bool                            clk_enabled;
8456 +       u32                             freq;
8457         void __iomem                    *regs;
8458  };
8460 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
8461         return container_of(clkevt, struct tc_clkevt_device, clkevt);
8464 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
8465 - * because using one of the divided clocks would usually mean the
8466 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
8467 - *
8468 - * A divided clock could be good for high resolution timers, since
8469 - * 30.5 usec resolution can seem "low".
8470 - */
8471  static u32 timer_clock;
8473 +static void tc_clk_disable(struct clock_event_device *d)
8475 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8477 +       clk_disable(tcd->clk);
8478 +       tcd->clk_enabled = false;
8481 +static void tc_clk_enable(struct clock_event_device *d)
8483 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8485 +       if (tcd->clk_enabled)
8486 +               return;
8487 +       clk_enable(tcd->clk);
8488 +       tcd->clk_enabled = true;
8491  static int tc_shutdown(struct clock_event_device *d)
8493         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8494 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
8496         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
8497         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
8498 +       return 0;
8501 +static int tc_shutdown_clk_off(struct clock_event_device *d)
8503 +       tc_shutdown(d);
8504         if (!clockevent_state_detached(d))
8505 -               clk_disable(tcd->clk);
8506 +               tc_clk_disable(d);
8508         return 0;
8510 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
8511         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
8512                 tc_shutdown(d);
8514 -       clk_enable(tcd->clk);
8515 +       tc_clk_enable(d);
8517 -       /* slow clock, count up to RC, then irq and stop */
8518 +       /* count up to RC, then irq and stop */
8519         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
8520                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
8521         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8522 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
8523         /* By not making the gentime core emulate periodic mode on top
8524          * of oneshot, we get lower overhead and improved accuracy.
8525          */
8526 -       clk_enable(tcd->clk);
8527 +       tc_clk_enable(d);
8529 -       /* slow clock, count up to RC, then irq and restart */
8530 +       /* count up to RC, then irq and restart */
8531         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
8532                      regs + ATMEL_TC_REG(2, CMR));
8533 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8534 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8536         /* Enable clock and interrupts on RC compare */
8537         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8538 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
8539                 .features               = CLOCK_EVT_FEAT_PERIODIC |
8540                                           CLOCK_EVT_FEAT_ONESHOT,
8541                 /* Should be lower than at91rm9200's system timer */
8542 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8543                 .rating                 = 125,
8544 +#else
8545 +               .rating                 = 200,
8546 +#endif
8547                 .set_next_event         = tc_next_event,
8548 -               .set_state_shutdown     = tc_shutdown,
8549 +               .set_state_shutdown     = tc_shutdown_clk_off,
8550                 .set_state_periodic     = tc_set_periodic,
8551                 .set_state_oneshot      = tc_set_oneshot,
8552         },
8553 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
8554         return IRQ_NONE;
8557 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8558 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
8560 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
8561         int ret;
8562         struct clk *t2_clk = tc->clk[2];
8563         int irq = tc->irq[2];
8564 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8565         clkevt.regs = tc->regs;
8566         clkevt.clk = t2_clk;
8568 -       timer_clock = clk32k_divisor_idx;
8569 +       timer_clock = divisor_idx;
8570 +       if (!divisor)
8571 +               clkevt.freq = 32768;
8572 +       else
8573 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
8575         clkevt.clkevt.cpumask = cpumask_of(0);
8577 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8578                 return ret;
8579         }
8581 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
8582 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
8584         return ret;
8586 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
8587                 goto err_disable_t1;
8589         /* channel 2:  periodic and oneshot timer support */
8590 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8591         ret = setup_clkevents(tc, clk32k_divisor_idx);
8592 +#else
8593 +       ret = setup_clkevents(tc, best_divisor_idx);
8594 +#endif
8595         if (ret)
8596                 goto err_unregister_clksrc;
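
With the RT option the TC clockevent can run from a divided master clock instead of the fixed 32 kHz slow clock, so both the frequency passed to clockevents_config_and_register() and the periodic RC reload are derived from the selected divisor. The arithmetic in isolation (rates are examples, helper names are not from the patch):

#include <linux/types.h>

/*
 * divisor == 0 is the "slow clock" entry of the TC divisor table;
 * any other value divides the timer input clock (t2_rate).
 */
static u32 example_tc_freq(unsigned long t2_rate, unsigned int divisor)
{
	if (!divisor)
		return 32768;

	return t2_rate / divisor;	/* e.g. 132000000 / 128 = 1031250 Hz */
}

/* periodic mode reloads RC so that one wrap lasts 1/hz seconds */
static u32 example_tc_periodic_rc(u32 freq, unsigned int hz)
{
	return (freq + hz / 2) / hz;	/* rounded to the nearest count */
}
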
8598 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
8599 index d911c5dca8f1..7a40f7e88468 100644
8600 --- a/drivers/clocksource/timer-atmel-pit.c
8601 +++ b/drivers/clocksource/timer-atmel-pit.c
8602 @@ -46,6 +46,7 @@ struct pit_data {
8603         u32             cycle;
8604         u32             cnt;
8605         unsigned int    irq;
8606 +       bool            irq_requested;
8607         struct clk      *mck;
8608  };
8610 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
8612         /* disable irq, leaving the clocksource active */
8613         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
8614 +       if (data->irq_requested) {
8615 +               free_irq(data->irq, data);
8616 +               data->irq_requested = false;
8617 +       }
8618         return 0;
8621 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
8622  /*
8623   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
8624   */
8625  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
8627         struct pit_data *data = clkevt_to_pit_data(dev);
8628 +       int ret;
8630 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8631 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8632 +                         "at91_tick", data);
8633 +       if (ret)
8634 +               panic(pr_fmt("Unable to setup IRQ\n"));
8636 +       data->irq_requested = true;
8638         /* update clocksource counter */
8639         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8640 @@ -181,7 +196,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8642         unsigned long   pit_rate;
8643         unsigned        bits;
8644 -       int             ret;
8646         /*
8647          * Use our actual MCK to figure out how many MCK/16 ticks per
8648 @@ -206,13 +220,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8649         data->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
8650         clocksource_register_hz(&data->clksrc, pit_rate);
8652 -       /* Set up irq handler */
8653 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8654 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8655 -                         "at91_tick", data);
8656 -       if (ret)
8657 -               panic(pr_fmt("Unable to setup IRQ\n"));
8659         /* Set up and register clockevents */
8660         data->clkevt.name = "pit";
8661         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
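
The PIT no longer installs its tick handler at init time; the IRQ is requested when the clockevent is switched to periodic mode and freed again in shutdown, so the handler is only present while the tick is actually in use. Reduced to its shape, with illustrative names:

#include <linux/interrupt.h>
#include <linux/types.h>

struct example_timer {
	unsigned int	irq;
	bool		irq_requested;
};

static int example_tick_start(struct example_timer *t, irq_handler_t handler)
{
	int ret;

	ret = request_irq(t->irq, handler,
			  IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
			  "example_tick", t);
	if (ret)
		return ret;

	t->irq_requested = true;
	return 0;
}

static void example_tick_stop(struct example_timer *t)
{
	if (t->irq_requested) {
		free_irq(t->irq, t);
		t->irq_requested = false;
	}
}

The timer-atmel-st diff that follows applies the same pattern, keeping the parsed IRQ in a file-scope variable so the mode-set callbacks can request and release it.
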
8662 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
8663 index 29d21d68df5a..103d0fd70cc4 100644
8664 --- a/drivers/clocksource/timer-atmel-st.c
8665 +++ b/drivers/clocksource/timer-atmel-st.c
8666 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
8667         last_crtr = read_CRTR();
8670 +static int atmel_st_irq;
8672  static int clkevt32k_shutdown(struct clock_event_device *evt)
8674         clkdev32k_disable_and_flush_irq();
8675         irqmask = 0;
8676         regmap_write(regmap_st, AT91_ST_IER, irqmask);
8677 +       free_irq(atmel_st_irq, regmap_st);
8678         return 0;
8681  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8683 +       int ret;
8685         clkdev32k_disable_and_flush_irq();
8687 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8688 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8689 +                         "at91_tick", regmap_st);
8690 +       if (ret)
8691 +               panic(pr_fmt("Unable to setup IRQ\n"));
8693         /*
8694          * ALM for oneshot irqs, set by next_event()
8695          * before 32 seconds have passed.
8696 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8698  static int clkevt32k_set_periodic(struct clock_event_device *dev)
8700 +       int ret;
8702         clkdev32k_disable_and_flush_irq();
8704 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8705 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8706 +                         "at91_tick", regmap_st);
8707 +       if (ret)
8708 +               panic(pr_fmt("Unable to setup IRQ\n"));
8710         /* PIT for periodic irqs; fixed rate of 1/HZ */
8711         irqmask = AT91_ST_PITS;
8712         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8713 @@ -198,7 +217,7 @@ static void __init atmel_st_timer_init(struct device_node *node)
8715         struct clk *sclk;
8716         unsigned int sclk_rate, val;
8717 -       int irq, ret;
8718 +       int ret;
8720         regmap_st = syscon_node_to_regmap(node);
8721         if (IS_ERR(regmap_st))
8722 @@ -210,17 +229,10 @@ static void __init atmel_st_timer_init(struct device_node *node)
8723         regmap_read(regmap_st, AT91_ST_SR, &val);
8725         /* Get the interrupts property */
8726 -       irq  = irq_of_parse_and_map(node, 0);
8727 -       if (!irq)
8728 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
8729 +       if (!atmel_st_irq)
8730                 panic(pr_fmt("Unable to get IRQ from DT\n"));
8732 -       /* Make IRQs happen for the system timer */
8733 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
8734 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8735 -                         "at91_tick", regmap_st);
8736 -       if (ret)
8737 -               panic(pr_fmt("Unable to setup IRQ\n"));
8739         sclk = of_clk_get(node, 0);
8740         if (IS_ERR(sclk))
8741                 panic(pr_fmt("Unable to get slow clock\n"));
8742 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
8743 index c59bdcb83217..8f23161d80be 100644
8744 --- a/drivers/cpufreq/Kconfig.x86
8745 +++ b/drivers/cpufreq/Kconfig.x86
8746 @@ -123,7 +123,7 @@ config X86_POWERNOW_K7_ACPI
8748  config X86_POWERNOW_K8
8749         tristate "AMD Opteron/Athlon64 PowerNow!"
8750 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8751 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8752         help
8753           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8754           Support for K10 and newer processors is now in acpi-cpufreq.
8755 diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
8756 index 344058f8501a..d5657d50ac40 100644
8757 --- a/drivers/cpuidle/coupled.c
8758 +++ b/drivers/cpuidle/coupled.c
8759 @@ -119,7 +119,6 @@ struct cpuidle_coupled {
8761  #define CPUIDLE_COUPLED_NOT_IDLE       (-1)
8763 -static DEFINE_MUTEX(cpuidle_coupled_lock);
8764  static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
8766  /*
8767 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8768 index 6ed7d63a0688..9da7482ad256 100644
8769 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8770 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8771 @@ -1264,7 +1264,9 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
8772         if (ret)
8773                 return ret;
8775 +#ifndef CONFIG_PREEMPT_RT_BASE
8776         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
8777 +#endif
8779         i915_gem_execbuffer_move_to_active(vmas, params->request);
8780         i915_gem_execbuffer_retire_commands(params);
8781 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8782 index c0a96f1ee18e..deb1e207fa3c 100644
8783 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
8784 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8785 @@ -39,7 +39,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
8786         if (!mutex_is_locked(mutex))
8787                 return false;
8789 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
8790 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
8791         return mutex->owner == task;
8792  #else
8793         /* Since UP may be pre-empted, we cannot assume that we own the lock */
8794 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
8795 index b7b0a38acd67..148dcb1349d5 100644
8796 --- a/drivers/gpu/drm/i915/i915_irq.c
8797 +++ b/drivers/gpu/drm/i915/i915_irq.c
8798 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8799         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8801         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8802 +       preempt_disable_rt();
8804         /* Get optional system timestamp before query. */
8805         if (stime)
8806 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8807                 *etime = ktime_get();
8809         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8810 +       preempt_enable_rt();
8812         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8814 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
8815 index 4f5d07bb3511..8ecd5c016dba 100644
8816 --- a/drivers/gpu/drm/i915/intel_display.c
8817 +++ b/drivers/gpu/drm/i915/intel_display.c
8818 @@ -11400,7 +11400,7 @@ void intel_check_page_flip(struct drm_device *dev, int pipe)
8819         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
8820         struct intel_unpin_work *work;
8822 -       WARN_ON(!in_interrupt());
8823 +       WARN_ON_NONRT(!in_interrupt());
8825         if (crtc == NULL)
8826                 return;
8827 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
8828 index 2cc6aa072f4c..b79d33f14868 100644
8829 --- a/drivers/gpu/drm/i915/intel_sprite.c
8830 +++ b/drivers/gpu/drm/i915/intel_sprite.c
8831 @@ -38,6 +38,7 @@
8832  #include "intel_drv.h"
8833  #include <drm/i915_drm.h>
8834  #include "i915_drv.h"
8835 +#include <linux/locallock.h>
8837  static bool
8838  format_is_yuv(uint32_t format)
8839 @@ -64,6 +65,8 @@ static int usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
8840                             1000 * adjusted_mode->crtc_htotal);
8843 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8845  /**
8846   * intel_pipe_update_start() - start update of a set of display registers
8847   * @crtc: the crtc of which the registers are going to be updated
8848 @@ -96,7 +99,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8849         min = vblank_start - usecs_to_scanlines(adjusted_mode, 100);
8850         max = vblank_start - 1;
8852 -       local_irq_disable();
8853 +       local_lock_irq(pipe_update_lock);
8855         if (min <= 0 || max <= 0)
8856                 return;
8857 @@ -126,11 +129,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8858                         break;
8859                 }
8861 -               local_irq_enable();
8862 +               local_unlock_irq(pipe_update_lock);
8864                 timeout = schedule_timeout(timeout);
8866 -               local_irq_disable();
8867 +               local_lock_irq(pipe_update_lock);
8868         }
8870         finish_wait(wq, &wait);
8871 @@ -164,7 +167,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc)
8873         trace_i915_pipe_update_end(crtc, end_vbl_count, scanline_end);
8875 -       local_irq_enable();
8876 +       local_unlock_irq(pipe_update_lock);
8878         if (crtc->debug.start_vbl_count &&
8879             crtc->debug.start_vbl_count != end_vbl_count) {
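The intel_sprite.c change above swaps the raw local_irq_disable()/local_irq_enable() pair for a local lock. A rough sketch of the pattern, assuming the <linux/locallock.h> header that this patch adds elsewhere (my_update_lock and my_update() are illustrative names): on a !PREEMPT_RT build local_lock_irq() reduces to disabling interrupts as before, while on RT it takes a per-CPU sleeping lock so the pipe-update window stays preemptible.

#include <linux/locallock.h>

static DEFINE_LOCAL_IRQ_LOCK(my_update_lock);

static void my_update(void)
{
        local_lock_irq(my_update_lock);
        /* code that previously ran with interrupts hard-disabled */
        local_unlock_irq(my_update_lock);
}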
8880 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
8881 index 3645b223aa37..642854b2ed2c 100644
8882 --- a/drivers/gpu/drm/radeon/radeon_display.c
8883 +++ b/drivers/gpu/drm/radeon/radeon_display.c
8884 @@ -1862,6 +1862,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8885         struct radeon_device *rdev = dev->dev_private;
8887         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8888 +       preempt_disable_rt();
8890         /* Get optional system timestamp before query. */
8891         if (stime)
8892 @@ -1954,6 +1955,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8893                 *etime = ktime_get();
8895         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8896 +       preempt_enable_rt();
8898         /* Decode into vertical and horizontal scanout position. */
8899         *vpos = position & 0x1fff;
8900 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
8901 index 802dcb409030..d6d9427860d8 100644
8902 --- a/drivers/hv/vmbus_drv.c
8903 +++ b/drivers/hv/vmbus_drv.c
8904 @@ -820,7 +820,7 @@ static void vmbus_isr(void)
8905                         tasklet_schedule(&msg_dpc);
8906         }
8908 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8909 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, 0);
8913 diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c
8914 index 08d26ba61ed3..46b89dd42b10 100644
8915 --- a/drivers/i2c/busses/i2c-omap.c
8916 +++ b/drivers/i2c/busses/i2c-omap.c
8917 @@ -995,15 +995,12 @@ omap_i2c_isr(int irq, void *dev_id)
8918         u16 mask;
8919         u16 stat;
8921 -       spin_lock(&omap->lock);
8922 -       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8923         stat = omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
8924 +       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8926         if (stat & mask)
8927                 ret = IRQ_WAKE_THREAD;
8929 -       spin_unlock(&omap->lock);
8931         return ret;
8934 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
8935 index 36f76e28a0bf..394f142f90c7 100644
8936 --- a/drivers/ide/alim15x3.c
8937 +++ b/drivers/ide/alim15x3.c
8938 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8940         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8942 -       local_irq_save(flags);
8943 +       local_irq_save_nort(flags);
8945         if (m5229_revision < 0xC2) {
8946                 /*
8947 @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8948         }
8949         pci_dev_put(north);
8950         pci_dev_put(isa_dev);
8951 -       local_irq_restore(flags);
8952 +       local_irq_restore_nort(flags);
8953         return 0;
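This and the following IDE, IPoIB and gameport hunks all switch to the *_nort() helpers that this patch introduces in the core headers. A hedged usage sketch (my_poll_device() is illustrative): the intent is that on a !PREEMPT_RT build these helpers behave exactly like their plain counterparts, while on RT the hard interrupt-off section is relaxed, since these code paths are allowed to be preempted there. The same naming convention appears later as WARN_ON_NONRT()/BUG_ON_NONRT(), which keep the assertion only on non-RT kernels.

static void my_poll_device(void)
{
        unsigned long flags;

        local_irq_save_nort(flags);     /* hard IRQ-off only on !RT */
        /* ... timing-sensitive device access ... */
        local_irq_restore_nort(flags);
}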
8956 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
8957 index 696b6c1ec940..0d0a96629b73 100644
8958 --- a/drivers/ide/hpt366.c
8959 +++ b/drivers/ide/hpt366.c
8960 @@ -1241,7 +1241,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8962         dma_old = inb(base + 2);
8964 -       local_irq_save(flags);
8965 +       local_irq_save_nort(flags);
8967         dma_new = dma_old;
8968         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8969 @@ -1252,7 +1252,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8970         if (dma_new != dma_old)
8971                 outb(dma_new, base + 2);
8973 -       local_irq_restore(flags);
8974 +       local_irq_restore_nort(flags);
8976         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
8977                          hwif->name, base, base + 7);
8978 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
8979 index 19763977568c..4169433faab5 100644
8980 --- a/drivers/ide/ide-io-std.c
8981 +++ b/drivers/ide/ide-io-std.c
8982 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8983                 unsigned long uninitialized_var(flags);
8985                 if ((io_32bit & 2) && !mmio) {
8986 -                       local_irq_save(flags);
8987 +                       local_irq_save_nort(flags);
8988                         ata_vlb_sync(io_ports->nsect_addr);
8989                 }
8991 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8992                         insl(data_addr, buf, words);
8994                 if ((io_32bit & 2) && !mmio)
8995 -                       local_irq_restore(flags);
8996 +                       local_irq_restore_nort(flags);
8998                 if (((len + 1) & 3) < 2)
8999                         return;
9000 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
9001                 unsigned long uninitialized_var(flags);
9003                 if ((io_32bit & 2) && !mmio) {
9004 -                       local_irq_save(flags);
9005 +                       local_irq_save_nort(flags);
9006                         ata_vlb_sync(io_ports->nsect_addr);
9007                 }
9009 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
9010                         outsl(data_addr, buf, words);
9012                 if ((io_32bit & 2) && !mmio)
9013 -                       local_irq_restore(flags);
9014 +                       local_irq_restore_nort(flags);
9016                 if (((len + 1) & 3) < 2)
9017                         return;
9018 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
9019 index 669ea1e45795..e12e43e62245 100644
9020 --- a/drivers/ide/ide-io.c
9021 +++ b/drivers/ide/ide-io.c
9022 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
9023                 /* disable_irq_nosync ?? */
9024                 disable_irq(hwif->irq);
9025                 /* local CPU only, as if we were handling an interrupt */
9026 -               local_irq_disable();
9027 +               local_irq_disable_nort();
9028                 if (hwif->polling) {
9029                         startstop = handler(drive);
9030                 } else if (drive_is_ready(drive)) {
9031 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
9032 index 376f2dc410c5..f014dd1b73dc 100644
9033 --- a/drivers/ide/ide-iops.c
9034 +++ b/drivers/ide/ide-iops.c
9035 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
9036                                 if ((stat & ATA_BUSY) == 0)
9037                                         break;
9039 -                               local_irq_restore(flags);
9040 +                               local_irq_restore_nort(flags);
9041                                 *rstat = stat;
9042                                 return -EBUSY;
9043                         }
9044                 }
9045 -               local_irq_restore(flags);
9046 +               local_irq_restore_nort(flags);
9047         }
9048         /*
9049          * Allow status to settle, then read it again.
9050 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
9051 index 0b63facd1d87..4ceba37afc0c 100644
9052 --- a/drivers/ide/ide-probe.c
9053 +++ b/drivers/ide/ide-probe.c
9054 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
9055         int bswap = 1;
9057         /* local CPU only; some systems need this */
9058 -       local_irq_save(flags);
9059 +       local_irq_save_nort(flags);
9060         /* read 512 bytes of id info */
9061         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
9062 -       local_irq_restore(flags);
9063 +       local_irq_restore_nort(flags);
9065         drive->dev_flags |= IDE_DFLAG_ID_READ;
9066  #ifdef DEBUG
9067 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
9068 index a716693417a3..be0568c722d6 100644
9069 --- a/drivers/ide/ide-taskfile.c
9070 +++ b/drivers/ide/ide-taskfile.c
9071 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
9073                 page_is_high = PageHighMem(page);
9074                 if (page_is_high)
9075 -                       local_irq_save(flags);
9076 +                       local_irq_save_nort(flags);
9078                 buf = kmap_atomic(page) + offset;
9080 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
9081                 kunmap_atomic(buf);
9083                 if (page_is_high)
9084 -                       local_irq_restore(flags);
9085 +                       local_irq_restore_nort(flags);
9087                 len -= nr_bytes;
9088         }
9089 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
9090         }
9092         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
9093 -               local_irq_disable();
9094 +               local_irq_disable_nort();
9096         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
9098 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9099 index 5580ab0b5781..a123d0439c4c 100644
9100 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9101 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9102 @@ -862,7 +862,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
9104         ipoib_dbg_mcast(priv, "restarting multicast task\n");
9106 -       local_irq_save(flags);
9107 +       local_irq_save_nort(flags);
9108         netif_addr_lock(dev);
9109         spin_lock(&priv->lock);
9111 @@ -944,7 +944,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
9113         spin_unlock(&priv->lock);
9114         netif_addr_unlock(dev);
9115 -       local_irq_restore(flags);
9116 +       local_irq_restore_nort(flags);
9118         /*
9119          * make sure the in-flight joins have finished before we attempt
9120 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
9121 index 4a2a9e370be7..e970d9afd179 100644
9122 --- a/drivers/input/gameport/gameport.c
9123 +++ b/drivers/input/gameport/gameport.c
9124 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
9125         tx = ~0;
9127         for (i = 0; i < 50; i++) {
9128 -               local_irq_save(flags);
9129 +               local_irq_save_nort(flags);
9130                 t1 = ktime_get_ns();
9131                 for (t = 0; t < 50; t++)
9132                         gameport_read(gameport);
9133                 t2 = ktime_get_ns();
9134                 t3 = ktime_get_ns();
9135 -               local_irq_restore(flags);
9136 +               local_irq_restore_nort(flags);
9137                 udelay(i * 10);
9138                 t = (t2 - t1) - (t3 - t2);
9139                 if (t < tx)
9140 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9141         tx = 1 << 30;
9143         for(i = 0; i < 50; i++) {
9144 -               local_irq_save(flags);
9145 +               local_irq_save_nort(flags);
9146                 GET_TIME(t1);
9147                 for (t = 0; t < 50; t++) gameport_read(gameport);
9148                 GET_TIME(t2);
9149                 GET_TIME(t3);
9150 -               local_irq_restore(flags);
9151 +               local_irq_restore_nort(flags);
9152                 udelay(i * 10);
9153                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
9154         }
9155 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9156         tx = 1 << 30;
9158         for(i = 0; i < 50; i++) {
9159 -               local_irq_save(flags);
9160 +               local_irq_save_nort(flags);
9161                 t1 = rdtsc();
9162                 for (t = 0; t < 50; t++) gameport_read(gameport);
9163                 t2 = rdtsc();
9164 -               local_irq_restore(flags);
9165 +               local_irq_restore_nort(flags);
9166                 udelay(i * 10);
9167                 if (t2 - t1 < tx) tx = t2 - t1;
9168         }
9169 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
9170 index 52c36394dba5..d777d0197f64 100644
9171 --- a/drivers/iommu/amd_iommu.c
9172 +++ b/drivers/iommu/amd_iommu.c
9173 @@ -2022,10 +2022,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
9174         int ret;
9176         /*
9177 -        * Must be called with IRQs disabled. Warn here to detect early
9178 -        * when its not.
9179 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
9180 +        * detect early when it's not.
9181          */
9182 -       WARN_ON(!irqs_disabled());
9183 +       WARN_ON_NONRT(!irqs_disabled());
9185         /* lock domain */
9186         spin_lock(&domain->lock);
9187 @@ -2188,10 +2188,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
9188         struct protection_domain *domain;
9190         /*
9191 -        * Must be called with IRQs disabled. Warn here to detect early
9192 -        * when its not.
9193 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
9194 +        * detect early when it's not.
9195          */
9196 -       WARN_ON(!irqs_disabled());
9197 +       WARN_ON_NONRT(!irqs_disabled());
9199         if (WARN_ON(!dev_data->domain))
9200                 return;
9201 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
9202 index 5bda6a9b56bb..d6286584c807 100644
9203 --- a/drivers/leds/trigger/Kconfig
9204 +++ b/drivers/leds/trigger/Kconfig
9205 @@ -61,7 +61,7 @@ config LEDS_TRIGGER_BACKLIGHT
9207  config LEDS_TRIGGER_CPU
9208         bool "LED CPU Trigger"
9209 -       depends on LEDS_TRIGGERS
9210 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9211         help
9212           This allows LEDs to be controlled by active CPUs. This shows
9213           the active CPUs across an array of LEDs so you can see which
9214 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
9215 index 4d200883c505..98b64ed5cb81 100644
9216 --- a/drivers/md/bcache/Kconfig
9217 +++ b/drivers/md/bcache/Kconfig
9218 @@ -1,6 +1,7 @@
9220  config BCACHE
9221         tristate "Block device as cache"
9222 +       depends on !PREEMPT_RT_FULL
9223         ---help---
9224         Allows a block device to be used as cache for other devices; uses
9225         a btree for indexing and the layout is optimized for SSDs.
9226 diff --git a/drivers/md/dm.c b/drivers/md/dm.c
9227 index 9ec6948e3b8b..ecbc23575114 100644
9228 --- a/drivers/md/dm.c
9229 +++ b/drivers/md/dm.c
9230 @@ -2185,7 +2185,7 @@ static void dm_request_fn(struct request_queue *q)
9231                 /* Establish tio->ti before queuing work (map_tio_request) */
9232                 tio->ti = ti;
9233                 queue_kthread_work(&md->kworker, &tio->work);
9234 -               BUG_ON(!irqs_disabled());
9235 +               BUG_ON_NONRT(!irqs_disabled());
9236         }
9238         goto out;
9239 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
9240 index 86ab6d14d782..573b9ac810da 100644
9241 --- a/drivers/md/raid5.c
9242 +++ b/drivers/md/raid5.c
9243 @@ -429,7 +429,7 @@ void raid5_release_stripe(struct stripe_head *sh)
9244                 md_wakeup_thread(conf->mddev->thread);
9245         return;
9246  slow_path:
9247 -       local_irq_save(flags);
9248 +       local_irq_save_nort(flags);
9249         /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
9250         if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
9251                 INIT_LIST_HEAD(&list);
9252 @@ -438,7 +438,7 @@ void raid5_release_stripe(struct stripe_head *sh)
9253                 spin_unlock(&conf->device_lock);
9254                 release_inactive_stripe_list(conf, &list, hash);
9255         }
9256 -       local_irq_restore(flags);
9257 +       local_irq_restore_nort(flags);
9260  static inline void remove_hash(struct stripe_head *sh)
9261 @@ -1929,8 +1929,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9262         struct raid5_percpu *percpu;
9263         unsigned long cpu;
9265 -       cpu = get_cpu();
9266 +       cpu = get_cpu_light();
9267         percpu = per_cpu_ptr(conf->percpu, cpu);
9268 +       spin_lock(&percpu->lock);
9269         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9270                 ops_run_biofill(sh);
9271                 overlap_clear++;
9272 @@ -1986,7 +1987,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9273                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
9274                                 wake_up(&sh->raid_conf->wait_for_overlap);
9275                 }
9276 -       put_cpu();
9277 +       spin_unlock(&percpu->lock);
9278 +       put_cpu_light();
9281  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
9282 @@ -6433,6 +6435,7 @@ static int raid5_alloc_percpu(struct r5conf *conf)
9283                                __func__, cpu);
9284                         break;
9285                 }
9286 +               spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9287         }
9288         put_online_cpus();
9290 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
9291 index 517d4b68a1be..efe91887ecd7 100644
9292 --- a/drivers/md/raid5.h
9293 +++ b/drivers/md/raid5.h
9294 @@ -504,6 +504,7 @@ struct r5conf {
9295         int                     recovery_disabled;
9296         /* per cpu variables */
9297         struct raid5_percpu {
9298 +               spinlock_t      lock;           /* Protection for -RT */
9299                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
9300                 struct flex_array *scribble;   /* space for constructing buffer
9301                                               * lists and performing address
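The raid5 hunks above replace get_cpu() with get_cpu_light() and guard the per-CPU scratch area with the new raid5_percpu::lock, presumably because get_cpu() would otherwise pin the whole stripe operation into a non-preemptible section on RT. A minimal sketch of that pattern with illustrative names (my_percpu, my_run_ops); get_cpu_light()/put_cpu_light() are the RT-patch helpers used above, not mainline API:

struct my_percpu {
        spinlock_t lock;
        void *scratch;
};

static void my_run_ops(struct my_percpu __percpu *pcpu)
{
        int cpu = get_cpu_light();
        struct my_percpu *p = per_cpu_ptr(pcpu, cpu);

        spin_lock(&p->lock);
        /* ... work on p->scratch ... */
        spin_unlock(&p->lock);
        put_cpu_light();
}

Each per-CPU lock still has to be initialized once, which is what the added spin_lock_init() call in raid5_alloc_percpu() does.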
9302 diff --git a/drivers/media/platform/vsp1/vsp1_video.c b/drivers/media/platform/vsp1/vsp1_video.c
9303 index 5ce88e1f5d71..b4f8cd74ecb8 100644
9304 --- a/drivers/media/platform/vsp1/vsp1_video.c
9305 +++ b/drivers/media/platform/vsp1/vsp1_video.c
9306 @@ -520,7 +520,7 @@ static bool vsp1_pipeline_stopped(struct vsp1_pipeline *pipe)
9307         bool stopped;
9309         spin_lock_irqsave(&pipe->irqlock, flags);
9310 -       stopped = pipe->state == VSP1_PIPELINE_STOPPED,
9311 +       stopped = pipe->state == VSP1_PIPELINE_STOPPED;
9312         spin_unlock_irqrestore(&pipe->irqlock, flags);
9314         return stopped;
9315 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
9316 index 4bf7d50b1bc7..6f7e99ad6e29 100644
9317 --- a/drivers/misc/Kconfig
9318 +++ b/drivers/misc/Kconfig
9319 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
9320  config ATMEL_TCLIB
9321         bool "Atmel AT32/AT91 Timer/Counter Library"
9322         depends on (AVR32 || ARCH_AT91)
9323 +       default y if PREEMPT_RT_FULL
9324         help
9325           Select this if you want a library to allocate the Timer/Counter
9326           blocks found on many Atmel processors.  This facilitates using
9327 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
9328           are combined to make a single 32-bit timer.
9330           When GENERIC_CLOCKEVENTS is defined, the third timer channel
9331 -         may be used as a clock event device supporting oneshot mode
9332 -         (delays of up to two seconds) based on the 32 KiHz clock.
9333 +         may be used as a clock event device supporting oneshot mode.
9335  config ATMEL_TCB_CLKSRC_BLOCK
9336         int
9337 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
9338           TC can be used for other purposes, such as PWM generation and
9339           interval timing.
9341 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9342 +       bool "TC Block use 32 KiHz clock"
9343 +       depends on ATMEL_TCB_CLKSRC
9344 +       default y if !PREEMPT_RT_FULL
9345 +       help
9346 +         Select this to use 32 KiHz base clock rate as TC block clock
9347 +         source for clock events.
9350  config DUMMY_IRQ
9351         tristate "Dummy IRQ handler"
9352         default n
9353 @@ -113,6 +122,35 @@ config IBM_ASM
9354           for information on the specific driver level and support statement
9355           for your IBM server.
9357 +config HWLAT_DETECTOR
9358 +       tristate "Testing module to detect hardware-induced latencies"
9359 +       depends on DEBUG_FS
9360 +       depends on RING_BUFFER
9361 +       default m
9362 +       ---help---
9363 +         A simple hardware latency detector. Use this module to detect
9364 +         large latencies introduced by the behavior of the underlying
9365 +         system firmware external to Linux. We do this by periodically
9366 +         using stop_machine to grab all available CPUs and looking
9367 +         for unexplainable gaps in the CPU timestamp counter(s). By
9368 +         default, the module is not enabled until the "enable" file
9369 +         within the "hwlat_detector" debugfs directory is toggled.
9371 +         This module is often used to detect SMIs (System Management
9372 +         Interrupts) on x86 systems, though it is not x86 specific. To
9373 +         this end, we default to using a sample window of 1 second,
9374 +         during which we will sample for 0.5 seconds. If an SMI or
9375 +         similar event occurs during that time, it is recorded
9376 +         into an 8K-sample global ring buffer until retrieved.
9378 +         WARNING: This software should never be enabled (it can be built
9379 +         but should not be turned on after it is loaded) in a production
9380 +         environment where high latencies are a concern since the
9381 +         sampling mechanism actually introduces latencies for
9382 +         regular tasks while the CPU(s) are being held.
9384 +         If unsure, say N.
9386  config PHANTOM
9387         tristate "Sensable PHANToM (PCI)"
9388         depends on PCI
9389 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
9390 index 537d7f3b78da..ec4aecba0656 100644
9391 --- a/drivers/misc/Makefile
9392 +++ b/drivers/misc/Makefile
9393 @@ -39,6 +39,7 @@ obj-$(CONFIG_C2PORT)          += c2port/
9394  obj-$(CONFIG_HMC6352)          += hmc6352.o
9395  obj-y                          += eeprom/
9396  obj-y                          += cb710/
9397 +obj-$(CONFIG_HWLAT_DETECTOR)   += hwlat_detector.o
9398  obj-$(CONFIG_SPEAR13XX_PCIE_GADGET)    += spear13xx_pcie_gadget.o
9399  obj-$(CONFIG_VMWARE_BALLOON)   += vmw_balloon.o
9400  obj-$(CONFIG_ARM_CHARLCD)      += arm-charlcd.o
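The new drivers/misc/hwlat_detector.c added below implements the idea described in its header comment: with everything else held off, read the clock twice back to back and treat any gap larger than the threshold as hardware- or firmware-induced latency. A minimal user-space illustration of that loop (plain POSIX timing, not the module's kernel interfaces; without the IRQ-off/stop_machine protection it also picks up ordinary scheduler noise):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_us(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000ULL + ts.tv_nsec / 1000;
}

int main(void)
{
        const uint64_t threshold_us = 10;       /* cf. DEFAULT_LAT_THRESHOLD */
        const uint64_t width_us = 500000;       /* cf. DEFAULT_SAMPLE_WIDTH */
        uint64_t start = now_us(), max_gap = 0;

        while (now_us() - start < width_us) {
                uint64_t t1 = now_us();
                uint64_t t2 = now_us();

                if (t2 - t1 > max_gap)
                        max_gap = t2 - t1;
        }
        if (max_gap > threshold_us)
                printf("latency spike: %llu us\n",
                       (unsigned long long)max_gap);
        return 0;
}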
9401 diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c
9402 new file mode 100644
9403 index 000000000000..52f5ad5fd9c0
9404 --- /dev/null
9405 +++ b/drivers/misc/hwlat_detector.c
9406 @@ -0,0 +1,1240 @@
9408 + * hwlat_detector.c - A simple Hardware Latency detector.
9409 + *
9410 + * Use this module to detect large system latencies induced by the behavior of
9411 + * certain underlying system hardware or firmware, independent of Linux itself.
9412 + * The code was developed originally to detect the presence of SMIs on Intel
9413 + * and AMD systems, although there is no dependency upon x86 herein.
9414 + *
9415 + * The classical example usage of this module is in detecting the presence of
9416 + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
9417 + * somewhat special form of hardware interrupt spawned from earlier CPU debug
9418 + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
9419 + * LPC (or other device) to generate a special interrupt under certain
9420 + * circumstances, for example, upon expiration of a special SMI timer device,
9421 + * due to certain external thermal readings, on certain I/O address accesses,
9422 + * and other situations. An SMI hits a special CPU pin, triggers a special
9423 + * SMI mode (complete with special memory map), and the OS is unaware.
9424 + *
9425 + * Although certain hardware-induced latencies are necessary (for example,
9426 + * a modern system often requires an SMI handler for correct thermal control
9427 + * and remote management) they can wreak havoc upon any OS-level performance
9428 + * guarantees toward low-latency, especially when the OS is not even made
9429 + * aware of the presence of these interrupts. For this reason, we need a
9430 + * somewhat brute force mechanism to detect these interrupts. In this case,
9431 + * we do it by hogging all of the CPU(s) for configurable timer intervals,
9432 + * sampling the built-in CPU timer, looking for discontiguous readings.
9433 + *
9434 + * WARNING: This implementation necessarily introduces latencies. Therefore,
9435 + *          you should NEVER use this module in a production environment
9436 + *          requiring any kind of low-latency performance guarantee(s).
9437 + *
9438 + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
9439 + *
9440 + * Includes useful feedback from Clark Williams <clark@redhat.com>
9441 + *
9442 + * This file is licensed under the terms of the GNU General Public
9443 + * License version 2. This program is licensed "as is" without any
9444 + * warranty of any kind, whether express or implied.
9445 + */
9447 +#include <linux/module.h>
9448 +#include <linux/init.h>
9449 +#include <linux/ring_buffer.h>
9450 +#include <linux/time.h>
9451 +#include <linux/hrtimer.h>
9452 +#include <linux/kthread.h>
9453 +#include <linux/debugfs.h>
9454 +#include <linux/seq_file.h>
9455 +#include <linux/uaccess.h>
9456 +#include <linux/version.h>
9457 +#include <linux/delay.h>
9458 +#include <linux/slab.h>
9459 +#include <linux/trace_clock.h>
9461 +#define BUF_SIZE_DEFAULT       262144UL                /* 8K*(sizeof(entry)) */
9462 +#define BUF_FLAGS              (RB_FL_OVERWRITE)       /* no block on full */
9463 +#define U64STR_SIZE            22                      /* 20 digits max */
9465 +#define VERSION                        "1.0.0"
9466 +#define BANNER                 "hwlat_detector: "
9467 +#define DRVNAME                        "hwlat_detector"
9468 +#define DEFAULT_SAMPLE_WINDOW  1000000                 /* 1s */
9469 +#define DEFAULT_SAMPLE_WIDTH   500000                  /* 0.5s */
9470 +#define DEFAULT_LAT_THRESHOLD  10                      /* 10us */
9472 +/* Module metadata */
9474 +MODULE_LICENSE("GPL");
9475 +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>");
9476 +MODULE_DESCRIPTION("A simple hardware latency detector");
9477 +MODULE_VERSION(VERSION);
9479 +/* Module parameters */
9481 +static int debug;
9482 +static int enabled;
9483 +static int threshold;
9485 +module_param(debug, int, 0);                   /* enable debug */
9486 +module_param(enabled, int, 0);                 /* enable detector */
9487 +module_param(threshold, int, 0);               /* latency threshold */
9489 +/* Buffering and sampling */
9491 +static struct ring_buffer *ring_buffer;                /* sample buffer */
9492 +static DEFINE_MUTEX(ring_buffer_mutex);                /* lock changes */
9493 +static unsigned long buf_size = BUF_SIZE_DEFAULT;
9494 +static struct task_struct *kthread;            /* sampling thread */
9496 +/* DebugFS filesystem entries */
9498 +static struct dentry *debug_dir;               /* debugfs directory */
9499 +static struct dentry *debug_max;               /* maximum TSC delta */
9500 +static struct dentry *debug_count;             /* total detect count */
9501 +static struct dentry *debug_sample_width;      /* sample width us */
9502 +static struct dentry *debug_sample_window;     /* sample window us */
9503 +static struct dentry *debug_sample;            /* raw samples us */
9504 +static struct dentry *debug_threshold;         /* threshold us */
9505 +static struct dentry *debug_enable;            /* enable/disable */
9507 +/* Individual samples and global state */
9509 +struct sample;                                 /* latency sample */
9510 +struct data;                                   /* Global state */
9512 +/* Sampling functions */
9513 +static int __buffer_add_sample(struct sample *sample);
9514 +static struct sample *buffer_get_sample(struct sample *sample);
9516 +/* Threading and state */
9517 +static int kthread_fn(void *unused);
9518 +static int start_kthread(void);
9519 +static int stop_kthread(void);
9520 +static void __reset_stats(void);
9521 +static int init_stats(void);
9523 +/* Debugfs interface */
9524 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9525 +                               size_t cnt, loff_t *ppos, const u64 *entry);
9526 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9527 +                                size_t cnt, loff_t *ppos, u64 *entry);
9528 +static int debug_sample_fopen(struct inode *inode, struct file *filp);
9529 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
9530 +                                 size_t cnt, loff_t *ppos);
9531 +static int debug_sample_release(struct inode *inode, struct file *filp);
9532 +static int debug_enable_fopen(struct inode *inode, struct file *filp);
9533 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9534 +                                 size_t cnt, loff_t *ppos);
9535 +static ssize_t debug_enable_fwrite(struct file *file,
9536 +                                  const char __user *user_buffer,
9537 +                                  size_t user_size, loff_t *offset);
9539 +/* Initialization functions */
9540 +static int init_debugfs(void);
9541 +static void free_debugfs(void);
9542 +static int detector_init(void);
9543 +static void detector_exit(void);
9545 +/* Individual latency samples are stored here when detected and packed into
9546 + * the ring_buffer circular buffer, where they are overwritten when
9547 + * more than buf_size/sizeof(sample) samples are received. */
9548 +struct sample {
9549 +       u64             seqnum;         /* unique sequence */
9550 +       u64             duration;       /* ktime delta */
9551 +       u64             outer_duration; /* ktime delta (outer loop) */
9552 +       struct timespec timestamp;      /* wall time */
9553 +       unsigned long   lost;
9556 +/* keep the global state somewhere. */
9557 +static struct data {
9559 +       struct mutex lock;              /* protect changes */
9561 +       u64     count;                  /* total since reset */
9562 +       u64     max_sample;             /* max hardware latency */
9563 +       u64     threshold;              /* sample threshold level */
9565 +       u64     sample_window;          /* total sampling window (on+off) */
9566 +       u64     sample_width;           /* active sampling portion of window */
9568 +       atomic_t sample_open;           /* whether the sample file is open */
9570 +       wait_queue_head_t wq;           /* waitqueue for new sample values */
9572 +} data;
9574 +/**
9575 + * __buffer_add_sample - add a new latency sample recording to the ring buffer
9576 + * @sample: The new latency sample value
9577 + *
9578 + * This receives a new latency sample and records it in a global ring buffer.
9579 + * No additional locking is used in this case.
9580 + */
9581 +static int __buffer_add_sample(struct sample *sample)
9583 +       return ring_buffer_write(ring_buffer,
9584 +                                sizeof(struct sample), sample);
9587 +/**
9588 + * buffer_get_sample - remove a hardware latency sample from the ring buffer
9589 + * @sample: Pre-allocated storage for the sample
9590 + *
9591 + * This retrieves a hardware latency sample from the global circular buffer
9592 + */
9593 +static struct sample *buffer_get_sample(struct sample *sample)
9595 +       struct ring_buffer_event *e = NULL;
9596 +       struct sample *s = NULL;
9597 +       unsigned int cpu = 0;
9599 +       if (!sample)
9600 +               return NULL;
9602 +       mutex_lock(&ring_buffer_mutex);
9603 +       for_each_online_cpu(cpu) {
9604 +               e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost);
9605 +               if (e)
9606 +                       break;
9607 +       }
9609 +       if (e) {
9610 +               s = ring_buffer_event_data(e);
9611 +               memcpy(sample, s, sizeof(struct sample));
9612 +       } else
9613 +               sample = NULL;
9614 +       mutex_unlock(&ring_buffer_mutex);
9616 +       return sample;
9619 +#ifndef CONFIG_TRACING
9620 +#define time_type      ktime_t
9621 +#define time_get()     ktime_get()
9622 +#define time_to_us(x)  ktime_to_us(x)
9623 +#define time_sub(a, b) ktime_sub(a, b)
9624 +#define init_time(a, b)        (a).tv64 = b
9625 +#define time_u64(a)    ((a).tv64)
9626 +#else
9627 +#define time_type      u64
9628 +#define time_get()     trace_clock_local()
9629 +#define time_to_us(x)  div_u64(x, 1000)
9630 +#define time_sub(a, b) ((a) - (b))
9631 +#define init_time(a, b)        (a = b)
9632 +#define time_u64(a)    a
9633 +#endif
9634 +/**
9635 + * get_sample - sample the CPU TSC and look for likely hardware latencies
9636 + *
9637 + * Used to repeatedly capture the CPU TSC (or similar), looking for potential
9638 + * hardware-induced latency. Called with interrupts disabled and with
9639 + * data.lock held.
9640 + */
9641 +static int get_sample(void)
9643 +       time_type start, t1, t2, last_t2;
9644 +       s64 diff, total = 0;
9645 +       u64 sample = 0;
9646 +       u64 outer_sample = 0;
9647 +       int ret = -1;
9649 +       init_time(last_t2, 0);
9650 +       start = time_get(); /* start timestamp */
9652 +       do {
9654 +               t1 = time_get();        /* we'll look for a discontinuity */
9655 +               t2 = time_get();
9657 +               if (time_u64(last_t2)) {
9658 +                       /* Check the delta from outer loop (t2 to next t1) */
9659 +                       diff = time_to_us(time_sub(t1, last_t2));
9660 +                       /* This shouldn't happen */
9661 +                       if (diff < 0) {
9662 +                               pr_err(BANNER "time running backwards\n");
9663 +                               goto out;
9664 +                       }
9665 +                       if (diff > outer_sample)
9666 +                               outer_sample = diff;
9667 +               }
9668 +               last_t2 = t2;
9670 +               total = time_to_us(time_sub(t2, start)); /* sample width */
9672 +               /* This checks the inner loop (t1 to t2) */
9673 +               diff = time_to_us(time_sub(t2, t1));     /* current diff */
9675 +               /* This shouldn't happen */
9676 +               if (diff < 0) {
9677 +                       pr_err(BANNER "time running backwards\n");
9678 +                       goto out;
9679 +               }
9681 +               if (diff > sample)
9682 +                       sample = diff; /* only want highest value */
9684 +       } while (total <= data.sample_width);
9686 +       ret = 0;
9688 +       /* If we exceed the threshold value, we have found a hardware latency */
9689 +       if (sample > data.threshold || outer_sample > data.threshold) {
9690 +               struct sample s;
9692 +               ret = 1;
9694 +               data.count++;
9695 +               s.seqnum = data.count;
9696 +               s.duration = sample;
9697 +               s.outer_duration = outer_sample;
9698 +               s.timestamp = CURRENT_TIME;
9699 +               __buffer_add_sample(&s);
9701 +               /* Keep a running maximum ever recorded hardware latency */
9702 +               if (sample > data.max_sample)
9703 +                       data.max_sample = sample;
9704 +       }
9706 +out:
9707 +       return ret;
9711 + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
9712 + * @unused: A required part of the kthread API.
9713 + *
9714 + * Used to periodically sample the CPU TSC via a call to get_sample. We
9715 + * disable interrupts, which does (intentionally) introduce latency since we
9716 + * need to ensure nothing else might be running (and thus pre-empting).
9717 + * Obviously this should never be used in production environments.
9718 + *
9719 + * Currently this runs on whichever CPU it was scheduled on; most
9720 + * real-world hardware latency situations occur across several CPUs
9721 + * anyway, but we might later generalize this if we find there are any
9722 + * systems with alternate SMI delivery or other hardware latencies.
9723 + */
9724 +static int kthread_fn(void *unused)
9726 +       int ret;
9727 +       u64 interval;
9729 +       while (!kthread_should_stop()) {
9731 +               mutex_lock(&data.lock);
9733 +               local_irq_disable();
9734 +               ret = get_sample();
9735 +               local_irq_enable();
9737 +               if (ret > 0)
9738 +                       wake_up(&data.wq); /* wake up reader(s) */
9740 +               interval = data.sample_window - data.sample_width;
9741 +               do_div(interval, USEC_PER_MSEC); /* modifies interval value */
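+               /* With the defaults this is (1000000 - 500000) / 1000 = 500,
+                * i.e. the thread sleeps for 500ms of every 1s window. */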
9743 +               mutex_unlock(&data.lock);
9745 +               if (msleep_interruptible(interval))
9746 +                       break;
9747 +       }
9749 +       return 0;
9752 +/**
9753 + * start_kthread - Kick off the hardware latency sampling/detector kthread
9754 + *
9755 + * This starts a kernel thread that will sit and sample the CPU timestamp
9756 + * counter (TSC or similar) and look for potential hardware latencies.
9757 + */
9758 +static int start_kthread(void)
9760 +       kthread = kthread_run(kthread_fn, NULL,
9761 +                                       DRVNAME);
9762 +       if (IS_ERR(kthread)) {
9763 +               pr_err(BANNER "could not start sampling thread\n");
9764 +               enabled = 0;
9765 +               return -ENOMEM;
9766 +       }
9768 +       return 0;
9771 +/**
9772 + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
9773 + *
9774 + * This kicks the running hardware latency sampling/detector kernel thread and
9775 + * tells it to stop sampling now. Use this on unload and at system shutdown.
9776 + */
9777 +static int stop_kthread(void)
9779 +       int ret;
9781 +       ret = kthread_stop(kthread);
9783 +       return ret;
9786 +/**
9787 + * __reset_stats - Reset statistics for the hardware latency detector
9788 + *
9789 + * We use data to store various statistics and global state. We call this
9790 + * function in order to reset those when "enable" is toggled on or off, and
9791 + * also at initialization. Should be called with data.lock held.
9792 + */
9793 +static void __reset_stats(void)
9795 +       data.count = 0;
9796 +       data.max_sample = 0;
9797 +       ring_buffer_reset(ring_buffer); /* flush out old sample entries */
9800 +/**
9801 + * init_stats - Setup global state statistics for the hardware latency detector
9802 + *
9803 + * We use data to store various statistics and global state. We also use
9804 + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware
9805 + * induced system latencies. This function initializes these structures and
9806 + * allocates the global ring buffer also.
9807 + */
9808 +static int init_stats(void)
9810 +       int ret = -ENOMEM;
9812 +       mutex_init(&data.lock);
9813 +       init_waitqueue_head(&data.wq);
9814 +       atomic_set(&data.sample_open, 0);
9816 +       ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS);
9818 +       if (WARN(!ring_buffer, KERN_ERR BANNER
9819 +                              "failed to allocate ring buffer!\n"))
9820 +               goto out;
9822 +       __reset_stats();
9823 +       data.threshold = threshold ?: DEFAULT_LAT_THRESHOLD; /* threshold us */
9824 +       data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */
9825 +       data.sample_width = DEFAULT_SAMPLE_WIDTH;   /* width us */
9827 +       ret = 0;
9829 +out:
9830 +       return ret;
9835 + * simple_data_read - Wrapper read function for global state debugfs entries
9836 + * @filp: The active open file structure for the debugfs "file"
9837 + * @ubuf: The userspace provided buffer to read value into
9838 + * @cnt: The maximum number of bytes to read
9839 + * @ppos: The current "file" position
9840 + * @entry: The entry to read from
9841 + *
9842 + * This function provides a generic read implementation for the global state
9843 + * "data" structure debugfs filesystem entries. It would be nice to use
9844 + * simple_attr_read directly, but we need to make sure that the data.lock
9845 + * is held during the actual read.
9846 + */
9847 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9848 +                               size_t cnt, loff_t *ppos, const u64 *entry)
9850 +       char buf[U64STR_SIZE];
9851 +       u64 val = 0;
9852 +       int len = 0;
9854 +       memset(buf, 0, sizeof(buf));
9856 +       if (!entry)
9857 +               return -EFAULT;
9859 +       mutex_lock(&data.lock);
9860 +       val = *entry;
9861 +       mutex_unlock(&data.lock);
9863 +       len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val);
9865 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
9870 + * simple_data_write - Wrapper write function for global state debugfs entries
9871 + * @filp: The active open file structure for the debugfs "file"
9872 + * @ubuf: The userspace provided buffer to write value from
9873 + * @cnt: The maximum number of bytes to write
9874 + * @ppos: The current "file" position
9875 + * @entry: The entry to write to
9876 + *
9877 + * This function provides a generic write implementation for the global state
9878 + * "data" structure debugfs filesystem entries. It would be nice to use
9879 + * simple_attr_write directly, but we need to make sure that the data.lock
9880 + * is held during the actual write.
9881 + */
9882 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9883 +                                size_t cnt, loff_t *ppos, u64 *entry)
9885 +       char buf[U64STR_SIZE];
9886 +       int csize = min(cnt, sizeof(buf));
9887 +       u64 val = 0;
9888 +       int err = 0;
9890 +       memset(buf, '\0', sizeof(buf));
9891 +       if (copy_from_user(buf, ubuf, csize))
9892 +               return -EFAULT;
9894 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
9895 +       err = kstrtoull(buf, 10, &val);
9896 +       if (err)
9897 +               return -EINVAL;
9899 +       mutex_lock(&data.lock);
9900 +       *entry = val;
9901 +       mutex_unlock(&data.lock);
9903 +       return csize;
9906 +/**
9907 + * debug_count_fopen - Open function for "count" debugfs entry
9908 + * @inode: The in-kernel inode representation of the debugfs "file"
9909 + * @filp: The active open file structure for the debugfs "file"
9910 + *
9911 + * This function provides an open implementation for the "count" debugfs
9912 + * interface to the hardware latency detector.
9913 + */
9914 +static int debug_count_fopen(struct inode *inode, struct file *filp)
9916 +       return 0;
9919 +/**
9920 + * debug_count_fread - Read function for "count" debugfs entry
9921 + * @filp: The active open file structure for the debugfs "file"
9922 + * @ubuf: The userspace provided buffer to read value into
9923 + * @cnt: The maximum number of bytes to read
9924 + * @ppos: The current "file" position
9925 + *
9926 + * This function provides a read implementation for the "count" debugfs
9927 + * interface to the hardware latency detector. Can be used to read the
9928 + * number of latency readings exceeding the configured threshold since
9929 + * the detector was last reset (e.g. by writing a zero into "count").
9930 + */
9931 +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf,
9932 +                                    size_t cnt, loff_t *ppos)
9934 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.count);
9937 +/**
9938 + * debug_count_fwrite - Write function for "count" debugfs entry
9939 + * @filp: The active open file structure for the debugfs "file"
9940 + * @ubuf: The user buffer that contains the value to write
9941 + * @cnt: The maximum number of bytes to write to "file"
9942 + * @ppos: The current position in the debugfs "file"
9943 + *
9944 + * This function provides a write implementation for the "count" debugfs
9945 + * interface to the hardware latency detector. Can be used to write a
9946 + * desired value, especially to zero the total count.
9947 + */
9948 +static ssize_t  debug_count_fwrite(struct file *filp,
9949 +                                      const char __user *ubuf,
9950 +                                      size_t cnt,
9951 +                                      loff_t *ppos)
9953 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.count);
9956 +/**
9957 + * debug_enable_fopen - Dummy open function for "enable" debugfs interface
9958 + * @inode: The in-kernel inode representation of the debugfs "file"
9959 + * @filp: The active open file structure for the debugfs "file"
9960 + *
9961 + * This function provides an open implementation for the "enable" debugfs
9962 + * interface to the hardware latency detector.
9963 + */
9964 +static int debug_enable_fopen(struct inode *inode, struct file *filp)
9966 +       return 0;
9969 +/**
9970 + * debug_enable_fread - Read function for "enable" debugfs interface
9971 + * @filp: The active open file structure for the debugfs "file"
9972 + * @ubuf: The userspace provided buffer to read value into
9973 + * @cnt: The maximum number of bytes to read
9974 + * @ppos: The current "file" position
9975 + *
9976 + * This function provides a read implementation for the "enable" debugfs
9977 + * interface to the hardware latency detector. Can be used to determine
9978 + * whether the detector is currently enabled ("0\n" or "1\n" returned).
9979 + */
9980 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9981 +                                     size_t cnt, loff_t *ppos)
9983 +       char buf[4];
9985 +       if ((cnt < sizeof(buf)) || (*ppos))
9986 +               return 0;
9988 +       buf[0] = enabled ? '1' : '0';
9989 +       buf[1] = '\n';
9990 +       buf[2] = '\0';
9991 +       if (copy_to_user(ubuf, buf, strlen(buf)))
9992 +               return -EFAULT;
9993 +       return *ppos = strlen(buf);
9996 +/**
9997 + * debug_enable_fwrite - Write function for "enable" debugfs interface
9998 + * @filp: The active open file structure for the debugfs "file"
9999 + * @ubuf: The user buffer that contains the value to write
10000 + * @cnt: The maximum number of bytes to write to "file"
10001 + * @ppos: The current position in the debugfs "file"
10002 + *
10003 + * This function provides a write implementation for the "enable" debugfs
10004 + * interface to the hardware latency detector. Can be used to enable or
10005 + * disable the detector, which will have the side-effect of possibly
10006 + * also resetting the global stats and kicking off the measuring
10007 + * kthread (on an enable) or the converse (upon a disable).
10008 + */
10009 +static ssize_t  debug_enable_fwrite(struct file *filp,
10010 +                                       const char __user *ubuf,
10011 +                                       size_t cnt,
10012 +                                       loff_t *ppos)
10014 +       char buf[4];
10015 +       int csize = min(cnt, sizeof(buf));
10016 +       long val = 0;
10017 +       int err = 0;
10019 +       memset(buf, '\0', sizeof(buf));
10020 +       if (copy_from_user(buf, ubuf, csize))
10021 +               return -EFAULT;
10023 +       buf[sizeof(buf)-1] = '\0';                      /* just in case */
10024 +       err = kstrtoul(buf, 10, &val);
10025 +       if (err)
10026 +               return -EINVAL;
10028 +       if (val) {
10029 +               if (enabled)
10030 +                       goto unlock;
10031 +               enabled = 1;
10032 +               __reset_stats();
10033 +               if (start_kthread())
10034 +                       return -EFAULT;
10035 +       } else {
10036 +               if (!enabled)
10037 +                       goto unlock;
10038 +               enabled = 0;
10039 +               err = stop_kthread();
10040 +               if (err) {
10041 +                       pr_err(BANNER "cannot stop kthread\n");
10042 +                       return -EFAULT;
10043 +               }
10044 +               wake_up(&data.wq);              /* reader(s) should return */
10045 +       }
10046 +unlock:
10047 +       return csize;
10050 +/**
10051 + * debug_max_fopen - Open function for "max" debugfs entry
10052 + * @inode: The in-kernel inode representation of the debugfs "file"
10053 + * @filp: The active open file structure for the debugfs "file"
10054 + *
10055 + * This function provides an open implementation for the "max" debugfs
10056 + * interface to the hardware latency detector.
10057 + */
10058 +static int debug_max_fopen(struct inode *inode, struct file *filp)
10060 +       return 0;
10063 +/**
10064 + * debug_max_fread - Read function for "max" debugfs entry
10065 + * @filp: The active open file structure for the debugfs "file"
10066 + * @ubuf: The userspace provided buffer to read value into
10067 + * @cnt: The maximum number of bytes to read
10068 + * @ppos: The current "file" position
10069 + *
10070 + * This function provides a read implementation for the "max" debugfs
10071 + * interface to the hardware latency detector. Can be used to determine
10072 + * the maximum latency value observed since it was last reset.
10073 + */
10074 +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf,
10075 +                                  size_t cnt, loff_t *ppos)
10077 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample);
10080 +/**
10081 + * debug_max_fwrite - Write function for "max" debugfs entry
10082 + * @filp: The active open file structure for the debugfs "file"
10083 + * @ubuf: The user buffer that contains the value to write
10084 + * @cnt: The maximum number of bytes to write to "file"
10085 + * @ppos: The current position in the debugfs "file"
10086 + *
10087 + * This function provides a write implementation for the "max" debugfs
10088 + * interface to the hardware latency detector. Can be used to reset the
10089 + * maximum or set it to some other desired value; if subsequent
10090 + * measurements exceed this value, the maximum will be updated.
10091 + */
10092 +static ssize_t  debug_max_fwrite(struct file *filp,
10093 +                                    const char __user *ubuf,
10094 +                                    size_t cnt,
10095 +                                    loff_t *ppos)
10097 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample);
10101 +/**
10102 + * debug_sample_fopen - An open function for "sample" debugfs interface
10103 + * @inode: The in-kernel inode representation of this debugfs "file"
10104 + * @filp: The active open file structure for the debugfs "file"
10105 + *
10106 + * This function handles opening the "sample" file within the hardware
10107 + * latency detector debugfs directory interface. This file is used to read
10108 + * raw samples from the global ring_buffer and allows the user to see a
10109 + * running latency history. Can be opened blocking or non-blocking,
10110 + * which determines whether reads block waiting for new samples or not.
10111 + * Implements simple locking to prevent simultaneous use by multiple readers.
10112 + */
10113 +static int debug_sample_fopen(struct inode *inode, struct file *filp)
10115 +       if (!atomic_add_unless(&data.sample_open, 1, 1))
10116 +               return -EBUSY;
10117 +       else
10118 +               return 0;
10121 +/**
10122 + * debug_sample_fread - A read function for "sample" debugfs interface
10123 + * @filp: The active open file structure for the debugfs "file"
10124 + * @ubuf: The user buffer that will contain the samples read
10125 + * @cnt: The maximum bytes to read from the debugfs "file"
10126 + * @ppos: The current position in the debugfs "file"
10127 + *
10128 + * This function handles reading from the "sample" file within the hardware
10129 + * latency detector debugfs directory interface. This file is used to read
10130 + * raw samples from the global ring_buffer and allows the user to see a
10131 + * running latency history. By default this will block pending a new
10132 + * value written into the sample buffer, unless there are already one
10133 + * or more values waiting in the buffer, or the sample file was
10134 + * previously opened in a non-blocking mode of operation.
10135 + */
10136 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
10137 +                                       size_t cnt, loff_t *ppos)
10139 +       int len = 0;
10140 +       char buf[64];
10141 +       struct sample *sample = NULL;
10143 +       if (!enabled)
10144 +               return 0;
10146 +       sample = kzalloc(sizeof(struct sample), GFP_KERNEL);
10147 +       if (!sample)
10148 +               return -ENOMEM;
10150 +       while (!buffer_get_sample(sample)) {
10152 +               DEFINE_WAIT(wait);
10154 +               if (filp->f_flags & O_NONBLOCK) {
10155 +                       len = -EAGAIN;
10156 +                       goto out;
10157 +               }
10159 +               prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE);
10160 +               schedule();
10161 +               finish_wait(&data.wq, &wait);
10163 +               if (signal_pending(current)) {
10164 +                       len = -EINTR;
10165 +                       goto out;
10166 +               }
10168 +               if (!enabled) {                 /* enable was toggled */
10169 +                       len = 0;
10170 +                       goto out;
10171 +               }
10172 +       }
10174 +       len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n",
10175 +                      sample->timestamp.tv_sec,
10176 +                      sample->timestamp.tv_nsec,
10177 +                      sample->duration,
10178 +                      sample->outer_duration);
10181 +       /* handling partial reads is more trouble than it's worth */
10182 +       if (len > cnt)
10183 +               goto out;
10185 +       if (copy_to_user(ubuf, buf, len))
10186 +               len = -EFAULT;
10188 +out:
10189 +       kfree(sample);
10190 +       return len;
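
To show what a consumer of this read handler sees (not part of the patch): each read returns one line in the format written by the snprintf() above, i.e. <sec>.<nsec><TAB><duration><TAB><outer_duration>, with the durations in usecs. A minimal userspace sketch, assuming the same /sys/kernel/debug/hwlat_detector path as above:

       /* hwlat_sample.c - illustrative sketch only, not part of this patch */
       #include <fcntl.h>
       #include <stdio.h>
       #include <unistd.h>

       int main(void)
       {
               const char *path = "/sys/kernel/debug/hwlat_detector/sample";
               char buf[64];
               unsigned long sec, nsec;
               unsigned long long inner, outer;
               ssize_t n;
               int fd = open(path, O_RDONLY);  /* no O_NONBLOCK: block until a sample arrives */

               if (fd < 0) {
                       perror("open");
                       return 1;
               }
               n = read(fd, buf, sizeof(buf) - 1);
               if (n <= 0) {
                       perror("read");
                       close(fd);
                       return 1;
               }
               buf[n] = '\0';
               if (sscanf(buf, "%lu.%lu\t%llu\t%llu", &sec, &nsec, &inner, &outer) == 4)
                       printf("ts=%lu.%09lu inner=%llu us outer=%llu us\n",
                              sec, nsec, inner, outer);
               close(fd);
               return 0;
       }
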
10193 +/**
10194 + * debug_sample_release - Release function for "sample" debugfs interface
10195 + * @inode: The in-kernel inode representation of the debugfs "file"
10196 + * @filp: The active open file structure for the debugfs "file"
10197 + *
10198 + * This function completes the close of the debugfs interface "sample" file.
10199 + * Frees the sample_open "lock" so that other users may open the interface.
10200 + */
10201 +static int debug_sample_release(struct inode *inode, struct file *filp)
10203 +       atomic_dec(&data.sample_open);
10205 +       return 0;
10208 +/**
10209 + * debug_threshold_fopen - Open function for "threshold" debugfs entry
10210 + * @inode: The in-kernel inode representation of the debugfs "file"
10211 + * @filp: The active open file structure for the debugfs "file"
10212 + *
10213 + * This function provides an open implementation for the "threshold" debugfs
10214 + * interface to the hardware latency detector.
10215 + */
10216 +static int debug_threshold_fopen(struct inode *inode, struct file *filp)
10218 +       return 0;
10221 +/**
10222 + * debug_threshold_fread - Read function for "threshold" debugfs entry
10223 + * @filp: The active open file structure for the debugfs "file"
10224 + * @ubuf: The userspace provided buffer to read value into
10225 + * @cnt: The maximum number of bytes to read
10226 + * @ppos: The current "file" position
10227 + *
10228 + * This function provides a read implementation for the "threshold" debugfs
10229 + * interface to the hardware latency detector. It can be used to determine
10230 + * the current threshold level at which a latency will be recorded in the
10231 + * global ring buffer, typically on the order of 10us.
10232 + */
10233 +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf,
10234 +                                        size_t cnt, loff_t *ppos)
10236 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold);
10239 +/**
10240 + * debug_threshold_fwrite - Write function for "threshold" debugfs entry
10241 + * @filp: The active open file structure for the debugfs "file"
10242 + * @ubuf: The user buffer that contains the value to write
10243 + * @cnt: The maximum number of bytes to write to "file"
10244 + * @ppos: The current position in the debugfs "file"
10245 + *
10246 + * This function provides a write implementation for the "threshold" debugfs
10247 + * interface to the hardware latency detector. It can be used to configure
10248 + * the threshold level at which any subsequently detected latencies will
10249 + * be recorded into the global ring buffer.
10250 + */
10251 +static ssize_t  debug_threshold_fwrite(struct file *filp,
10252 +                                       const char __user *ubuf,
10253 +                                       size_t cnt,
10254 +                                       loff_t *ppos)
10256 +       int ret;
10258 +       ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold);
10260 +       if (enabled)
10261 +               wake_up_process(kthread);
10263 +       return ret;
10266 +/**
10267 + * debug_width_fopen - Open function for "width" debugfs entry
10268 + * @inode: The in-kernel inode representation of the debugfs "file"
10269 + * @filp: The active open file structure for the debugfs "file"
10270 + *
10271 + * This function provides an open implementation for the "width" debugfs
10272 + * interface to the hardware latency detector.
10273 + */
10274 +static int debug_width_fopen(struct inode *inode, struct file *filp)
10276 +       return 0;
10279 +/**
10280 + * debug_width_fread - Read function for "width" debugfs entry
10281 + * @filp: The active open file structure for the debugfs "file"
10282 + * @ubuf: The userspace provided buffer to read value into
10283 + * @cnt: The maximum number of bytes to read
10284 + * @ppos: The current "file" position
10285 + *
10286 + * This function provides a read implementation for the "width" debugfs
10287 + * interface to the hardware latency detector. It can be used to determine
10288 + * for how many us of the total window we will actively sample for any
10289 + * hardware-induced latency periods. Obviously, it is not possible to
10290 + * sample constantly and still have the system respond to a sample reader,
10291 + * or, worse, without having the system appear to have gone out to lunch.
10292 + */
10293 +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf,
10294 +                                    size_t cnt, loff_t *ppos)
10296 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width);
10299 +/**
10300 + * debug_width_fwrite - Write function for "width" debugfs entry
10301 + * @filp: The active open file structure for the debugfs "file"
10302 + * @ubuf: The user buffer that contains the value to write
10303 + * @cnt: The maximum number of bytes to write to "file"
10304 + * @ppos: The current position in the debugfs "file"
10305 + *
10306 + * This function provides a write implementation for the "width" debugfs
10307 + * interface to the hardware latency detector. It can be used to configure
10308 + * for how many us of the total window we will actively sample for any
10309 + * hardware-induced latency periods. Obviously, it is not possible to
10310 + * sample constantly and still have the system respond to a sample reader,
10311 + * or, worse, without having the system appear to have gone out to lunch. It
10312 + * is enforced that the width is less than the total window size.
10313 + */
10314 +static ssize_t  debug_width_fwrite(struct file *filp,
10315 +                                      const char __user *ubuf,
10316 +                                      size_t cnt,
10317 +                                      loff_t *ppos)
10319 +       char buf[U64STR_SIZE];
10320 +       int csize = min(cnt, sizeof(buf));
10321 +       u64 val = 0;
10322 +       int err = 0;
10324 +       memset(buf, '\0', sizeof(buf));
10325 +       if (copy_from_user(buf, ubuf, csize))
10326 +               return -EFAULT;
10328 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10329 +       err = kstrtoull(buf, 10, &val);
10330 +       if (err)
10331 +               return -EINVAL;
10333 +       mutex_lock(&data.lock);
10334 +       if (val < data.sample_window)
10335 +               data.sample_width = val;
10336 +       else {
10337 +               mutex_unlock(&data.lock);
10338 +               return -EINVAL;
10339 +       }
10340 +       mutex_unlock(&data.lock);
10342 +       if (enabled)
10343 +               wake_up_process(kthread);
10345 +       return csize;
10348 +/**
10349 + * debug_window_fopen - Open function for "window" debugfs entry
10350 + * @inode: The in-kernel inode representation of the debugfs "file"
10351 + * @filp: The active open file structure for the debugfs "file"
10352 + *
10353 + * This function provides an open implementation for the "window" debugfs
10354 + * interface to the hardware latency detector. The window is the total time
10355 + * in us that will be considered one sample period. Conceptually, windows
10356 + * occur back-to-back and contain a sample width period during which
10357 + * actual sampling occurs.
10358 + */
10359 +static int debug_window_fopen(struct inode *inode, struct file *filp)
10361 +       return 0;
10364 +/**
10365 + * debug_window_fread - Read function for "window" debugfs entry
10366 + * @filp: The active open file structure for the debugfs "file"
10367 + * @ubuf: The userspace provided buffer to read value into
10368 + * @cnt: The maximum number of bytes to read
10369 + * @ppos: The current "file" position
10370 + *
10371 + * This function provides a read implementation for the "window" debugfs
10372 + * interface to the hardware latency detector. The window is the total time
10373 + * in us that will be considered one sample period. Conceptually, windows
10374 + * occur back-to-back and contain a sample width period during which
10375 + * actual sampling occurs. Can be used to read the total window size.
10376 + */
10377 +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf,
10378 +                                     size_t cnt, loff_t *ppos)
10380 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window);
10383 +/**
10384 + * debug_window_fwrite - Write function for "window" debugfs entry
10385 + * @filp: The active open file structure for the debugfs "file"
10386 + * @ubuf: The user buffer that contains the value to write
10387 + * @cnt: The maximum number of bytes to write to "file"
10388 + * @ppos: The current position in the debugfs "file"
10389 + *
10390 + * This function provides a write implementation for the "window" debugfs
10391 + * interface to the hardware latency detector. The window is the total time
10392 + * in us that will be considered one sample period. Conceptually, windows
10393 + * occur back-to-back and contain a sample width period during which
10394 + * actual sampling occurs. Can be used to write a new total window size. It
10395 + * is enforced that any value written must be greater than the sample width
10396 + * size, or an error results.
10397 + */
10398 +static ssize_t  debug_window_fwrite(struct file *filp,
10399 +                                       const char __user *ubuf,
10400 +                                       size_t cnt,
10401 +                                       loff_t *ppos)
10403 +       char buf[U64STR_SIZE];
10404 +       int csize = min(cnt, sizeof(buf));
10405 +       u64 val = 0;
10406 +       int err = 0;
10408 +       memset(buf, '\0', sizeof(buf));
10409 +       if (copy_from_user(buf, ubuf, csize))
10410 +               return -EFAULT;
10412 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10413 +       err = kstrtoull(buf, 10, &val);
10414 +       if (err)
10415 +               return -EINVAL;
10417 +       mutex_lock(&data.lock);
10418 +       if (data.sample_width < val)
10419 +               data.sample_window = val;
10420 +       else {
10421 +               mutex_unlock(&data.lock);
10422 +               return -EINVAL;
10423 +       }
10424 +       mutex_unlock(&data.lock);
10426 +       return csize;
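
Because the two write handlers above enforce width < window independently, a userspace sketch that resizes both should order the writes so the constraint holds at every step: grow the window before the width, and shrink the width before the window. A minimal illustration (not part of the patch, same assumed debugfs path as above):

       /* hwlat_resize.c - illustrative sketch only, not part of this patch */
       #include <fcntl.h>
       #include <stdio.h>
       #include <string.h>
       #include <unistd.h>

       #define HWLAT_DIR "/sys/kernel/debug/hwlat_detector/"   /* assumed mount point */

       static int write_val(const char *file, const char *val)
       {
               char path[128];
               int fd, ret = 0;

               snprintf(path, sizeof(path), HWLAT_DIR "%s", file);
               fd = open(path, O_WRONLY);
               if (fd < 0)
                       return -1;
               if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
                       ret = -1;
               close(fd);
               return ret;
       }

       int main(void)
       {
               /* enlarge the window first so that width < window at every step */
               if (write_val("window", "2000000"))     /* 2 s window, in usecs */
                       perror("window");
               if (write_val("width", "1000000"))      /* sample 1 s of each window */
                       perror("width");
               return 0;
       }
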
10430 + * Function pointers for the "count" debugfs file operations
10431 + */
10432 +static const struct file_operations count_fops = {
10433 +       .open           = debug_count_fopen,
10434 +       .read           = debug_count_fread,
10435 +       .write          = debug_count_fwrite,
10436 +       .owner          = THIS_MODULE,
10440 + * Function pointers for the "enable" debugfs file operations
10441 + */
10442 +static const struct file_operations enable_fops = {
10443 +       .open           = debug_enable_fopen,
10444 +       .read           = debug_enable_fread,
10445 +       .write          = debug_enable_fwrite,
10446 +       .owner          = THIS_MODULE,
10450 + * Function pointers for the "max" debugfs file operations
10451 + */
10452 +static const struct file_operations max_fops = {
10453 +       .open           = debug_max_fopen,
10454 +       .read           = debug_max_fread,
10455 +       .write          = debug_max_fwrite,
10456 +       .owner          = THIS_MODULE,
10460 + * Function pointers for the "sample" debugfs file operations
10461 + */
10462 +static const struct file_operations sample_fops = {
10463 +       .open           = debug_sample_fopen,
10464 +       .read           = debug_sample_fread,
10465 +       .release        = debug_sample_release,
10466 +       .owner          = THIS_MODULE,
10470 + * Function pointers for the "threshold" debugfs file operations
10471 + */
10472 +static const struct file_operations threshold_fops = {
10473 +       .open           = debug_threshold_fopen,
10474 +       .read           = debug_threshold_fread,
10475 +       .write          = debug_threshold_fwrite,
10476 +       .owner          = THIS_MODULE,
10480 + * Function pointers for the "width" debugfs file operations
10481 + */
10482 +static const struct file_operations width_fops = {
10483 +       .open           = debug_width_fopen,
10484 +       .read           = debug_width_fread,
10485 +       .write          = debug_width_fwrite,
10486 +       .owner          = THIS_MODULE,
10490 + * Function pointers for the "window" debugfs file operations
10491 + */
10492 +static const struct file_operations window_fops = {
10493 +       .open           = debug_window_fopen,
10494 +       .read           = debug_window_fread,
10495 +       .write          = debug_window_fwrite,
10496 +       .owner          = THIS_MODULE,
10499 +/**
10500 + * init_debugfs - A function to initialize the debugfs interface files
10501 + *
10502 + * This function creates entries in debugfs for "hwlat_detector", including
10503 + * files to read values from the detector, current samples, and the
10504 + * maximum sample that has been captured since the hardware latency
10505 + * detector was started.
10506 + */
10507 +static int init_debugfs(void)
10509 +       int ret = -ENOMEM;
10511 +       debug_dir = debugfs_create_dir(DRVNAME, NULL);
10512 +       if (!debug_dir)
10513 +               goto err_debug_dir;
10515 +       debug_sample = debugfs_create_file("sample", 0444,
10516 +                                              debug_dir, NULL,
10517 +                                              &sample_fops);
10518 +       if (!debug_sample)
10519 +               goto err_sample;
10521 +       debug_count = debugfs_create_file("count", 0444,
10522 +                                             debug_dir, NULL,
10523 +                                             &count_fops);
10524 +       if (!debug_count)
10525 +               goto err_count;
10527 +       debug_max = debugfs_create_file("max", 0444,
10528 +                                           debug_dir, NULL,
10529 +                                           &max_fops);
10530 +       if (!debug_max)
10531 +               goto err_max;
10533 +       debug_sample_window = debugfs_create_file("window", 0644,
10534 +                                                     debug_dir, NULL,
10535 +                                                     &window_fops);
10536 +       if (!debug_sample_window)
10537 +               goto err_window;
10539 +       debug_sample_width = debugfs_create_file("width", 0644,
10540 +                                                    debug_dir, NULL,
10541 +                                                    &width_fops);
10542 +       if (!debug_sample_width)
10543 +               goto err_width;
10545 +       debug_threshold = debugfs_create_file("threshold", 0644,
10546 +                                                 debug_dir, NULL,
10547 +                                                 &threshold_fops);
10548 +       if (!debug_threshold)
10549 +               goto err_threshold;
10551 +       debug_enable = debugfs_create_file("enable", 0644,
10552 +                                              debug_dir, &enabled,
10553 +                                              &enable_fops);
10554 +       if (!debug_enable)
10555 +               goto err_enable;
10557 +       else {
10558 +               ret = 0;
10559 +               goto out;
10560 +       }
10562 +err_enable:
10563 +       debugfs_remove(debug_threshold);
10564 +err_threshold:
10565 +       debugfs_remove(debug_sample_width);
10566 +err_width:
10567 +       debugfs_remove(debug_sample_window);
10568 +err_window:
10569 +       debugfs_remove(debug_max);
10570 +err_max:
10571 +       debugfs_remove(debug_count);
10572 +err_count:
10573 +       debugfs_remove(debug_sample);
10574 +err_sample:
10575 +       debugfs_remove(debug_dir);
10576 +err_debug_dir:
10577 +out:
10578 +       return ret;
10581 +/**
10582 + * free_debugfs - A function to clean up the debugfs file interface
10583 + */
10584 +static void free_debugfs(void)
10586 +       /* could also use a debugfs_remove_recursive */
10587 +       debugfs_remove(debug_enable);
10588 +       debugfs_remove(debug_threshold);
10589 +       debugfs_remove(debug_sample_width);
10590 +       debugfs_remove(debug_sample_window);
10591 +       debugfs_remove(debug_max);
10592 +       debugfs_remove(debug_count);
10593 +       debugfs_remove(debug_sample);
10594 +       debugfs_remove(debug_dir);
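
For comparison, the alternative hinted at in the comment at the top of free_debugfs() would collapse the teardown into one call; a minimal sketch of that variant (not part of the patch, and it assumes nothing else holds the individual file dentries):

       static void free_debugfs(void)
       {
               /* removes debug_dir together with every file created under it */
               debugfs_remove_recursive(debug_dir);
       }
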
10597 +/**
10598 + * detector_init - Standard module initialization code
10599 + */
10600 +static int detector_init(void)
10602 +       int ret = -ENOMEM;
10604 +       pr_info(BANNER "version %s\n", VERSION);
10606 +       ret = init_stats();
10607 +       if (ret)
10608 +               goto out;
10610 +       ret = init_debugfs();
10611 +       if (ret)
10612 +               goto err_stats;
10614 +       if (enabled)
10615 +               ret = start_kthread();
10617 +       goto out;
10619 +err_stats:
10620 +       ring_buffer_free(ring_buffer);
10621 +out:
10622 +       return ret;
10626 +/**
10627 + * detector_exit - Standard module cleanup code
10628 + */
10629 +static void detector_exit(void)
10631 +       int err;
10633 +       if (enabled) {
10634 +               enabled = 0;
10635 +               err = stop_kthread();
10636 +               if (err)
10637 +                       pr_err(BANNER "cannot stop kthread\n");
10638 +       }
10640 +       free_debugfs();
10641 +       ring_buffer_free(ring_buffer);  /* free up the ring buffer */
10645 +module_init(detector_init);
10646 +module_exit(detector_exit);
10647 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
10648 index acece3299756..58ea04a03fa9 100644
10649 --- a/drivers/mmc/host/mmci.c
10650 +++ b/drivers/mmc/host/mmci.c
10651 @@ -1155,15 +1155,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10652         struct sg_mapping_iter *sg_miter = &host->sg_miter;
10653         struct variant_data *variant = host->variant;
10654         void __iomem *base = host->base;
10655 -       unsigned long flags;
10656         u32 status;
10658         status = readl(base + MMCISTATUS);
10660         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
10662 -       local_irq_save(flags);
10664         do {
10665                 unsigned int remain, len;
10666                 char *buffer;
10667 @@ -1203,8 +1200,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10669         sg_miter_stop(sg_miter);
10671 -       local_irq_restore(flags);
10673         /*
10674          * If we have less than the fifo 'half-full' threshold to transfer,
10675          * trigger a PIO interrupt as soon as any data is available.
10676 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
10677 index 2839af00f20c..4348b9c850d3 100644
10678 --- a/drivers/net/ethernet/3com/3c59x.c
10679 +++ b/drivers/net/ethernet/3com/3c59x.c
10680 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
10682         struct vortex_private *vp = netdev_priv(dev);
10683         unsigned long flags;
10684 -       local_irq_save(flags);
10685 +       local_irq_save_nort(flags);
10686         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
10687 -       local_irq_restore(flags);
10688 +       local_irq_restore_nort(flags);
10690  #endif
10692 @@ -1916,12 +1916,12 @@ static void vortex_tx_timeout(struct net_device *dev)
10693                          * Block interrupts because vortex_interrupt does a bare spin_lock()
10694                          */
10695                         unsigned long flags;
10696 -                       local_irq_save(flags);
10697 +                       local_irq_save_nort(flags);
10698                         if (vp->full_bus_master_tx)
10699                                 boomerang_interrupt(dev->irq, dev);
10700                         else
10701                                 vortex_interrupt(dev->irq, dev);
10702 -                       local_irq_restore(flags);
10703 +                       local_irq_restore_nort(flags);
10704                 }
10705         }
10707 diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10708 index 8b5988e210d5..cf9928ccdd7e 100644
10709 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10710 +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10711 @@ -2221,11 +2221,7 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb,
10712         }
10714         tpd_req = atl1c_cal_tpd_req(skb);
10715 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) {
10716 -               if (netif_msg_pktdata(adapter))
10717 -                       dev_info(&adapter->pdev->dev, "tx locked\n");
10718 -               return NETDEV_TX_LOCKED;
10719 -       }
10720 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10722         if (atl1c_tpd_avail(adapter, type) < tpd_req) {
10723                 /* no enough descriptor, just stop queue */
10724 diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10725 index 59a03a193e83..734f7a7ad2c3 100644
10726 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10727 +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10728 @@ -1880,8 +1880,7 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb,
10729                 return NETDEV_TX_OK;
10730         }
10731         tpd_req = atl1e_cal_tdp_req(skb);
10732 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags))
10733 -               return NETDEV_TX_LOCKED;
10734 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10736         if (atl1e_tpd_avail(adapter) < tpd_req) {
10737                 /* no enough descriptor, just stop queue */
10738 diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c
10739 index 526ea74e82d9..86f467a2c485 100644
10740 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c
10741 +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c
10742 @@ -1664,8 +1664,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter,
10743         struct cmdQ *q = &sge->cmdQ[qid];
10744         unsigned int credits, pidx, genbit, count, use_sched_skb = 0;
10746 -       if (!spin_trylock(&q->lock))
10747 -               return NETDEV_TX_LOCKED;
10748 +       spin_lock(&q->lock);
10750         reclaim_completed_tx(sge, q);
10752 diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
10753 index 9ba975853ec6..813cfa698160 100644
10754 --- a/drivers/net/ethernet/neterion/s2io.c
10755 +++ b/drivers/net/ethernet/neterion/s2io.c
10756 @@ -4084,12 +4084,7 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev)
10757                         [skb->priority & (MAX_TX_FIFOS - 1)];
10758         fifo = &mac_control->fifos[queue];
10760 -       if (do_spin_lock)
10761 -               spin_lock_irqsave(&fifo->tx_lock, flags);
10762 -       else {
10763 -               if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags)))
10764 -                       return NETDEV_TX_LOCKED;
10765 -       }
10766 +       spin_lock_irqsave(&fifo->tx_lock, flags);
10768         if (sp->config.multiq) {
10769                 if (__netif_subqueue_stopped(dev, fifo->fifo_no)) {
10770 diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10771 index 3b98b263bad0..ca4add749410 100644
10772 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10773 +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10774 @@ -2137,10 +2137,8 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
10775         struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring;
10776         unsigned long flags;
10778 -       if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) {
10779 -               /* Collision - tell upper layer to requeue */
10780 -               return NETDEV_TX_LOCKED;
10781 -       }
10782 +       spin_lock_irqsave(&tx_ring->tx_lock, flags);
10784         if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) {
10785                 netif_stop_queue(netdev);
10786                 spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
10787 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
10788 index ef668d300800..d987d571fdd6 100644
10789 --- a/drivers/net/ethernet/realtek/8139too.c
10790 +++ b/drivers/net/ethernet/realtek/8139too.c
10791 @@ -2229,7 +2229,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
10792         struct rtl8139_private *tp = netdev_priv(dev);
10793         const int irq = tp->pci_dev->irq;
10795 -       disable_irq(irq);
10796 +       disable_irq_nosync(irq);
10797         rtl8139_interrupt(irq, dev);
10798         enable_irq(irq);
10800 diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c
10801 index 14c9d1baa85c..e1a5305418a8 100644
10802 --- a/drivers/net/ethernet/tehuti/tehuti.c
10803 +++ b/drivers/net/ethernet/tehuti/tehuti.c
10804 @@ -1629,13 +1629,8 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb,
10805         unsigned long flags;
10807         ENTER;
10808 -       local_irq_save(flags);
10809 -       if (!spin_trylock(&priv->tx_lock)) {
10810 -               local_irq_restore(flags);
10811 -               DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n",
10812 -                   BDX_DRV_NAME, ndev->name);
10813 -               return NETDEV_TX_LOCKED;
10814 -       }
10816 +       spin_lock_irqsave(&priv->tx_lock, flags);
10818         /* build tx descriptor */
10819         BDX_ASSERT(f->m.wptr >= f->m.memsz);    /* started with valid wptr */
10820 diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
10821 index e7034c55e796..2e4ee0f912bf 100644
10822 --- a/drivers/net/rionet.c
10823 +++ b/drivers/net/rionet.c
10824 @@ -174,11 +174,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
10825         unsigned long flags;
10826         int add_num = 1;
10828 -       local_irq_save(flags);
10829 -       if (!spin_trylock(&rnet->tx_lock)) {
10830 -               local_irq_restore(flags);
10831 -               return NETDEV_TX_LOCKED;
10832 -       }
10833 +       spin_lock_irqsave(&rnet->tx_lock, flags);
10835         if (is_multicast_ether_addr(eth->h_dest))
10836                 add_num = nets[rnet->mport->id].nact;
10837 diff --git a/drivers/net/wireless/orinoco/orinoco_usb.c b/drivers/net/wireless/orinoco/orinoco_usb.c
10838 index f2cd513d54b2..6c0f4c9638a2 100644
10839 --- a/drivers/net/wireless/orinoco/orinoco_usb.c
10840 +++ b/drivers/net/wireless/orinoco/orinoco_usb.c
10841 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
10842                         while (!ctx->done.done && msecs--)
10843                                 udelay(1000);
10844                 } else {
10845 -                       wait_event_interruptible(ctx->done.wait,
10846 +                       swait_event_interruptible(ctx->done.wait,
10847                                                  ctx->done.done);
10848                 }
10849                 break;
10850 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
10851 index 59ac36fe7c42..7a45a20af78a 100644
10852 --- a/drivers/pci/access.c
10853 +++ b/drivers/pci/access.c
10854 @@ -561,7 +561,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
10855         WARN_ON(!dev->block_cfg_access);
10857         dev->block_cfg_access = 0;
10858 -       wake_up_all(&pci_cfg_wait);
10859 +       wake_up_all_locked(&pci_cfg_wait);
10860         raw_spin_unlock_irqrestore(&pci_lock, flags);
10862  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
10863 diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
10864 index 9736f9be5447..5fe9b173dcb3 100644
10865 --- a/drivers/pinctrl/qcom/pinctrl-msm.c
10866 +++ b/drivers/pinctrl/qcom/pinctrl-msm.c
10867 @@ -60,7 +60,7 @@ struct msm_pinctrl {
10868         struct notifier_block restart_nb;
10869         int irq;
10871 -       spinlock_t lock;
10872 +       raw_spinlock_t lock;
10874         DECLARE_BITMAP(dual_edge_irqs, MAX_NR_GPIO);
10875         DECLARE_BITMAP(enabled_irqs, MAX_NR_GPIO);
10876 @@ -156,14 +156,14 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
10877         if (WARN_ON(i == g->nfuncs))
10878                 return -EINVAL;
10880 -       spin_lock_irqsave(&pctrl->lock, flags);
10881 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10883         val = readl(pctrl->regs + g->ctl_reg);
10884         val &= ~(0x7 << g->mux_bit);
10885         val |= i << g->mux_bit;
10886         writel(val, pctrl->regs + g->ctl_reg);
10888 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10889 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10891         return 0;
10893 @@ -326,14 +326,14 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
10894                         break;
10895                 case PIN_CONFIG_OUTPUT:
10896                         /* set output value */
10897 -                       spin_lock_irqsave(&pctrl->lock, flags);
10898 +                       raw_spin_lock_irqsave(&pctrl->lock, flags);
10899                         val = readl(pctrl->regs + g->io_reg);
10900                         if (arg)
10901                                 val |= BIT(g->out_bit);
10902                         else
10903                                 val &= ~BIT(g->out_bit);
10904                         writel(val, pctrl->regs + g->io_reg);
10905 -                       spin_unlock_irqrestore(&pctrl->lock, flags);
10906 +                       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10908                         /* enable output */
10909                         arg = 1;
10910 @@ -354,12 +354,12 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
10911                         return -EINVAL;
10912                 }
10914 -               spin_lock_irqsave(&pctrl->lock, flags);
10915 +               raw_spin_lock_irqsave(&pctrl->lock, flags);
10916                 val = readl(pctrl->regs + g->ctl_reg);
10917                 val &= ~(mask << bit);
10918                 val |= arg << bit;
10919                 writel(val, pctrl->regs + g->ctl_reg);
10920 -               spin_unlock_irqrestore(&pctrl->lock, flags);
10921 +               raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10922         }
10924         return 0;
10925 @@ -387,13 +387,13 @@ static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
10927         g = &pctrl->soc->groups[offset];
10929 -       spin_lock_irqsave(&pctrl->lock, flags);
10930 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10932         val = readl(pctrl->regs + g->ctl_reg);
10933         val &= ~BIT(g->oe_bit);
10934         writel(val, pctrl->regs + g->ctl_reg);
10936 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10937 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10939         return 0;
10941 @@ -407,7 +407,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
10943         g = &pctrl->soc->groups[offset];
10945 -       spin_lock_irqsave(&pctrl->lock, flags);
10946 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10948         val = readl(pctrl->regs + g->io_reg);
10949         if (value)
10950 @@ -420,7 +420,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
10951         val |= BIT(g->oe_bit);
10952         writel(val, pctrl->regs + g->ctl_reg);
10954 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10955 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10957         return 0;
10959 @@ -446,7 +446,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
10961         g = &pctrl->soc->groups[offset];
10963 -       spin_lock_irqsave(&pctrl->lock, flags);
10964 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10966         val = readl(pctrl->regs + g->io_reg);
10967         if (value)
10968 @@ -455,7 +455,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
10969                 val &= ~BIT(g->out_bit);
10970         writel(val, pctrl->regs + g->io_reg);
10972 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10973 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10976  #ifdef CONFIG_DEBUG_FS
10977 @@ -574,7 +574,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
10979         g = &pctrl->soc->groups[d->hwirq];
10981 -       spin_lock_irqsave(&pctrl->lock, flags);
10982 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10984         val = readl(pctrl->regs + g->intr_cfg_reg);
10985         val &= ~BIT(g->intr_enable_bit);
10986 @@ -582,7 +582,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
10988         clear_bit(d->hwirq, pctrl->enabled_irqs);
10990 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10991 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10994  static void msm_gpio_irq_unmask(struct irq_data *d)
10995 @@ -595,7 +595,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
10997         g = &pctrl->soc->groups[d->hwirq];
10999 -       spin_lock_irqsave(&pctrl->lock, flags);
11000 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
11002         val = readl(pctrl->regs + g->intr_cfg_reg);
11003         val |= BIT(g->intr_enable_bit);
11004 @@ -603,7 +603,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
11006         set_bit(d->hwirq, pctrl->enabled_irqs);
11008 -       spin_unlock_irqrestore(&pctrl->lock, flags);
11009 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
11012  static void msm_gpio_irq_ack(struct irq_data *d)
11013 @@ -616,7 +616,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
11015         g = &pctrl->soc->groups[d->hwirq];
11017 -       spin_lock_irqsave(&pctrl->lock, flags);
11018 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
11020         val = readl(pctrl->regs + g->intr_status_reg);
11021         if (g->intr_ack_high)
11022 @@ -628,7 +628,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
11023         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
11024                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
11026 -       spin_unlock_irqrestore(&pctrl->lock, flags);
11027 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
11030  static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
11031 @@ -641,7 +641,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
11033         g = &pctrl->soc->groups[d->hwirq];
11035 -       spin_lock_irqsave(&pctrl->lock, flags);
11036 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
11038         /*
11039          * For hw without possibility of detecting both edges
11040 @@ -715,7 +715,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
11041         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
11042                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
11044 -       spin_unlock_irqrestore(&pctrl->lock, flags);
11045 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
11047         if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
11048                 irq_set_handler_locked(d, handle_level_irq);
11049 @@ -731,11 +731,11 @@ static int msm_gpio_irq_set_wake(struct irq_data *d, unsigned int on)
11050         struct msm_pinctrl *pctrl = to_msm_pinctrl(gc);
11051         unsigned long flags;
11053 -       spin_lock_irqsave(&pctrl->lock, flags);
11054 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
11056         irq_set_irq_wake(pctrl->irq, on);
11058 -       spin_unlock_irqrestore(&pctrl->lock, flags);
11059 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
11061         return 0;
11063 @@ -881,7 +881,7 @@ int msm_pinctrl_probe(struct platform_device *pdev,
11064         pctrl->soc = soc_data;
11065         pctrl->chip = msm_gpio_template;
11067 -       spin_lock_init(&pctrl->lock);
11068 +       raw_spin_lock_init(&pctrl->lock);
11070         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
11071         pctrl->regs = devm_ioremap_resource(&pdev->dev, res);
11072 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
11073 index f4424063b860..cbbbebd86c6e 100644
11074 --- a/drivers/scsi/fcoe/fcoe.c
11075 +++ b/drivers/scsi/fcoe/fcoe.c
11076 @@ -1286,7 +1286,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
11077         struct sk_buff *skb;
11078  #ifdef CONFIG_SMP
11079         struct fcoe_percpu_s *p0;
11080 -       unsigned targ_cpu = get_cpu();
11081 +       unsigned targ_cpu = get_cpu_light();
11082  #endif /* CONFIG_SMP */
11084         FCOE_DBG("Destroying receive thread for CPU %d\n", cpu);
11085 @@ -1342,7 +1342,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
11086                         kfree_skb(skb);
11087                 spin_unlock_bh(&p->fcoe_rx_list.lock);
11088         }
11089 -       put_cpu();
11090 +       put_cpu_light();
11091  #else
11092         /*
11093          * This a non-SMP scenario where the singular Rx thread is
11094 @@ -1566,11 +1566,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
11095  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
11097         struct fcoe_percpu_s *fps;
11098 -       int rc;
11099 +       int rc, cpu = get_cpu_light();
11101 -       fps = &get_cpu_var(fcoe_percpu);
11102 +       fps = &per_cpu(fcoe_percpu, cpu);
11103         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
11104 -       put_cpu_var(fcoe_percpu);
11105 +       put_cpu_light();
11107         return rc;
11109 @@ -1766,11 +1766,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
11110                 return 0;
11111         }
11113 -       stats = per_cpu_ptr(lport->stats, get_cpu());
11114 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
11115         stats->InvalidCRCCount++;
11116         if (stats->InvalidCRCCount < 5)
11117                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
11118 -       put_cpu();
11119 +       put_cpu_light();
11120         return -EINVAL;
11123 @@ -1814,7 +1814,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
11124          */
11125         hp = (struct fcoe_hdr *) skb_network_header(skb);
11127 -       stats = per_cpu_ptr(lport->stats, get_cpu());
11128 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
11129         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
11130                 if (stats->ErrorFrames < 5)
11131                         printk(KERN_WARNING "fcoe: FCoE version "
11132 @@ -1846,13 +1846,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
11133                 goto drop;
11135         if (!fcoe_filter_frames(lport, fp)) {
11136 -               put_cpu();
11137 +               put_cpu_light();
11138                 fc_exch_recv(lport, fp);
11139                 return;
11140         }
11141  drop:
11142         stats->ErrorFrames++;
11143 -       put_cpu();
11144 +       put_cpu_light();
11145         kfree_skb(skb);
11148 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
11149 index 34a1b1f333b4..d91131210695 100644
11150 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
11151 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
11152 @@ -831,7 +831,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
11154         INIT_LIST_HEAD(&del_list);
11156 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
11157 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
11159         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
11160                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
11161 @@ -867,7 +867,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
11162                                 sel_time = fcf->time;
11163                 }
11164         }
11165 -       put_cpu();
11166 +       put_cpu_light();
11168         list_for_each_entry_safe(fcf, next, &del_list, list) {
11169                 /* Removes fcf from current list */
11170 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
11171 index 30f9ef0c0d4f..6c686bc01a82 100644
11172 --- a/drivers/scsi/libfc/fc_exch.c
11173 +++ b/drivers/scsi/libfc/fc_exch.c
11174 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
11175         }
11176         memset(ep, 0, sizeof(*ep));
11178 -       cpu = get_cpu();
11179 +       cpu = get_cpu_light();
11180         pool = per_cpu_ptr(mp->pool, cpu);
11181         spin_lock_bh(&pool->lock);
11182 -       put_cpu();
11183 +       put_cpu_light();
11185         /* peek cache of free slot */
11186         if (pool->left != FC_XID_UNKNOWN) {
11187 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
11188 index 6f5e2720ffad..ee8a8ed49b89 100644
11189 --- a/drivers/scsi/libsas/sas_ata.c
11190 +++ b/drivers/scsi/libsas/sas_ata.c
11191 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
11192         /* TODO: audit callers to ensure they are ready for qc_issue to
11193          * unconditionally re-enable interrupts
11194          */
11195 -       local_irq_save(flags);
11196 +       local_irq_save_nort(flags);
11197         spin_unlock(ap->lock);
11199         /* If the device fell off, no sense in issuing commands */
11200 @@ -255,7 +255,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
11202   out:
11203         spin_lock(ap->lock);
11204 -       local_irq_restore(flags);
11205 +       local_irq_restore_nort(flags);
11206         return ret;
11209 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
11210 index fee9eb7c8a60..b42d4adc42dc 100644
11211 --- a/drivers/scsi/qla2xxx/qla_inline.h
11212 +++ b/drivers/scsi/qla2xxx/qla_inline.h
11213 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
11215         unsigned long flags;
11216         struct qla_hw_data *ha = rsp->hw;
11217 -       local_irq_save(flags);
11218 +       local_irq_save_nort(flags);
11219         if (IS_P3P_TYPE(ha))
11220                 qla82xx_poll(0, rsp);
11221         else
11222                 ha->isp_ops->intr_handler(0, rsp);
11223 -       local_irq_restore(flags);
11224 +       local_irq_restore_nort(flags);
11227  static inline uint8_t *
11228 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
11229 index 7fc919f7da4d..e03fa17b8670 100644
11230 --- a/drivers/thermal/x86_pkg_temp_thermal.c
11231 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
11232 @@ -29,6 +29,7 @@
11233  #include <linux/pm.h>
11234  #include <linux/thermal.h>
11235  #include <linux/debugfs.h>
11236 +#include <linux/swork.h>
11237  #include <asm/cpu_device_id.h>
11238  #include <asm/mce.h>
11240 @@ -352,7 +353,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
11241         }
11244 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11245 +static void platform_thermal_notify_work(struct swork_event *event)
11247         unsigned long flags;
11248         int cpu = smp_processor_id();
11249 @@ -369,7 +370,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11250                         pkg_work_scheduled[phy_id]) {
11251                 disable_pkg_thres_interrupt();
11252                 spin_unlock_irqrestore(&pkg_work_lock, flags);
11253 -               return -EINVAL;
11254 +               return;
11255         }
11256         pkg_work_scheduled[phy_id] = 1;
11257         spin_unlock_irqrestore(&pkg_work_lock, flags);
11258 @@ -378,9 +379,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11259         schedule_delayed_work_on(cpu,
11260                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
11261                                 msecs_to_jiffies(notify_delay_ms));
11264 +#ifdef CONFIG_PREEMPT_RT_FULL
11265 +static struct swork_event notify_work;
11267 +static int thermal_notify_work_init(void)
11269 +       int err;
11271 +       err = swork_get();
11272 +       if (err)
11273 +               return err;
11275 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
11276         return 0;
11279 +static void thermal_notify_work_cleanup(void)
11281 +       swork_put();
11284 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11286 +       swork_queue(&notify_work);
11287 +       return 0;
11290 +#else  /* !CONFIG_PREEMPT_RT_FULL */
11292 +static int thermal_notify_work_init(void) { return 0; }
11294 +static void thermal_notify_work_cleanup(void) {  }
11296 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11298 +       platform_thermal_notify_work(NULL);
11300 +       return 0;
11302 +#endif /* CONFIG_PREEMPT_RT_FULL */
11304  static int find_siblings_cpu(int cpu)
11306         int i;
11307 @@ -584,6 +624,9 @@ static int __init pkg_temp_thermal_init(void)
11308         if (!x86_match_cpu(pkg_temp_thermal_ids))
11309                 return -ENODEV;
11311 +       if (thermal_notify_work_init())
11312 +               return -ENODEV;
11314         spin_lock_init(&pkg_work_lock);
11315         platform_thermal_package_notify =
11316                         pkg_temp_thermal_platform_thermal_notify;
11317 @@ -608,7 +651,7 @@ static int __init pkg_temp_thermal_init(void)
11318         kfree(pkg_work_scheduled);
11319         platform_thermal_package_notify = NULL;
11320         platform_thermal_package_rate_control = NULL;
11322 +       thermal_notify_work_cleanup();
11323         return -ENODEV;
11326 @@ -633,6 +676,7 @@ static void __exit pkg_temp_thermal_exit(void)
11327         mutex_unlock(&phy_dev_list_mutex);
11328         platform_thermal_package_notify = NULL;
11329         platform_thermal_package_rate_control = NULL;
11330 +       thermal_notify_work_cleanup();
11331         for_each_online_cpu(i)
11332                 cancel_delayed_work_sync(
11333                         &per_cpu(pkg_temp_thermal_threshold_work, i));
11334 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
11335 index 39126460c1f5..af7701ca4d48 100644
11336 --- a/drivers/tty/serial/8250/8250_core.c
11337 +++ b/drivers/tty/serial/8250/8250_core.c
11338 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
11340  static unsigned int skip_txen_test; /* force skip of txen test at init time */
11342 -#define PASS_LIMIT     512
11344 + * On -rt we can have more delays, and legitimately
11345 + * so - so don't drop work spuriously and spam the
11346 + * syslog:
11347 + */
11348 +#ifdef CONFIG_PREEMPT_RT_FULL
11349 +# define PASS_LIMIT    1000000
11350 +#else
11351 +# define PASS_LIMIT    512
11352 +#endif
11354  #include <asm/serial.h>
11355  /*
11356 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
11357 index d42d66b72d5a..c08e0724a487 100644
11358 --- a/drivers/tty/serial/8250/8250_port.c
11359 +++ b/drivers/tty/serial/8250/8250_port.c
11360 @@ -35,6 +35,7 @@
11361  #include <linux/nmi.h>
11362  #include <linux/mutex.h>
11363  #include <linux/slab.h>
11364 +#include <linux/kdb.h>
11365  #include <linux/uaccess.h>
11366  #include <linux/pm_runtime.h>
11368 @@ -2846,9 +2847,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
11370         serial8250_rpm_get(up);
11372 -       if (port->sysrq)
11373 +       if (port->sysrq || oops_in_progress)
11374                 locked = 0;
11375 -       else if (oops_in_progress)
11376 +       else if (in_kdb_printk())
11377                 locked = spin_trylock_irqsave(&port->lock, flags);
11378         else
11379                 spin_lock_irqsave(&port->lock, flags);
11380 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
11381 index 899a77187bde..3ff6363b3751 100644
11382 --- a/drivers/tty/serial/amba-pl011.c
11383 +++ b/drivers/tty/serial/amba-pl011.c
11384 @@ -2067,13 +2067,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11386         clk_enable(uap->clk);
11388 -       local_irq_save(flags);
11389 +       /*
11390 +        * local_irq_save(flags);
11391 +        *
11392 +        * This local_irq_save() is nonsense. If we come in via sysrq
11393 +        * handling then interrupts are already disabled. Aside from
11394 +        * that, the port.sysrq check is racy on SMP regardless.
11395 +       */
11396         if (uap->port.sysrq)
11397                 locked = 0;
11398         else if (oops_in_progress)
11399 -               locked = spin_trylock(&uap->port.lock);
11400 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
11401         else
11402 -               spin_lock(&uap->port.lock);
11403 +               spin_lock_irqsave(&uap->port.lock, flags);
11405         /*
11406          *      First save the CR then disable the interrupts
11407 @@ -2098,8 +2104,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11408                 writew(old_cr, uap->port.membase + UART011_CR);
11410         if (locked)
11411 -               spin_unlock(&uap->port.lock);
11412 -       local_irq_restore(flags);
11413 +               spin_unlock_irqrestore(&uap->port.lock, flags);
11415         clk_disable(uap->clk);
11417 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
11418 index 21fc9b3a27cf..2e32729e6a83 100644
11419 --- a/drivers/tty/serial/omap-serial.c
11420 +++ b/drivers/tty/serial/omap-serial.c
11421 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
11423         pm_runtime_get_sync(up->dev);
11425 -       local_irq_save(flags);
11426 -       if (up->port.sysrq)
11427 -               locked = 0;
11428 -       else if (oops_in_progress)
11429 -               locked = spin_trylock(&up->port.lock);
11430 +       if (up->port.sysrq || oops_in_progress)
11431 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
11432         else
11433 -               spin_lock(&up->port.lock);
11434 +               spin_lock_irqsave(&up->port.lock, flags);
11436         /*
11437          * First save the IER then disable the interrupts
11438 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
11439         pm_runtime_mark_last_busy(up->dev);
11440         pm_runtime_put_autosuspend(up->dev);
11441         if (locked)
11442 -               spin_unlock(&up->port.lock);
11443 -       local_irq_restore(flags);
11444 +               spin_unlock_irqrestore(&up->port.lock, flags);
11447  static int __init
11448 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
11449 index 87a83d925eea..995a37bde992 100644
11450 --- a/drivers/usb/core/hcd.c
11451 +++ b/drivers/usb/core/hcd.c
11452 @@ -1738,9 +1738,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
11453          * and no one may trigger the above deadlock situation when
11454          * running complete() in tasklet.
11455          */
11456 -       local_irq_save(flags);
11457 +       local_irq_save_nort(flags);
11458         urb->complete(urb);
11459 -       local_irq_restore(flags);
11460 +       local_irq_restore_nort(flags);
11462         usb_anchor_resume_wakeups(anchor);
11463         atomic_dec(&urb->use_count);
11464 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
11465 index 39bb65265bff..49c167d150ce 100644
11466 --- a/drivers/usb/gadget/function/f_fs.c
11467 +++ b/drivers/usb/gadget/function/f_fs.c
11468 @@ -1404,7 +1404,7 @@ static void ffs_data_put(struct ffs_data *ffs)
11469                 pr_info("%s(): freeing\n", __func__);
11470                 ffs_data_clear(ffs);
11471                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
11472 -                      waitqueue_active(&ffs->ep0req_completion.wait));
11473 +                      swait_active(&ffs->ep0req_completion.wait));
11474                 kfree(ffs->dev_name);
11475                 kfree(ffs);
11476         }
11477 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
11478 index 81f3c9cb333c..c996bd43741a 100644
11479 --- a/drivers/usb/gadget/legacy/inode.c
11480 +++ b/drivers/usb/gadget/legacy/inode.c
11481 @@ -346,7 +346,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11482         spin_unlock_irq (&epdata->dev->lock);
11484         if (likely (value == 0)) {
11485 -               value = wait_event_interruptible (done.wait, done.done);
11486 +               value = swait_event_interruptible (done.wait, done.done);
11487                 if (value != 0) {
11488                         spin_lock_irq (&epdata->dev->lock);
11489                         if (likely (epdata->ep != NULL)) {
11490 @@ -355,7 +355,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11491                                 usb_ep_dequeue (epdata->ep, epdata->req);
11492                                 spin_unlock_irq (&epdata->dev->lock);
11494 -                               wait_event (done.wait, done.done);
11495 +                               swait_event (done.wait, done.done);
11496                                 if (epdata->status == -ECONNRESET)
11497                                         epdata->status = -EINTR;
11498                         } else {
11499 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c
11500 index 585cb8734f50..1e469be2c216 100644
11501 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c
11502 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c
11503 @@ -17,7 +17,9 @@
11504  #include <linux/device.h>
11505  #include <linux/dma-mapping.h>
11506  #include <linux/list.h>
11507 +#include <linux/mfd/syscon.h>
11508  #include <linux/platform_device.h>
11509 +#include <linux/regmap.h>
11510  #include <linux/usb/ch9.h>
11511  #include <linux/usb/gadget.h>
11512  #include <linux/usb/atmel_usba_udc.h>
11513 @@ -1890,20 +1892,15 @@ static int atmel_usba_stop(struct usb_gadget *gadget)
11514  #ifdef CONFIG_OF
11515  static void at91sam9rl_toggle_bias(struct usba_udc *udc, int is_on)
11517 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11519 -       if (is_on)
11520 -               at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11521 -       else
11522 -               at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11523 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11524 +                          is_on ? AT91_PMC_BIASEN : 0);
11527  static void at91sam9g45_pulse_bias(struct usba_udc *udc)
11529 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11531 -       at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11532 -       at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11533 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN, 0);
11534 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11535 +                          AT91_PMC_BIASEN);
11538  static const struct usba_udc_errata at91sam9rl_errata = {
11539 @@ -1940,6 +1937,9 @@ static struct usba_ep * atmel_udc_of_init(struct platform_device *pdev,
11540                 return ERR_PTR(-EINVAL);
11542         udc->errata = match->data;
11543 +       udc->pmc = syscon_regmap_lookup_by_compatible("atmel,at91sam9g45-pmc");
11544 +       if (udc->errata && IS_ERR(udc->pmc))
11545 +               return ERR_CAST(udc->pmc);
11547         udc->num_ep = 0;
11549 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.h b/drivers/usb/gadget/udc/atmel_usba_udc.h
11550 index ea448a344767..3e1c9d589dfa 100644
11551 --- a/drivers/usb/gadget/udc/atmel_usba_udc.h
11552 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.h
11553 @@ -354,6 +354,8 @@ struct usba_udc {
11554         struct dentry *debugfs_root;
11555         struct dentry *debugfs_regs;
11556  #endif
11558 +       struct regmap *pmc;
11559  };
11561  static inline struct usba_ep *to_usba_ep(struct usb_ep *ep)
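
For reference, the regmap-based PMC access used above reduces each bias toggle to one read-modify-write call. A minimal sketch, assuming the syscon lookup from the probe hunk has already been done; struct my_udc and my_udc_set_bias() are placeholders, the regmap call and the AT91 register/bit names are the ones used in the diff:

        #include <linux/clk/at91_pmc.h>
        #include <linux/regmap.h>

        struct my_udc {
                struct regmap *pmc;     /* from syscon_regmap_lookup_by_compatible() */
        };

        static int my_udc_set_bias(struct my_udc *udc, bool on)
        {
                /* set or clear AT91_PMC_BIASEN in AT91_CKGR_UCKR, serialized
                 * against other regmap users of the PMC */
                return regmap_update_bits(udc->pmc, AT91_CKGR_UCKR,
                                          AT91_PMC_BIASEN,
                                          on ? AT91_PMC_BIASEN : 0);
        }
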
11562 diff --git a/fs/aio.c b/fs/aio.c
11563 index fe4f49212b99..c3194afdc3df 100644
11564 --- a/fs/aio.c
11565 +++ b/fs/aio.c
11566 @@ -40,6 +40,7 @@
11567  #include <linux/ramfs.h>
11568  #include <linux/percpu-refcount.h>
11569  #include <linux/mount.h>
11570 +#include <linux/swork.h>
11572  #include <asm/kmap_types.h>
11573  #include <asm/uaccess.h>
11574 @@ -115,7 +116,7 @@ struct kioctx {
11575         struct page             **ring_pages;
11576         long                    nr_pages;
11578 -       struct work_struct      free_work;
11579 +       struct swork_event      free_work;
11581         /*
11582          * signals when all in-flight requests are done
11583 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
11584                 .mount          = aio_mount,
11585                 .kill_sb        = kill_anon_super,
11586         };
11587 +       BUG_ON(swork_get());
11588         aio_mnt = kern_mount(&aio_fs);
11589         if (IS_ERR(aio_mnt))
11590                 panic("Failed to create aio fs mount.");
11591 @@ -573,9 +575,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
11592         return cancel(&kiocb->common);
11595 -static void free_ioctx(struct work_struct *work)
11596 +static void free_ioctx(struct swork_event *sev)
11598 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
11599 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11601         pr_debug("freeing %p\n", ctx);
11603 @@ -594,8 +596,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11604         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
11605                 complete(&ctx->rq_wait->comp);
11607 -       INIT_WORK(&ctx->free_work, free_ioctx);
11608 -       schedule_work(&ctx->free_work);
11609 +       INIT_SWORK(&ctx->free_work, free_ioctx);
11610 +       swork_queue(&ctx->free_work);
11613  /*
11614 @@ -603,9 +605,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11615   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
11616   * now it's safe to cancel any that need to be.
11617   */
11618 -static void free_ioctx_users(struct percpu_ref *ref)
11619 +static void free_ioctx_users_work(struct swork_event *sev)
11621 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11622 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11623         struct aio_kiocb *req;
11625         spin_lock_irq(&ctx->ctx_lock);
11626 @@ -624,6 +626,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
11627         percpu_ref_put(&ctx->reqs);
11630 +static void free_ioctx_users(struct percpu_ref *ref)
11632 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11634 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
11635 +       swork_queue(&ctx->free_work);
11638  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
11640         unsigned i, new_nr;
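
The swork API used here (struct swork_event, INIT_SWORK(), swork_queue(), swork_get()) is the simple-work infrastructure added elsewhere in this patch; it defers work to a preemptible kernel thread so the deferral can be issued from contexts that must not take sleeping locks on RT. A hedged sketch of the pattern, with a made-up object type:

        #include <linux/swork.h>

        struct my_obj {
                struct swork_event free_work;
                /* ... */
        };

        static void my_obj_free(struct swork_event *sev)
        {
                struct my_obj *obj = container_of(sev, struct my_obj, free_work);

                kfree(obj);             /* runs in the swork thread */
        }

        /* callable from atomic context, e.g. a percpu_ref release callback */
        static void my_obj_release(struct my_obj *obj)
        {
                INIT_SWORK(&obj->free_work, my_obj_free);
                swork_queue(&obj->free_work);
        }

As in aio_setup() above, a subsystem using swork calls swork_get() once during initialization so the worker thread exists before the first swork_queue().
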
11641 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
11642 index 502d3892d8a4..05af8d3e6e88 100644
11643 --- a/fs/autofs4/autofs_i.h
11644 +++ b/fs/autofs4/autofs_i.h
11645 @@ -34,6 +34,7 @@
11646  #include <linux/sched.h>
11647  #include <linux/mount.h>
11648  #include <linux/namei.h>
11649 +#include <linux/delay.h>
11650  #include <asm/current.h>
11651  #include <asm/uaccess.h>
11653 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
11654 index 7a5a598a2d94..d08bcdc30566 100644
11655 --- a/fs/autofs4/expire.c
11656 +++ b/fs/autofs4/expire.c
11657 @@ -150,7 +150,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
11658                         parent = p->d_parent;
11659                         if (!spin_trylock(&parent->d_lock)) {
11660                                 spin_unlock(&p->d_lock);
11661 -                               cpu_relax();
11662 +                               cpu_chill();
11663                                 goto relock;
11664                         }
11665                         spin_unlock(&p->d_lock);
11666 diff --git a/fs/buffer.c b/fs/buffer.c
11667 index 6f7d519a093b..96b943e924cc 100644
11668 --- a/fs/buffer.c
11669 +++ b/fs/buffer.c
11670 @@ -305,8 +305,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11671          * decide that the page is now completely done.
11672          */
11673         first = page_buffers(page);
11674 -       local_irq_save(flags);
11675 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11676 +       flags = bh_uptodate_lock_irqsave(first);
11677         clear_buffer_async_read(bh);
11678         unlock_buffer(bh);
11679         tmp = bh;
11680 @@ -319,8 +318,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11681                 }
11682                 tmp = tmp->b_this_page;
11683         } while (tmp != bh);
11684 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11685 -       local_irq_restore(flags);
11686 +       bh_uptodate_unlock_irqrestore(first, flags);
11688         /*
11689          * If none of the buffers had errors and they are all
11690 @@ -332,9 +330,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11691         return;
11693  still_busy:
11694 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11695 -       local_irq_restore(flags);
11696 -       return;
11697 +       bh_uptodate_unlock_irqrestore(first, flags);
11700  /*
11701 @@ -362,8 +358,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11702         }
11704         first = page_buffers(page);
11705 -       local_irq_save(flags);
11706 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11707 +       flags = bh_uptodate_lock_irqsave(first);
11709         clear_buffer_async_write(bh);
11710         unlock_buffer(bh);
11711 @@ -375,15 +370,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11712                 }
11713                 tmp = tmp->b_this_page;
11714         }
11715 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11716 -       local_irq_restore(flags);
11717 +       bh_uptodate_unlock_irqrestore(first, flags);
11718         end_page_writeback(page);
11719         return;
11721  still_busy:
11722 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11723 -       local_irq_restore(flags);
11724 -       return;
11725 +       bh_uptodate_unlock_irqrestore(first, flags);
11727  EXPORT_SYMBOL(end_buffer_async_write);
11729 @@ -3325,6 +3317,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
11730         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
11731         if (ret) {
11732                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
11733 +               buffer_head_init_locks(ret);
11734                 preempt_disable();
11735                 __this_cpu_inc(bh_accounting.nr);
11736                 recalc_bh_state();
11737 diff --git a/fs/dcache.c b/fs/dcache.c
11738 index 3ed642e0a0c2..1437dd042e13 100644
11739 --- a/fs/dcache.c
11740 +++ b/fs/dcache.c
11741 @@ -19,6 +19,7 @@
11742  #include <linux/mm.h>
11743  #include <linux/fs.h>
11744  #include <linux/fsnotify.h>
11745 +#include <linux/delay.h>
11746  #include <linux/slab.h>
11747  #include <linux/init.h>
11748  #include <linux/hash.h>
11749 @@ -774,6 +775,8 @@ static inline bool fast_dput(struct dentry *dentry)
11750   */
11751  void dput(struct dentry *dentry)
11753 +       struct dentry *parent;
11755         if (unlikely(!dentry))
11756                 return;
11758 @@ -810,9 +813,18 @@ void dput(struct dentry *dentry)
11759         return;
11761  kill_it:
11762 -       dentry = dentry_kill(dentry);
11763 -       if (dentry) {
11764 -               cond_resched();
11765 +       parent = dentry_kill(dentry);
11766 +       if (parent) {
11767 +               int r;
11769 +               if (parent == dentry) {
11770 +                       /* the task with the highest priority won't schedule */
11771 +                       r = cond_resched();
11772 +                       if (!r)
11773 +                               cpu_chill();
11774 +               } else {
11775 +                       dentry = parent;
11776 +               }
11777                 goto repeat;
11778         }
11780 @@ -2425,7 +2437,7 @@ void d_delete(struct dentry * dentry)
11781         if (dentry->d_lockref.count == 1) {
11782                 if (!spin_trylock(&inode->i_lock)) {
11783                         spin_unlock(&dentry->d_lock);
11784 -                       cpu_relax();
11785 +                       cpu_chill();
11786                         goto again;
11787                 }
11788                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
11789 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
11790 index 1b08556776ce..06435352045d 100644
11791 --- a/fs/eventpoll.c
11792 +++ b/fs/eventpoll.c
11793 @@ -505,12 +505,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
11794   */
11795  static void ep_poll_safewake(wait_queue_head_t *wq)
11797 -       int this_cpu = get_cpu();
11798 +       int this_cpu = get_cpu_light();
11800         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
11801                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
11803 -       put_cpu();
11804 +       put_cpu_light();
11807  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
11808 diff --git a/fs/exec.c b/fs/exec.c
11809 index 9c5ee2a880aa..c12e576bb209 100644
11810 --- a/fs/exec.c
11811 +++ b/fs/exec.c
11812 @@ -887,12 +887,14 @@ static int exec_mmap(struct mm_struct *mm)
11813                 }
11814         }
11815         task_lock(tsk);
11816 +       preempt_disable_rt();
11817         active_mm = tsk->active_mm;
11818         tsk->mm = mm;
11819         tsk->active_mm = mm;
11820         activate_mm(active_mm, mm);
11821         tsk->mm->vmacache_seqnum = 0;
11822         vmacache_flush(tsk);
11823 +       preempt_enable_rt();
11824         task_unlock(tsk);
11825         if (old_mm) {
11826                 up_read(&old_mm->mmap_sem);
11827 diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
11828 index 6ca56f5f72b5..9e145fe7cae0 100644
11829 --- a/fs/ext4/page-io.c
11830 +++ b/fs/ext4/page-io.c
11831 @@ -97,8 +97,7 @@ static void ext4_finish_bio(struct bio *bio)
11832                  * We check all buffers in the page under BH_Uptodate_Lock
11833                  * to avoid races with other end io clearing async_write flags
11834                  */
11835 -               local_irq_save(flags);
11836 -               bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
11837 +               flags = bh_uptodate_lock_irqsave(head);
11838                 do {
11839                         if (bh_offset(bh) < bio_start ||
11840                             bh_offset(bh) + bh->b_size > bio_end) {
11841 @@ -110,8 +109,7 @@ static void ext4_finish_bio(struct bio *bio)
11842                         if (bio->bi_error)
11843                                 buffer_io_error(bh);
11844                 } while ((bh = bh->b_this_page) != head);
11845 -               bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
11846 -               local_irq_restore(flags);
11847 +               bh_uptodate_unlock_irqrestore(head, flags);
11848                 if (!under_io) {
11849  #ifdef CONFIG_EXT4_FS_ENCRYPTION
11850                         if (ctx)
11851 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
11852 index 2871576fbca4..d1137790ea58 100644
11853 --- a/fs/f2fs/f2fs.h
11854 +++ b/fs/f2fs/f2fs.h
11855 @@ -24,7 +24,6 @@
11857  #ifdef CONFIG_F2FS_CHECK_FS
11858  #define f2fs_bug_on(sbi, condition)    BUG_ON(condition)
11859 -#define f2fs_down_write(x, y)  down_write_nest_lock(x, y)
11860  #else
11861  #define f2fs_bug_on(sbi, condition)                                    \
11862         do {                                                            \
11863 @@ -33,7 +32,6 @@
11864                         set_sbi_flag(sbi, SBI_NEED_FSCK);               \
11865                 }                                                       \
11866         } while (0)
11867 -#define f2fs_down_write(x, y)  down_write(x)
11868  #endif
11870  /*
11871 @@ -959,7 +957,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
11873  static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
11875 -       f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
11876 +       down_write(&sbi->cp_rwsem);
11879  static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
11880 diff --git a/fs/namespace.c b/fs/namespace.c
11881 index ec4078d16eb7..1780601dcbdd 100644
11882 --- a/fs/namespace.c
11883 +++ b/fs/namespace.c
11884 @@ -14,6 +14,7 @@
11885  #include <linux/mnt_namespace.h>
11886  #include <linux/user_namespace.h>
11887  #include <linux/namei.h>
11888 +#include <linux/delay.h>
11889  #include <linux/security.h>
11890  #include <linux/idr.h>
11891  #include <linux/init.h>                /* init_rootfs */
11892 @@ -357,8 +358,11 @@ int __mnt_want_write(struct vfsmount *m)
11893          * incremented count after it has set MNT_WRITE_HOLD.
11894          */
11895         smp_mb();
11896 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
11897 -               cpu_relax();
11898 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
11899 +               preempt_enable();
11900 +               cpu_chill();
11901 +               preempt_disable();
11902 +       }
11903         /*
11904          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
11905          * be set to match its requirements. So we must not load that until
11906 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
11907 index 7521e11db728..f0de4b6b8bf3 100644
11908 --- a/fs/ntfs/aops.c
11909 +++ b/fs/ntfs/aops.c
11910 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11911                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
11912         }
11913         first = page_buffers(page);
11914 -       local_irq_save(flags);
11915 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11916 +       flags = bh_uptodate_lock_irqsave(first);
11917         clear_buffer_async_read(bh);
11918         unlock_buffer(bh);
11919         tmp = bh;
11920 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11921                 }
11922                 tmp = tmp->b_this_page;
11923         } while (tmp != bh);
11924 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11925 -       local_irq_restore(flags);
11926 +       bh_uptodate_unlock_irqrestore(first, flags);
11927         /*
11928          * If none of the buffers had errors then we can set the page uptodate,
11929          * but we first have to perform the post read mst fixups, if the
11930 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11931                 recs = PAGE_CACHE_SIZE / rec_size;
11932                 /* Should have been verified before we got here... */
11933                 BUG_ON(!recs);
11934 -               local_irq_save(flags);
11935 +               local_irq_save_nort(flags);
11936                 kaddr = kmap_atomic(page);
11937                 for (i = 0; i < recs; i++)
11938                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11939                                         i * rec_size), rec_size);
11940                 kunmap_atomic(kaddr);
11941 -               local_irq_restore(flags);
11942 +               local_irq_restore_nort(flags);
11943                 flush_dcache_page(page);
11944                 if (likely(page_uptodate && !PageError(page)))
11945                         SetPageUptodate(page);
11946 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11947         unlock_page(page);
11948         return;
11949  still_busy:
11950 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11951 -       local_irq_restore(flags);
11952 -       return;
11953 +       bh_uptodate_unlock_irqrestore(first, flags);
11956  /**
11957 diff --git a/fs/timerfd.c b/fs/timerfd.c
11958 index 1327a02ec778..4260febcb029 100644
11959 --- a/fs/timerfd.c
11960 +++ b/fs/timerfd.c
11961 @@ -461,7 +461,10 @@ static int do_timerfd_settime(int ufd, int flags,
11962                                 break;
11963                 }
11964                 spin_unlock_irq(&ctx->wqh.lock);
11965 -               cpu_relax();
11966 +               if (isalarm(ctx))
11967 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11968 +               else
11969 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
11970         }
11972         /*
11973 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
11974 index 323e5daece54..cc5fbd534fd4 100644
11975 --- a/include/acpi/platform/aclinux.h
11976 +++ b/include/acpi/platform/aclinux.h
11977 @@ -127,6 +127,7 @@
11979  #define acpi_cache_t                        struct kmem_cache
11980  #define acpi_spinlock                       spinlock_t *
11981 +#define acpi_raw_spinlock              raw_spinlock_t *
11982  #define acpi_cpu_flags                      unsigned long
11984  /* Use native linux version of acpi_os_allocate_zeroed */
11985 @@ -145,6 +146,20 @@
11986  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11987  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11989 +#define acpi_os_create_raw_lock(__handle)                      \
11990 +({                                                             \
11991 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
11992 +                                                               \
11993 +        if (lock) {                                            \
11994 +               *(__handle) = lock;                             \
11995 +               raw_spin_lock_init(*(__handle));                \
11996 +        }                                                      \
11997 +        lock ? AE_OK : AE_NO_MEMORY;                           \
11998 + })
12000 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
12003  /*
12004   * OSL interfaces used by debugger/disassembler
12005   */
12006 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
12007 index 630dd2372238..850e4d993a88 100644
12008 --- a/include/asm-generic/bug.h
12009 +++ b/include/asm-generic/bug.h
12010 @@ -206,6 +206,20 @@ extern void warn_slowpath_null(const char *file, const int line);
12011  # define WARN_ON_SMP(x)                        ({0;})
12012  #endif
12014 +#ifdef CONFIG_PREEMPT_RT_BASE
12015 +# define BUG_ON_RT(c)                  BUG_ON(c)
12016 +# define BUG_ON_NONRT(c)               do { } while (0)
12017 +# define WARN_ON_RT(condition)         WARN_ON(condition)
12018 +# define WARN_ON_NONRT(condition)      do { } while (0)
12019 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
12020 +#else
12021 +# define BUG_ON_RT(c)                  do { } while (0)
12022 +# define BUG_ON_NONRT(c)               BUG_ON(c)
12023 +# define WARN_ON_RT(condition)         do { } while (0)
12024 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
12025 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
12026 +#endif
12028  #endif /* __ASSEMBLY__ */
12030  #endif
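
These variants let a check compile away on the configuration where it does not apply. As a purely illustrative caller: an assertion that only holds when softirqs run in their usual context can be kept for !RT builds without firing on PREEMPT_RT_FULL:

        static void handle_deferred_event(void)
        {
                /* expands to WARN_ON_ONCE() on !RT and to a no-op on RT */
                WARN_ON_ONCE_NONRT(!in_serving_softirq());

                /* ... */
        }
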
12031 diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
12032 index 5d8ffa3e6f8c..c1cde3577551 100644
12033 --- a/include/asm-generic/preempt.h
12034 +++ b/include/asm-generic/preempt.h
12035 @@ -7,10 +7,10 @@
12037  static __always_inline int preempt_count(void)
12039 -       return current_thread_info()->preempt_count;
12040 +       return READ_ONCE(current_thread_info()->preempt_count);
12043 -static __always_inline int *preempt_count_ptr(void)
12044 +static __always_inline volatile int *preempt_count_ptr(void)
12046         return &current_thread_info()->preempt_count;
12048 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
12049 index daf17d70aeca..463df8954255 100644
12050 --- a/include/linux/blk-mq.h
12051 +++ b/include/linux/blk-mq.h
12052 @@ -212,6 +212,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
12054  struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
12055  struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
12056 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
12058  int blk_mq_request_started(struct request *rq);
12059  void blk_mq_start_request(struct request *rq);
12060 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
12061 index fe14382f9664..a82143ad6702 100644
12062 --- a/include/linux/blkdev.h
12063 +++ b/include/linux/blkdev.h
12064 @@ -89,6 +89,7 @@ struct request {
12065         struct list_head queuelist;
12066         union {
12067                 struct call_single_data csd;
12068 +               struct work_struct work;
12069                 unsigned long fifo_time;
12070         };
12072 @@ -455,7 +456,7 @@ struct request_queue {
12073         struct throtl_data *td;
12074  #endif
12075         struct rcu_head         rcu_head;
12076 -       wait_queue_head_t       mq_freeze_wq;
12077 +       struct swait_queue_head mq_freeze_wq;
12078         struct percpu_ref       q_usage_counter;
12079         struct list_head        all_q_node;
12081 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
12082 index 8fdcb783197d..d07dbeec7bc1 100644
12083 --- a/include/linux/bottom_half.h
12084 +++ b/include/linux/bottom_half.h
12085 @@ -3,6 +3,39 @@
12087  #include <linux/preempt.h>
12089 +#ifdef CONFIG_PREEMPT_RT_FULL
12091 +extern void __local_bh_disable(void);
12092 +extern void _local_bh_enable(void);
12093 +extern void __local_bh_enable(void);
12095 +static inline void local_bh_disable(void)
12097 +       __local_bh_disable();
12100 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
12102 +       __local_bh_disable();
12105 +static inline void local_bh_enable(void)
12107 +       __local_bh_enable();
12110 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
12112 +       __local_bh_enable();
12115 +static inline void local_bh_enable_ip(unsigned long ip)
12117 +       __local_bh_enable();
12120 +#else
12122  #ifdef CONFIG_TRACE_IRQFLAGS
12123  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
12124  #else
12125 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
12127         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
12129 +#endif
12131  #endif /* _LINUX_BH_H */
12132 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
12133 index 6fe974dbe741..7242505af75c 100644
12134 --- a/include/linux/buffer_head.h
12135 +++ b/include/linux/buffer_head.h
12136 @@ -75,8 +75,50 @@ struct buffer_head {
12137         struct address_space *b_assoc_map;      /* mapping this buffer is
12138                                                    associated with */
12139         atomic_t b_count;               /* users using this buffer_head */
12140 +#ifdef CONFIG_PREEMPT_RT_BASE
12141 +       spinlock_t b_uptodate_lock;
12142 +#if IS_ENABLED(CONFIG_JBD2)
12143 +       spinlock_t b_state_lock;
12144 +       spinlock_t b_journal_head_lock;
12145 +#endif
12146 +#endif
12147  };
12149 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
12151 +       unsigned long flags;
12153 +#ifndef CONFIG_PREEMPT_RT_BASE
12154 +       local_irq_save(flags);
12155 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
12156 +#else
12157 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
12158 +#endif
12159 +       return flags;
12162 +static inline void
12163 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
12165 +#ifndef CONFIG_PREEMPT_RT_BASE
12166 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
12167 +       local_irq_restore(flags);
12168 +#else
12169 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
12170 +#endif
12173 +static inline void buffer_head_init_locks(struct buffer_head *bh)
12175 +#ifdef CONFIG_PREEMPT_RT_BASE
12176 +       spin_lock_init(&bh->b_uptodate_lock);
12177 +#if IS_ENABLED(CONFIG_JBD2)
12178 +       spin_lock_init(&bh->b_state_lock);
12179 +       spin_lock_init(&bh->b_journal_head_lock);
12180 +#endif
12181 +#endif
12184  /*
12185   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
12186   * and buffer_foo() functions.
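
The earlier hunks in fs/buffer.c, fs/ext4/page-io.c and fs/ntfs/aops.c all collapse into these two helpers. Shown as a condensed end_io-style sketch; my_end_io() and the flag being cleared are illustrative, the locking calls are the ones defined above:

        static void my_end_io(struct buffer_head *bh)
        {
                struct buffer_head *first = bh;   /* head of the page's bh ring */
                unsigned long flags;

                /* !RT: local_irq_save() + bit_spin_lock(BH_Uptodate_Lock, ...)
                 * RT:  a per-buffer_head spinlock_t, which is a sleeping lock */
                flags = bh_uptodate_lock_irqsave(first);
                clear_buffer_async_read(bh);
                bh_uptodate_unlock_irqrestore(first, flags);
        }
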
12187 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
12188 index 8da263299754..0cc474291e08 100644
12189 --- a/include/linux/cgroup-defs.h
12190 +++ b/include/linux/cgroup-defs.h
12191 @@ -16,6 +16,7 @@
12192  #include <linux/percpu-refcount.h>
12193  #include <linux/percpu-rwsem.h>
12194  #include <linux/workqueue.h>
12195 +#include <linux/swork.h>
12197  #ifdef CONFIG_CGROUPS
12199 @@ -142,6 +143,7 @@ struct cgroup_subsys_state {
12200         /* percpu_ref killing and RCU release */
12201         struct rcu_head rcu_head;
12202         struct work_struct destroy_work;
12203 +       struct swork_event destroy_swork;
12204  };
12206  /*
12207 diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h
12208 index 1e6932222e11..17f413bbbedf 100644
12209 --- a/include/linux/clk/at91_pmc.h
12210 +++ b/include/linux/clk/at91_pmc.h
12211 @@ -16,18 +16,6 @@
12212  #ifndef AT91_PMC_H
12213  #define AT91_PMC_H
12215 -#ifndef __ASSEMBLY__
12216 -extern void __iomem *at91_pmc_base;
12218 -#define at91_pmc_read(field) \
12219 -       readl_relaxed(at91_pmc_base + field)
12221 -#define at91_pmc_write(field, value) \
12222 -       writel_relaxed(value, at91_pmc_base + field)
12223 -#else
12224 -.extern at91_pmc_base
12225 -#endif
12227  #define        AT91_PMC_SCER           0x00                    /* System Clock Enable Register */
12228  #define        AT91_PMC_SCDR           0x04                    /* System Clock Disable Register */
12230 diff --git a/include/linux/completion.h b/include/linux/completion.h
12231 index 5d5aaae3af43..3bca1590e29f 100644
12232 --- a/include/linux/completion.h
12233 +++ b/include/linux/completion.h
12234 @@ -7,8 +7,7 @@
12235   * Atomic wait-for-completion handler data structures.
12236   * See kernel/sched/completion.c for details.
12237   */
12239 -#include <linux/wait.h>
12240 +#include <linux/swait.h>
12242  /*
12243   * struct completion - structure used to maintain state for a "completion"
12244 @@ -24,11 +23,11 @@
12245   */
12246  struct completion {
12247         unsigned int done;
12248 -       wait_queue_head_t wait;
12249 +       struct swait_queue_head wait;
12250  };
12252  #define COMPLETION_INITIALIZER(work) \
12253 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
12254 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
12256  #define COMPLETION_INITIALIZER_ONSTACK(work) \
12257         ({ init_completion(&work); work; })
12258 @@ -73,7 +72,7 @@ struct completion {
12259  static inline void init_completion(struct completion *x)
12261         x->done = 0;
12262 -       init_waitqueue_head(&x->wait);
12263 +       init_swait_queue_head(&x->wait);
12266  /**
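
Callers of the completion API are unaffected by this conversion; only the internal waitqueue type changes. A minimal usage sketch, with setup_done and both functions purely illustrative:

        static DECLARE_COMPLETION(setup_done);

        static int waiter_thread(void *unused)
        {
                wait_for_completion(&setup_done);   /* sleeps on the swait queue */
                return 0;
        }

        static void setup_finished(void)
        {
                complete(&setup_done);              /* wakes one waiter */
        }
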
12267 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
12268 index 7e04bcd9af8e..e7fa0f6935e6 100644
12269 --- a/include/linux/cpu.h
12270 +++ b/include/linux/cpu.h
12271 @@ -231,6 +231,8 @@ extern void get_online_cpus(void);
12272  extern void put_online_cpus(void);
12273  extern void cpu_hotplug_disable(void);
12274  extern void cpu_hotplug_enable(void);
12275 +extern void pin_current_cpu(void);
12276 +extern void unpin_current_cpu(void);
12277  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
12278  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
12279  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
12280 @@ -248,6 +250,8 @@ static inline void cpu_hotplug_done(void) {}
12281  #define put_online_cpus()      do { } while (0)
12282  #define cpu_hotplug_disable()  do { } while (0)
12283  #define cpu_hotplug_enable()   do { } while (0)
12284 +static inline void pin_current_cpu(void) { }
12285 +static inline void unpin_current_cpu(void) { }
12286  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
12287  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
12288  /* These aren't inline functions due to a GCC bug. */
12289 diff --git a/include/linux/delay.h b/include/linux/delay.h
12290 index a6ecb34cf547..37caab306336 100644
12291 --- a/include/linux/delay.h
12292 +++ b/include/linux/delay.h
12293 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
12294         msleep(seconds * 1000);
12297 +#ifdef CONFIG_PREEMPT_RT_FULL
12298 +extern void cpu_chill(void);
12299 +#else
12300 +# define cpu_chill()   cpu_relax()
12301 +#endif
12303  #endif /* defined(_LINUX_DELAY_H) */
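
cpu_chill() is the RT replacement for busy-wait retry loops: where !RT can spin with cpu_relax() until another CPU drops a lock, an RT task spinning at high priority could starve a preempted lock holder forever, so cpu_chill() sleeps briefly instead. A sketch of the lock-both-or-back-off pattern used in the fs/ changes above; struct my_node and lock_pair() are placeholders:

        struct my_node {
                spinlock_t lock;
        };

        static void lock_pair(struct my_node *a, struct my_node *b)
        {
        again:
                spin_lock(&a->lock);
                if (!spin_trylock(&b->lock)) {
                        spin_unlock(&a->lock);
                        cpu_chill();    /* cpu_relax() on !RT, a short sleep on RT */
                        goto again;
                }
                /* ... both locks held ... */
                spin_unlock(&b->lock);
                spin_unlock(&a->lock);
        }
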
12304 diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
12305 index 60048c50404e..f2cd67624f18 100644
12306 --- a/include/linux/ftrace.h
12307 +++ b/include/linux/ftrace.h
12308 @@ -694,6 +694,18 @@ static inline void __ftrace_enabled_restore(int enabled)
12309  #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
12310  #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
12312 +static inline unsigned long get_lock_parent_ip(void)
12314 +       unsigned long addr = CALLER_ADDR0;
12316 +       if (!in_lock_functions(addr))
12317 +               return addr;
12318 +       addr = CALLER_ADDR1;
12319 +       if (!in_lock_functions(addr))
12320 +               return addr;
12321 +       return CALLER_ADDR2;
12324  #ifdef CONFIG_IRQSOFF_TRACER
12325    extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
12326    extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
12327 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
12328 index bb3f3297062a..a117a33ef72c 100644
12329 --- a/include/linux/highmem.h
12330 +++ b/include/linux/highmem.h
12331 @@ -7,6 +7,7 @@
12332  #include <linux/mm.h>
12333  #include <linux/uaccess.h>
12334  #include <linux/hardirq.h>
12335 +#include <linux/sched.h>
12337  #include <asm/cacheflush.h>
12339 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
12341  static inline void *kmap_atomic(struct page *page)
12343 -       preempt_disable();
12344 +       preempt_disable_nort();
12345         pagefault_disable();
12346         return page_address(page);
12348 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
12349  static inline void __kunmap_atomic(void *addr)
12351         pagefault_enable();
12352 -       preempt_enable();
12353 +       preempt_enable_nort();
12356  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
12357 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
12359  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
12361 +#ifndef CONFIG_PREEMPT_RT_FULL
12362  DECLARE_PER_CPU(int, __kmap_atomic_idx);
12363 +#endif
12365  static inline int kmap_atomic_idx_push(void)
12367 +#ifndef CONFIG_PREEMPT_RT_FULL
12368         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
12370 -#ifdef CONFIG_DEBUG_HIGHMEM
12371 +# ifdef CONFIG_DEBUG_HIGHMEM
12372         WARN_ON_ONCE(in_irq() && !irqs_disabled());
12373         BUG_ON(idx >= KM_TYPE_NR);
12374 -#endif
12375 +# endif
12376         return idx;
12377 +#else
12378 +       current->kmap_idx++;
12379 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
12380 +       return current->kmap_idx - 1;
12381 +#endif
12384  static inline int kmap_atomic_idx(void)
12386 +#ifndef CONFIG_PREEMPT_RT_FULL
12387         return __this_cpu_read(__kmap_atomic_idx) - 1;
12388 +#else
12389 +       return current->kmap_idx - 1;
12390 +#endif
12393  static inline void kmap_atomic_idx_pop(void)
12395 -#ifdef CONFIG_DEBUG_HIGHMEM
12396 +#ifndef CONFIG_PREEMPT_RT_FULL
12397 +# ifdef CONFIG_DEBUG_HIGHMEM
12398         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
12400         BUG_ON(idx < 0);
12401 -#else
12402 +# else
12403         __this_cpu_dec(__kmap_atomic_idx);
12404 +# endif
12405 +#else
12406 +       current->kmap_idx--;
12407 +# ifdef CONFIG_DEBUG_HIGHMEM
12408 +       BUG_ON(current->kmap_idx < 0);
12409 +# endif
12410  #endif
12413 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
12414 index 2ead22dd74a0..ff317006d3e8 100644
12415 --- a/include/linux/hrtimer.h
12416 +++ b/include/linux/hrtimer.h
12417 @@ -87,6 +87,9 @@ enum hrtimer_restart {
12418   * @function:  timer expiry callback function
12419   * @base:      pointer to the timer base (per cpu and per clock)
12420   * @state:     state information (See bit values above)
12421 + * @cb_entry:  list entry to defer timers from hardirq context
12422 + * @irqsafe:   timer can run in hardirq context
12423 + * @praecox:   timer expiry time if expired at the time of programming
12424   * @is_rel:    Set if the timer was armed relative
12425   * @start_pid:  timer statistics field to store the pid of the task which
12426   *             started the timer
12427 @@ -103,6 +106,11 @@ struct hrtimer {
12428         enum hrtimer_restart            (*function)(struct hrtimer *);
12429         struct hrtimer_clock_base       *base;
12430         u8                              state;
12431 +       struct list_head                cb_entry;
12432 +       int                             irqsafe;
12433 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
12434 +       ktime_t                         praecox;
12435 +#endif
12436         u8                              is_rel;
12437  #ifdef CONFIG_TIMER_STATS
12438         int                             start_pid;
12439 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
12440         struct task_struct *task;
12441  };
12443 -#ifdef CONFIG_64BIT
12444  # define HRTIMER_CLOCK_BASE_ALIGN      64
12445 -#else
12446 -# define HRTIMER_CLOCK_BASE_ALIGN      32
12447 -#endif
12449  /**
12450   * struct hrtimer_clock_base - the timer base for a specific clock
12451 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
12452   *                     timer to a base on another cpu.
12453   * @clockid:           clock id for per_cpu support
12454   * @active:            red black tree root node for the active timers
12455 + * @expired:           list head for deferred timers.
12456   * @get_time:          function to retrieve the current time of the clock
12457   * @offset:            offset of this clock to the monotonic base
12458   */
12459 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
12460         int                     index;
12461         clockid_t               clockid;
12462         struct timerqueue_head  active;
12463 +       struct list_head        expired;
12464         ktime_t                 (*get_time)(void);
12465         ktime_t                 offset;
12466  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
12467 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
12468         raw_spinlock_t                  lock;
12469         seqcount_t                      seq;
12470         struct hrtimer                  *running;
12471 +       struct hrtimer                  *running_soft;
12472         unsigned int                    cpu;
12473         unsigned int                    active_bases;
12474         unsigned int                    clock_was_set_seq;
12475 @@ -202,6 +209,9 @@ struct hrtimer_cpu_base {
12476         unsigned int                    nr_retries;
12477         unsigned int                    nr_hangs;
12478         unsigned int                    max_hang_time;
12479 +#endif
12480 +#ifdef CONFIG_PREEMPT_RT_BASE
12481 +       wait_queue_head_t               wait;
12482  #endif
12483         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
12484  } ____cacheline_aligned;
12485 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
12486         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
12489 +/* Softirq preemption could deadlock timer removal */
12490 +#ifdef CONFIG_PREEMPT_RT_BASE
12491 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
12492 +#else
12493 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
12494 +#endif
12496  /* Query timers: */
12497  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
12499 @@ -436,9 +453,15 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
12500   * Helper function to check, whether the timer is running the callback
12501   * function
12502   */
12503 -static inline int hrtimer_callback_running(struct hrtimer *timer)
12504 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
12506 -       return timer->base->cpu_base->running == timer;
12507 +       if (timer->base->cpu_base->running == timer)
12508 +               return 1;
12509 +#ifdef CONFIG_PREEMPT_RT_BASE
12510 +       if (timer->base->cpu_base->running_soft == timer)
12511 +               return 1;
12512 +#endif
12513 +       return 0;
12516  /* Forward a hrtimer so it expires after now: */
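
hrtimer_wait_for_timer() pairs with hrtimer_try_to_cancel() the same way the timerfd hunk above uses it: when the callback is currently executing (in softirq context on RT), the canceller must sleep until it finishes rather than spin, because it may be preempting the softirq thread. A hedged sketch; stop_my_hrtimer() is illustrative, both hrtimer calls are real:

        static void stop_my_hrtimer(struct hrtimer *timer)
        {
                for (;;) {
                        /* >= 0: timer was inactive or has been removed */
                        if (hrtimer_try_to_cancel(timer) >= 0)
                                break;
                        /* callback is running; sleep (RT) or cpu_relax() (!RT)
                         * until it is done, then retry the cancel */
                        hrtimer_wait_for_timer(timer);
                }
        }
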
12517 diff --git a/include/linux/idr.h b/include/linux/idr.h
12518 index 013fd9bc4cb6..f62be0aec911 100644
12519 --- a/include/linux/idr.h
12520 +++ b/include/linux/idr.h
12521 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
12522   * Each idr_preload() should be matched with an invocation of this
12523   * function.  See idr_preload() for details.
12524   */
12525 +#ifdef CONFIG_PREEMPT_RT_FULL
12526 +void idr_preload_end(void);
12527 +#else
12528  static inline void idr_preload_end(void)
12530         preempt_enable();
12532 +#endif
12534  /**
12535   * idr_find - return pointer for given id
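
On RT, idr_preload_end() becomes an out-of-line function because ending the preload section can no longer be a bare preempt_enable(); the calling convention stays the same. The usual preload pattern, for reference, with insert_obj(), the idr and the external lock all illustrative:

        static int insert_obj(struct idr *idr, spinlock_t *lock, void *obj)
        {
                int id;

                idr_preload(GFP_KERNEL);        /* may sleep; preallocates nodes */
                spin_lock(lock);
                id = idr_alloc(idr, obj, 0, 0, GFP_NOWAIT);
                spin_unlock(lock);
                idr_preload_end();              /* ends the preload section */

                return id;                      /* new id, or a negative errno */
        }
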
12536 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
12537 index 1c1ff7e4faa4..60fadde71a44 100644
12538 --- a/include/linux/init_task.h
12539 +++ b/include/linux/init_task.h
12540 @@ -148,9 +148,15 @@ extern struct task_group root_task_group;
12541  # define INIT_PERF_EVENTS(tsk)
12542  #endif
12544 +#ifdef CONFIG_PREEMPT_RT_BASE
12545 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
12546 +#else
12547 +# define INIT_TIMER_LIST
12548 +#endif
12550  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12551  # define INIT_VTIME(tsk)                                               \
12552 -       .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \
12553 +       .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
12554         .vtime_snap = 0,                                \
12555         .vtime_snap_whence = VTIME_SYS,
12556  #else
12557 @@ -239,6 +245,7 @@ extern struct task_group root_task_group;
12558         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
12559         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
12560         .timer_slack_ns = 50000, /* 50 usec default slack */            \
12561 +       INIT_TIMER_LIST                                                 \
12562         .pids = {                                                       \
12563                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
12564                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
12565 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
12566 index ad16809c8596..655cee096aed 100644
12567 --- a/include/linux/interrupt.h
12568 +++ b/include/linux/interrupt.h
12569 @@ -61,6 +61,7 @@
12570   *                interrupt handler after suspending interrupts. For system
12571   *                wakeup devices users need to implement wakeup detection in
12572   *                their interrupt handlers.
12573 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12574   */
12575  #define IRQF_SHARED            0x00000080
12576  #define IRQF_PROBE_SHARED      0x00000100
12577 @@ -74,6 +75,7 @@
12578  #define IRQF_NO_THREAD         0x00010000
12579  #define IRQF_EARLY_RESUME      0x00020000
12580  #define IRQF_COND_SUSPEND      0x00040000
12581 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
12583  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12585 @@ -186,7 +188,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
12586  #ifdef CONFIG_LOCKDEP
12587  # define local_irq_enable_in_hardirq() do { } while (0)
12588  #else
12589 -# define local_irq_enable_in_hardirq() local_irq_enable()
12590 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12591  #endif
12593  extern void disable_irq_nosync(unsigned int irq);
12594 @@ -206,6 +208,7 @@ extern void resume_device_irqs(void);
12595   * @irq:               Interrupt to which notification applies
12596   * @kref:              Reference count, for internal use
12597   * @work:              Work item, for internal use
12598 + * @list:              List item for deferred callbacks
12599   * @notify:            Function to be called on change.  This will be
12600   *                     called in process context.
12601   * @release:           Function to be called on release.  This will be
12602 @@ -217,6 +220,7 @@ struct irq_affinity_notify {
12603         unsigned int irq;
12604         struct kref kref;
12605         struct work_struct work;
12606 +       struct list_head list;
12607         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12608         void (*release)(struct kref *ref);
12609  };
12610 @@ -379,9 +383,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
12611                                  bool state);
12613  #ifdef CONFIG_IRQ_FORCED_THREADING
12614 +# ifndef CONFIG_PREEMPT_RT_BASE
12615  extern bool force_irqthreads;
12616 +# else
12617 +#  define force_irqthreads     (true)
12618 +# endif
12619  #else
12620 -#define force_irqthreads       (0)
12621 +#define force_irqthreads       (false)
12622  #endif
12624  #ifndef __ARCH_SET_SOFTIRQ_PENDING
12625 @@ -438,9 +446,10 @@ struct softirq_action
12626         void    (*action)(struct softirq_action *);
12627  };
12629 +#ifndef CONFIG_PREEMPT_RT_FULL
12630  asmlinkage void do_softirq(void);
12631  asmlinkage void __do_softirq(void);
12633 +static inline void thread_do_softirq(void) { do_softirq(); }
12634  #ifdef __ARCH_HAS_DO_SOFTIRQ
12635  void do_softirq_own_stack(void);
12636  #else
12637 @@ -449,13 +458,25 @@ static inline void do_softirq_own_stack(void)
12638         __do_softirq();
12640  #endif
12641 +#else
12642 +extern void thread_do_softirq(void);
12643 +#endif
12645  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12646  extern void softirq_init(void);
12647  extern void __raise_softirq_irqoff(unsigned int nr);
12648 +#ifdef CONFIG_PREEMPT_RT_FULL
12649 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12650 +#else
12651 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12653 +       __raise_softirq_irqoff(nr);
12655 +#endif
12657  extern void raise_softirq_irqoff(unsigned int nr);
12658  extern void raise_softirq(unsigned int nr);
12659 +extern void softirq_check_pending_idle(void);
12661  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12663 @@ -477,8 +498,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
12664       to be executed on some cpu at least once after this.
12665     * If the tasklet is already scheduled, but its execution is still not
12666       started, it will be executed only once.
12667 -   * If this tasklet is already running on another CPU (or schedule is called
12668 -     from tasklet itself), it is rescheduled for later.
12669 +   * If this tasklet is already running on another CPU, it is rescheduled
12670 +     for later.
12671 +   * tasklet_schedule() must not be called from the tasklet itself (it would cause a lockup).
12672     * Tasklet is strictly serialized wrt itself, but not
12673       wrt another tasklets. If client needs some intertask synchronization,
12674       he makes it with spinlocks.
12675 @@ -503,27 +525,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
12676  enum
12678         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
12679 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
12680 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
12681 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
12682  };
12684 -#ifdef CONFIG_SMP
12685 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
12686 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
12687 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12689 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12690  static inline int tasklet_trylock(struct tasklet_struct *t)
12692         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12695 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
12697 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12700  static inline void tasklet_unlock(struct tasklet_struct *t)
12702         smp_mb__before_atomic();
12703         clear_bit(TASKLET_STATE_RUN, &(t)->state);
12706 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12708 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12710 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
12712  #else
12713  #define tasklet_trylock(t) 1
12714 +#define tasklet_tryunlock(t)   1
12715  #define tasklet_unlock_wait(t) do { } while (0)
12716  #define tasklet_unlock(t) do { } while (0)
12717  #endif
12718 @@ -572,12 +603,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
12719         smp_mb();
12722 -static inline void tasklet_enable(struct tasklet_struct *t)
12724 -       smp_mb__before_atomic();
12725 -       atomic_dec(&t->count);
12728 +extern void tasklet_enable(struct tasklet_struct *t);
12729  extern void tasklet_kill(struct tasklet_struct *t);
12730  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12731  extern void tasklet_init(struct tasklet_struct *t,
12732 @@ -608,6 +634,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12733         tasklet_kill(&ttimer->tasklet);
12736 +#ifdef CONFIG_PREEMPT_RT_FULL
12737 +extern void softirq_early_init(void);
12738 +#else
12739 +static inline void softirq_early_init(void) { }
12740 +#endif
12742  /*
12743   * Autoprobing for irqs:
12744   *
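
IRQF_NO_SOFTIRQ_CALL is meant to be consumed by the RT interrupt-threading code elsewhere in this patch; a driver sets it at request time when its (forced-)threaded handler must not pick up pending softirqs on return. Hypothetical usage, with my_irq_handler() and the device pointer as placeholders and request_irq() being the stock API:

        static irqreturn_t my_irq_handler(int irq, void *dev_id)
        {
                /* ... acknowledge the device ... */
                return IRQ_HANDLED;
        }

        static int my_request(int irq, void *dev)
        {
                return request_irq(irq, my_irq_handler,
                                   IRQF_NO_SOFTIRQ_CALL, "my-dev", dev);
        }
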
12745 diff --git a/include/linux/irq.h b/include/linux/irq.h
12746 index f7cade00c525..dac9e11ba037 100644
12747 --- a/include/linux/irq.h
12748 +++ b/include/linux/irq.h
12749 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
12750   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
12751   *                               it from the spurious interrupt detection
12752   *                               mechanism and from core side polling.
12753 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
12754   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
12755   */
12756  enum {
12757 @@ -99,13 +100,14 @@ enum {
12758         IRQ_PER_CPU_DEVID       = (1 << 17),
12759         IRQ_IS_POLLED           = (1 << 18),
12760         IRQ_DISABLE_UNLAZY      = (1 << 19),
12761 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
12762  };
12764  #define IRQF_MODIFY_MASK       \
12765         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12766          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12767          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12768 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12769 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12771  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
12773 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
12774 index 47b9ebd4a74f..2543aab05daa 100644
12775 --- a/include/linux/irq_work.h
12776 +++ b/include/linux/irq_work.h
12777 @@ -16,6 +16,7 @@
12778  #define IRQ_WORK_BUSY          2UL
12779  #define IRQ_WORK_FLAGS         3UL
12780  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
12781 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
12783  struct irq_work {
12784         unsigned long flags;
12785 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
12786  static inline void irq_work_run(void) { }
12787  #endif
12789 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12790 +void irq_work_tick_soft(void);
12791 +#else
12792 +static inline void irq_work_tick_soft(void) { }
12793 +#endif
12795  #endif /* _LINUX_IRQ_WORK_H */
12796 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
12797 index a587a33363c7..ad57402a242d 100644
12798 --- a/include/linux/irqdesc.h
12799 +++ b/include/linux/irqdesc.h
12800 @@ -61,6 +61,7 @@ struct irq_desc {
12801         unsigned int            irqs_unhandled;
12802         atomic_t                threads_handled;
12803         int                     threads_handled_last;
12804 +       u64                     random_ip;
12805         raw_spinlock_t          lock;
12806         struct cpumask          *percpu_enabled;
12807  #ifdef CONFIG_SMP
12808 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
12809 index 5dd1272d1ab2..9b77034f7c5e 100644
12810 --- a/include/linux/irqflags.h
12811 +++ b/include/linux/irqflags.h
12812 @@ -25,8 +25,6 @@
12813  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
12814  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
12815  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
12816 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
12817 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
12818  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
12819  #else
12820  # define trace_hardirqs_on()           do { } while (0)
12821 @@ -39,9 +37,15 @@
12822  # define trace_softirqs_enabled(p)     0
12823  # define trace_hardirq_enter()         do { } while (0)
12824  # define trace_hardirq_exit()          do { } while (0)
12825 +# define INIT_TRACE_IRQFLAGS
12826 +#endif
12828 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12829 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
12830 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
12831 +#else
12832  # define lockdep_softirq_enter()       do { } while (0)
12833  # define lockdep_softirq_exit()                do { } while (0)
12834 -# define INIT_TRACE_IRQFLAGS
12835  #endif
12837  #if defined(CONFIG_IRQSOFF_TRACER) || \
12838 @@ -148,4 +152,23 @@
12840  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12843 + * local_irq* variants depending on RT/!RT
12844 + */
12845 +#ifdef CONFIG_PREEMPT_RT_FULL
12846 +# define local_irq_disable_nort()      do { } while (0)
12847 +# define local_irq_enable_nort()       do { } while (0)
12848 +# define local_irq_save_nort(flags)    local_save_flags(flags)
12849 +# define local_irq_restore_nort(flags) (void)(flags)
12850 +# define local_irq_disable_rt()                local_irq_disable()
12851 +# define local_irq_enable_rt()         local_irq_enable()
12852 +#else
12853 +# define local_irq_disable_nort()      local_irq_disable()
12854 +# define local_irq_enable_nort()       local_irq_enable()
12855 +# define local_irq_save_nort(flags)    local_irq_save(flags)
12856 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
12857 +# define local_irq_disable_rt()                do { } while (0)
12858 +# define local_irq_enable_rt()         do { } while (0)
12859 +#endif
12861  #endif
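
The _nort variants give code that only guards against hard interrupts on !RT a way to stay preemptible on RT, where those handlers are threaded anyway. A short illustrative sketch; the function and the state it guards are hypothetical:

        static void update_percpu_stats(void)
        {
                unsigned long flags;

                /* !RT: real local_irq_save(); RT: only saves the flags word,
                 * interrupts stay enabled and the section stays preemptible */
                local_irq_save_nort(flags);
                /* ... touch state that only hardirq context could race with ... */
                local_irq_restore_nort(flags);
        }
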
12862 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
12863 index 65407f6c9120..eb5aabe4e18c 100644
12864 --- a/include/linux/jbd2.h
12865 +++ b/include/linux/jbd2.h
12866 @@ -352,32 +352,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
12868  static inline void jbd_lock_bh_state(struct buffer_head *bh)
12870 +#ifndef CONFIG_PREEMPT_RT_BASE
12871         bit_spin_lock(BH_State, &bh->b_state);
12872 +#else
12873 +       spin_lock(&bh->b_state_lock);
12874 +#endif
12877  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12879 +#ifndef CONFIG_PREEMPT_RT_BASE
12880         return bit_spin_trylock(BH_State, &bh->b_state);
12881 +#else
12882 +       return spin_trylock(&bh->b_state_lock);
12883 +#endif
12886  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12888 +#ifndef CONFIG_PREEMPT_RT_BASE
12889         return bit_spin_is_locked(BH_State, &bh->b_state);
12890 +#else
12891 +       return spin_is_locked(&bh->b_state_lock);
12892 +#endif
12895  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12897 +#ifndef CONFIG_PREEMPT_RT_BASE
12898         bit_spin_unlock(BH_State, &bh->b_state);
12899 +#else
12900 +       spin_unlock(&bh->b_state_lock);
12901 +#endif
12904  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12906 +#ifndef CONFIG_PREEMPT_RT_BASE
12907         bit_spin_lock(BH_JournalHead, &bh->b_state);
12908 +#else
12909 +       spin_lock(&bh->b_journal_head_lock);
12910 +#endif
12913  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12915 +#ifndef CONFIG_PREEMPT_RT_BASE
12916         bit_spin_unlock(BH_JournalHead, &bh->b_state);
12917 +#else
12918 +       spin_unlock(&bh->b_journal_head_lock);
12919 +#endif
12922  #define J_ASSERT(assert)       BUG_ON(!(assert))
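The jbd2 change above shows a pattern used throughout the RT patch: a bit spinlock (which implicitly disables preemption) is replaced on PREEMPT_RT_BASE by a real, sleeping spinlock stored next to the data it protects, while callers remain untouched. A small illustrative caller; demo_touch_jh() is hypothetical.

#include <linux/buffer_head.h>
#include <linux/jbd2.h>

static void demo_touch_jh(struct buffer_head *bh)
{
        jbd_lock_bh_state(bh);          /* bit_spin_lock() or spin_lock() */
        /* ... inspect or modify bh2jh(bh) here ... */
        jbd_unlock_bh_state(bh);
}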
12923 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
12924 index a19bcf9e762e..897495386446 100644
12925 --- a/include/linux/kdb.h
12926 +++ b/include/linux/kdb.h
12927 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
12928  extern __printf(1, 2) int kdb_printf(const char *, ...);
12929  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12931 +#define in_kdb_printk()        (kdb_trap_printk)
12932  extern void kdb_init(int level);
12934  /* Access to kdb specific polling devices */
12935 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
12936  extern int kdb_unregister(char *);
12937  #else /* ! CONFIG_KGDB_KDB */
12938  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12939 +#define in_kdb_printk() (0)
12940  static inline void kdb_init(int level) {}
12941  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12942                                char *help, short minlen) { return 0; }
12943 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
12944 index 50220cab738c..d68f639f7330 100644
12945 --- a/include/linux/kernel.h
12946 +++ b/include/linux/kernel.h
12947 @@ -188,6 +188,9 @@ extern int _cond_resched(void);
12948   */
12949  # define might_sleep() \
12950         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12952 +# define might_sleep_no_state_check() \
12953 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12954  # define sched_annotate_sleep()        (current->task_state_change = 0)
12955  #else
12956    static inline void ___might_sleep(const char *file, int line,
12957 @@ -195,6 +198,7 @@ extern int _cond_resched(void);
12958    static inline void __might_sleep(const char *file, int line,
12959                                    int preempt_offset) { }
12960  # define might_sleep() do { might_resched(); } while (0)
12961 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
12962  # define sched_annotate_sleep() do { } while (0)
12963  #endif
12965 @@ -255,6 +259,7 @@ extern long (*panic_blink)(int state);
12966  __printf(1, 2)
12967  void panic(const char *fmt, ...)
12968         __noreturn __cold;
12969 +void nmi_panic(struct pt_regs *regs, const char *msg);
12970  extern void oops_enter(void);
12971  extern void oops_exit(void);
12972  void print_oops_end_marker(void);
12973 @@ -447,6 +452,14 @@ extern int sysctl_panic_on_stackoverflow;
12975  extern bool crash_kexec_post_notifiers;
12977 +/*
12978 + * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
12979 + * holds the number of the CPU that is currently executing panic(). A value of
12980 + * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
12981 + */
12982 +extern atomic_t panic_cpu;
12983 +#define PANIC_CPU_INVALID      -1
12985  /*
12986   * Only to be used by arch init code. If the user over-wrote the default
12987   * CONFIG_PANIC_TIMEOUT, honor it.
12988 @@ -475,6 +488,7 @@ extern enum system_states {
12989         SYSTEM_HALT,
12990         SYSTEM_POWER_OFF,
12991         SYSTEM_RESTART,
12992 +       SYSTEM_SUSPEND,
12993  } system_state;
12995  #define TAINT_PROPRIETARY_MODULE       0
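The new panic_cpu/PANIC_CPU_INVALID pair lets panic() and crash_kexec() elect a single CPU with one atomic compare-and-exchange, and the nmi_panic() prototype added above builds on the same idea for panics raised from NMI context. A rough sketch of that arbitration, on the assumption that the real implementation lives elsewhere in this patch; demo_enter_panic() is hypothetical.

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/smp.h>

static void demo_enter_panic(void)
{
        int old_cpu, this_cpu = raw_smp_processor_id();

        /* Only the first CPU to swap in its id gets to run panic(). */
        old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
        if (old_cpu == PANIC_CPU_INVALID)
                panic("demo: this CPU won the race");
        /* otherwise another CPU is already panicking: spin or halt here */
}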
12996 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
12997 index d7ce4e3280db..0b6d392b38e7 100644
12998 --- a/include/linux/kvm_host.h
12999 +++ b/include/linux/kvm_host.h
13000 @@ -25,6 +25,7 @@
13001  #include <linux/irqflags.h>
13002  #include <linux/context_tracking.h>
13003  #include <linux/irqbypass.h>
13004 +#include <linux/swait.h>
13005  #include <asm/signal.h>
13007  #include <linux/kvm.h>
13008 @@ -243,7 +244,7 @@ struct kvm_vcpu {
13009         int fpu_active;
13010         int guest_fpu_loaded, guest_xcr0_loaded;
13011         unsigned char fpu_counter;
13012 -       wait_queue_head_t wq;
13013 +       struct swait_queue_head wq;
13014         struct pid *pid;
13015         int sigset_active;
13016         sigset_t sigset;
13017 @@ -794,7 +795,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
13019  #endif
13021 -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
13022 +static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
13024  #ifdef __KVM_HAVE_ARCH_WQP
13025         return vcpu->arch.wqp;
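Switching struct kvm_vcpu from wait_queue_head_t to the simple-wait type keeps the wakeup path short and raw-spinlock based, which matters on RT where regular wait queues take a sleeping lock. A tiny sketch of that API, assuming the swait primitives this patch introduces elsewhere (include/linux/swait.h); the demo_* names are hypothetical.

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_event;

static void demo_waiter(void)
{
        /* Sleeps until demo_event is set; FIFO wakeups, no exclusive mode. */
        swait_event_interruptible(demo_wq, demo_event);
}

static void demo_waker(void)
{
        demo_event = true;
        swake_up(&demo_wq);             /* swake_up_all() would wake everyone */
}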
13026 diff --git a/include/linux/lglock.h b/include/linux/lglock.h
13027 index c92ebd100d9b..6f035f635d0e 100644
13028 --- a/include/linux/lglock.h
13029 +++ b/include/linux/lglock.h
13030 @@ -34,13 +34,30 @@
13031  #endif
13033  struct lglock {
13034 +#ifdef CONFIG_PREEMPT_RT_FULL
13035 +       struct rt_mutex __percpu *lock;
13036 +#else
13037         arch_spinlock_t __percpu *lock;
13038 +#endif
13039  #ifdef CONFIG_DEBUG_LOCK_ALLOC
13040         struct lock_class_key lock_key;
13041         struct lockdep_map    lock_dep_map;
13042  #endif
13043  };
13045 +#ifdef CONFIG_PREEMPT_RT_FULL
13046 +# define DEFINE_LGLOCK(name)                                           \
13047 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
13048 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
13049 +       struct lglock name = { .lock = &name ## _lock }
13051 +# define DEFINE_STATIC_LGLOCK(name)                                    \
13052 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
13053 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
13054 +       static struct lglock name = { .lock = &name ## _lock }
13056 +#else
13058  #define DEFINE_LGLOCK(name)                                            \
13059         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
13060         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
13061 @@ -50,6 +67,7 @@ struct lglock {
13062         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
13063         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
13064         static struct lglock name = { .lock = &name ## _lock }
13065 +#endif
13067  void lg_lock_init(struct lglock *lg, char *name);
13069 @@ -64,6 +82,12 @@ void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
13070  void lg_global_lock(struct lglock *lg);
13071  void lg_global_unlock(struct lglock *lg);
13073 +#ifndef CONFIG_PREEMPT_RT_FULL
13074 +#define lg_global_trylock_relax(name)  lg_global_lock(name)
13075 +#else
13076 +void lg_global_trylock_relax(struct lglock *lg);
13077 +#endif
13079  #else
13080  /* When !CONFIG_SMP, map lglock to spinlock */
13081  #define lglock spinlock
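lglock provides a cheap per-CPU lock plus an expensive take-them-all path for global operations; on PREEMPT_RT_FULL the per-CPU arch spinlocks become rt_mutexes so both paths can sleep and be preempted. A minimal usage sketch with a hypothetical demo_lg; callers look the same on RT and !RT.

#include <linux/lglock.h>

DEFINE_STATIC_LGLOCK(demo_lg);

static void demo_fast_path(void)
{
        lg_local_lock(&demo_lg);        /* this CPU's lock only */
        /* ... touch this CPU's share of the data ... */
        lg_local_unlock(&demo_lg);
}

static void demo_slow_path(void)
{
        lg_global_lock(&demo_lg);       /* takes every CPU's lock */
        /* ... walk all CPUs' data ... */
        lg_global_unlock(&demo_lg);
}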
13082 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
13083 index 8132214e8efd..89ffaa7bd342 100644
13084 --- a/include/linux/list_bl.h
13085 +++ b/include/linux/list_bl.h
13086 @@ -2,6 +2,7 @@
13087  #define _LINUX_LIST_BL_H
13089  #include <linux/list.h>
13090 +#include <linux/spinlock.h>
13091  #include <linux/bit_spinlock.h>
13093  /*
13094 @@ -32,13 +33,24 @@
13096  struct hlist_bl_head {
13097         struct hlist_bl_node *first;
13098 +#ifdef CONFIG_PREEMPT_RT_BASE
13099 +       raw_spinlock_t lock;
13100 +#endif
13101  };
13103  struct hlist_bl_node {
13104         struct hlist_bl_node *next, **pprev;
13105  };
13106 -#define INIT_HLIST_BL_HEAD(ptr) \
13107 -       ((ptr)->first = NULL)
13109 +#ifdef CONFIG_PREEMPT_RT_BASE
13110 +#define INIT_HLIST_BL_HEAD(h)          \
13111 +do {                                   \
13112 +       (h)->first = NULL;              \
13113 +       raw_spin_lock_init(&(h)->lock); \
13114 +} while (0)
13115 +#else
13116 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
13117 +#endif
13119  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
13121 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
13123  static inline void hlist_bl_lock(struct hlist_bl_head *b)
13125 +#ifndef CONFIG_PREEMPT_RT_BASE
13126         bit_spin_lock(0, (unsigned long *)b);
13127 +#else
13128 +       raw_spin_lock(&b->lock);
13129 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
13130 +       __set_bit(0, (unsigned long *)b);
13131 +#endif
13132 +#endif
13135  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
13137 +#ifndef CONFIG_PREEMPT_RT_BASE
13138         __bit_spin_unlock(0, (unsigned long *)b);
13139 +#else
13140 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
13141 +       __clear_bit(0, (unsigned long *)b);
13142 +#endif
13143 +       raw_spin_unlock(&b->lock);
13144 +#endif
13147  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
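hlist_bl heads keep their lock in bit 0 of the first pointer; since a bit spinlock cannot sleep, PREEMPT_RT_BASE adds a raw_spinlock_t to the head and only mirrors the lock state into bit 0 where SMP/DEBUG_SPINLOCK needs it. Callers are unaffected, as in this hypothetical hash-table sketch (demo_* names are not from the patch).

#include <linux/list_bl.h>

#define DEMO_BUCKETS 16

static struct hlist_bl_head demo_hash[DEMO_BUCKETS];

static void demo_hash_init(void)
{
        int i;

        for (i = 0; i < DEMO_BUCKETS; i++)
                INIT_HLIST_BL_HEAD(&demo_hash[i]);   /* also inits ->lock on RT */
}

static void demo_hash_add(struct hlist_bl_node *n, unsigned int bucket)
{
        hlist_bl_lock(&demo_hash[bucket]);      /* bit lock, or raw spinlock on RT */
        hlist_bl_add_head(n, &demo_hash[bucket]);
        hlist_bl_unlock(&demo_hash[bucket]);
}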
13148 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
13149 new file mode 100644
13150 index 000000000000..0baaf28dc4ee
13151 --- /dev/null
13152 +++ b/include/linux/locallock.h
13153 @@ -0,0 +1,285 @@
13154 +#ifndef _LINUX_LOCALLOCK_H
13155 +#define _LINUX_LOCALLOCK_H
13157 +#include <linux/percpu.h>
13158 +#include <linux/spinlock.h>
13160 +#ifdef CONFIG_PREEMPT_RT_BASE
13162 +#ifdef CONFIG_DEBUG_SPINLOCK
13163 +# define LL_WARN(cond) WARN_ON(cond)
13164 +#else
13165 +# define LL_WARN(cond) do { } while (0)
13166 +#endif
13169 + * per cpu lock based substitute for local_irq_*()
13170 + */
13171 +struct local_irq_lock {
13172 +       spinlock_t              lock;
13173 +       struct task_struct      *owner;
13174 +       int                     nestcnt;
13175 +       unsigned long           flags;
13178 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
13179 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
13180 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
13182 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
13183 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
13185 +#define local_irq_lock_init(lvar)                                      \
13186 +       do {                                                            \
13187 +               int __cpu;                                              \
13188 +               for_each_possible_cpu(__cpu)                            \
13189 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
13190 +       } while (0)
13192 +/*
13193 + * spin_lock|trylock|unlock_local flavours that do not disable migration.
13194 + * They are used by __local_lock|trylock|unlock, where get_local_var/
13195 + * put_local_var already take care of migrate_disable/migrate_enable.
13196 + * Without CONFIG_PREEMPT_RT_FULL these map to the normal spin_* calls.
13197 + */
13198 +#ifdef CONFIG_PREEMPT_RT_FULL
13199 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
13200 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
13201 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
13202 +#else
13203 +# define spin_lock_local(lock)                 spin_lock(lock)
13204 +# define spin_trylock_local(lock)              spin_trylock(lock)
13205 +# define spin_unlock_local(lock)               spin_unlock(lock)
13206 +#endif
13208 +static inline void __local_lock(struct local_irq_lock *lv)
13210 +       if (lv->owner != current) {
13211 +               spin_lock_local(&lv->lock);
13212 +               LL_WARN(lv->owner);
13213 +               LL_WARN(lv->nestcnt);
13214 +               lv->owner = current;
13215 +       }
13216 +       lv->nestcnt++;
13219 +#define local_lock(lvar)                                       \
13220 +       do { __local_lock(&get_local_var(lvar)); } while (0)
13222 +#define local_lock_on(lvar, cpu)                               \
13223 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
13225 +static inline int __local_trylock(struct local_irq_lock *lv)
13227 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
13228 +               LL_WARN(lv->owner);
13229 +               LL_WARN(lv->nestcnt);
13230 +               lv->owner = current;
13231 +               lv->nestcnt = 1;
13232 +               return 1;
13233 +       } else if (lv->owner == current) {
13234 +               lv->nestcnt++;
13235 +               return 1;
13236 +       }
13237 +       return 0;
13240 +#define local_trylock(lvar)                                            \
13241 +       ({                                                              \
13242 +               int __locked;                                           \
13243 +               __locked = __local_trylock(&get_local_var(lvar));       \
13244 +               if (!__locked)                                          \
13245 +                       put_local_var(lvar);                            \
13246 +               __locked;                                               \
13247 +       })
13249 +static inline void __local_unlock(struct local_irq_lock *lv)
13251 +       LL_WARN(lv->nestcnt == 0);
13252 +       LL_WARN(lv->owner != current);
13253 +       if (--lv->nestcnt)
13254 +               return;
13256 +       lv->owner = NULL;
13257 +       spin_unlock_local(&lv->lock);
13260 +#define local_unlock(lvar)                                     \
13261 +       do {                                                    \
13262 +               __local_unlock(this_cpu_ptr(&lvar));            \
13263 +               put_local_var(lvar);                            \
13264 +       } while (0)
13266 +#define local_unlock_on(lvar, cpu)                       \
13267 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
13269 +static inline void __local_lock_irq(struct local_irq_lock *lv)
13271 +       spin_lock_irqsave(&lv->lock, lv->flags);
13272 +       LL_WARN(lv->owner);
13273 +       LL_WARN(lv->nestcnt);
13274 +       lv->owner = current;
13275 +       lv->nestcnt = 1;
13278 +#define local_lock_irq(lvar)                                           \
13279 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
13281 +#define local_lock_irq_on(lvar, cpu)                                   \
13282 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
13284 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
13286 +       LL_WARN(!lv->nestcnt);
13287 +       LL_WARN(lv->owner != current);
13288 +       lv->owner = NULL;
13289 +       lv->nestcnt = 0;
13290 +       spin_unlock_irq(&lv->lock);
13293 +#define local_unlock_irq(lvar)                                         \
13294 +       do {                                                            \
13295 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
13296 +               put_local_var(lvar);                                    \
13297 +       } while (0)
13299 +#define local_unlock_irq_on(lvar, cpu)                                 \
13300 +       do {                                                            \
13301 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
13302 +       } while (0)
13304 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
13306 +       if (lv->owner != current) {
13307 +               __local_lock_irq(lv);
13308 +               return 0;
13309 +       } else {
13310 +               lv->nestcnt++;
13311 +               return 1;
13312 +       }
13315 +#define local_lock_irqsave(lvar, _flags)                               \
13316 +       do {                                                            \
13317 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
13318 +                       put_local_var(lvar);                            \
13319 +               _flags = __this_cpu_read(lvar.flags);                   \
13320 +       } while (0)
13322 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
13323 +       do {                                                            \
13324 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
13325 +               _flags = per_cpu(lvar, cpu).flags;                      \
13326 +       } while (0)
13328 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
13329 +                                           unsigned long flags)
13331 +       LL_WARN(!lv->nestcnt);
13332 +       LL_WARN(lv->owner != current);
13333 +       if (--lv->nestcnt)
13334 +               return 0;
13336 +       lv->owner = NULL;
13337 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
13338 +       return 1;
13341 +#define local_unlock_irqrestore(lvar, flags)                           \
13342 +       do {                                                            \
13343 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
13344 +                       put_local_var(lvar);                            \
13345 +       } while (0)
13347 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
13348 +       do {                                                            \
13349 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
13350 +       } while (0)
13352 +#define local_spin_trylock_irq(lvar, lock)                             \
13353 +       ({                                                              \
13354 +               int __locked;                                           \
13355 +               local_lock_irq(lvar);                                   \
13356 +               __locked = spin_trylock(lock);                          \
13357 +               if (!__locked)                                          \
13358 +                       local_unlock_irq(lvar);                         \
13359 +               __locked;                                               \
13360 +       })
13362 +#define local_spin_lock_irq(lvar, lock)                                        \
13363 +       do {                                                            \
13364 +               local_lock_irq(lvar);                                   \
13365 +               spin_lock(lock);                                        \
13366 +       } while (0)
13368 +#define local_spin_unlock_irq(lvar, lock)                              \
13369 +       do {                                                            \
13370 +               spin_unlock(lock);                                      \
13371 +               local_unlock_irq(lvar);                                 \
13372 +       } while (0)
13374 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
13375 +       do {                                                            \
13376 +               local_lock_irqsave(lvar, flags);                        \
13377 +               spin_lock(lock);                                        \
13378 +       } while (0)
13380 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
13381 +       do {                                                            \
13382 +               spin_unlock(lock);                                      \
13383 +               local_unlock_irqrestore(lvar, flags);                   \
13384 +       } while (0)
13386 +#define get_locked_var(lvar, var)                                      \
13387 +       (*({                                                            \
13388 +               local_lock(lvar);                                       \
13389 +               this_cpu_ptr(&var);                                     \
13390 +       }))
13392 +#define put_locked_var(lvar, var)      local_unlock(lvar);
13394 +#define local_lock_cpu(lvar)                                           \
13395 +       ({                                                              \
13396 +               local_lock(lvar);                                       \
13397 +               smp_processor_id();                                     \
13398 +       })
13400 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
13402 +#else /* PREEMPT_RT_BASE */
13404 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
13405 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
13407 +static inline void local_irq_lock_init(int lvar) { }
13409 +#define local_trylock(lvar)                                    \
13410 +       ({                                                      \
13411 +               preempt_disable();                              \
13412 +               1;                                              \
13413 +       })
13415 +#define local_lock(lvar)                       preempt_disable()
13416 +#define local_unlock(lvar)                     preempt_enable()
13417 +#define local_lock_irq(lvar)                   local_irq_disable()
13418 +#define local_unlock_irq(lvar)                 local_irq_enable()
13419 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
13420 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
13422 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
13423 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
13424 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
13425 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
13426 +       spin_lock_irqsave(lock, flags)
13427 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
13428 +       spin_unlock_irqrestore(lock, flags)
13430 +#define get_locked_var(lvar, var)              get_cpu_var(var)
13431 +#define put_locked_var(lvar, var)              put_cpu_var(var)
13433 +#define local_lock_cpu(lvar)                   get_cpu()
13434 +#define local_unlock_cpu(lvar)                 put_cpu()
13436 +#endif
13438 +#endif
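locallock.h is the workhorse of this patch: code that used local_irq_save()/local_irq_disable() purely to protect per-CPU data can switch to a named per-CPU lock, and everything collapses back to the plain IRQ operations on !RT. A minimal sketch, assuming a hypothetical per-CPU list; demo_list and demo_lock are not part of the patch.

#include <linux/list.h>
#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct list_head, demo_list);    /* initialized elsewhere */
static DEFINE_LOCAL_IRQ_LOCK(demo_lock);

static void demo_add(struct list_head *entry)
{
        unsigned long flags;

        /*
         * !RT: plain local_irq_save().  RT: takes this CPU's sleeping
         * spinlock, so the section stays preemptible but is still
         * serialized against other users of demo_lock on this CPU.
         */
        local_lock_irqsave(demo_lock, flags);
        list_add(entry, this_cpu_ptr(&demo_list));
        local_unlock_irqrestore(demo_lock, flags);
}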
13439 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
13440 index 36f4695aa604..2be9cca8be08 100644
13441 --- a/include/linux/mm_types.h
13442 +++ b/include/linux/mm_types.h
13443 @@ -11,6 +11,7 @@
13444  #include <linux/completion.h>
13445  #include <linux/cpumask.h>
13446  #include <linux/uprobes.h>
13447 +#include <linux/rcupdate.h>
13448  #include <linux/page-flags-layout.h>
13449  #include <asm/page.h>
13450  #include <asm/mmu.h>
13451 @@ -509,6 +510,9 @@ struct mm_struct {
13452         bool tlb_flush_batched;
13453  #endif
13454         struct uprobes_state uprobes_state;
13455 +#ifdef CONFIG_PREEMPT_RT_BASE
13456 +       struct rcu_head delayed_drop;
13457 +#endif
13458  #ifdef CONFIG_X86_INTEL_MPX
13459         /* address of the bounds directory */
13460         void __user *bd_addr;
13461 diff --git a/include/linux/module.h b/include/linux/module.h
13462 index b229a9961d02..5fea847cf95c 100644
13463 --- a/include/linux/module.h
13464 +++ b/include/linux/module.h
13465 @@ -500,6 +500,7 @@ static inline int module_is_live(struct module *mod)
13466  struct module *__module_text_address(unsigned long addr);
13467  struct module *__module_address(unsigned long addr);
13468  bool is_module_address(unsigned long addr);
13469 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
13470  bool is_module_percpu_address(unsigned long addr);
13471  bool is_module_text_address(unsigned long addr);
13473 @@ -665,6 +666,11 @@ static inline bool is_module_percpu_address(unsigned long addr)
13474         return false;
13477 +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
13479 +       return false;
13482  static inline bool is_module_text_address(unsigned long addr)
13484         return false;
13485 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
13486 index 2cb7531e7d7a..b3fdfc820216 100644
13487 --- a/include/linux/mutex.h
13488 +++ b/include/linux/mutex.h
13489 @@ -19,6 +19,17 @@
13490  #include <asm/processor.h>
13491  #include <linux/osq_lock.h>
13493 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13494 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13495 +       , .dep_map = { .name = #lockname }
13496 +#else
13497 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13498 +#endif
13500 +#ifdef CONFIG_PREEMPT_RT_FULL
13501 +# include <linux/mutex_rt.h>
13502 +#else
13504  /*
13505   * Simple, straightforward mutexes with strict semantics:
13506   *
13507 @@ -99,13 +110,6 @@ do {                                                        \
13508  static inline void mutex_destroy(struct mutex *lock) {}
13509  #endif
13511 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
13512 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13513 -               , .dep_map = { .name = #lockname }
13514 -#else
13515 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13516 -#endif
13518  #define __MUTEX_INITIALIZER(lockname) \
13519                 { .count = ATOMIC_INIT(1) \
13520                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
13521 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
13522  extern int mutex_trylock(struct mutex *lock);
13523  extern void mutex_unlock(struct mutex *lock);
13525 +#endif /* !PREEMPT_RT_FULL */
13527  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
13529  #endif /* __LINUX_MUTEX_H */
13530 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
13531 new file mode 100644
13532 index 000000000000..e0284edec655
13533 --- /dev/null
13534 +++ b/include/linux/mutex_rt.h
13535 @@ -0,0 +1,89 @@
13536 +#ifndef __LINUX_MUTEX_RT_H
13537 +#define __LINUX_MUTEX_RT_H
13539 +#ifndef __LINUX_MUTEX_H
13540 +#error "Please include mutex.h"
13541 +#endif
13543 +#include <linux/rtmutex.h>
13545 +/* FIXME: Just for __lockfunc */
13546 +#include <linux/spinlock.h>
13548 +struct mutex {
13549 +       struct rt_mutex         lock;
13550 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13551 +       struct lockdep_map      dep_map;
13552 +#endif
13555 +#define __MUTEX_INITIALIZER(mutexname)                                 \
13556 +       {                                                               \
13557 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
13558 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
13559 +       }
13561 +#define DEFINE_MUTEX(mutexname)                                                \
13562 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
13564 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
13565 +extern void __lockfunc _mutex_lock(struct mutex *lock);
13566 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
13567 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
13568 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
13569 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
13570 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
13571 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
13572 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
13573 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
13575 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
13576 +#define mutex_lock(l)                  _mutex_lock(l)
13577 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
13578 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
13579 +#define mutex_trylock(l)               _mutex_trylock(l)
13580 +#define mutex_unlock(l)                        _mutex_unlock(l)
13582 +#ifdef CONFIG_DEBUG_MUTEXES
13583 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
13584 +#else
13585 +static inline void mutex_destroy(struct mutex *lock) {}
13586 +#endif
13588 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13589 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
13590 +# define mutex_lock_interruptible_nested(l, s) \
13591 +                                       _mutex_lock_interruptible_nested(l, s)
13592 +# define mutex_lock_killable_nested(l, s) \
13593 +                                       _mutex_lock_killable_nested(l, s)
13595 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
13596 +do {                                                                   \
13597 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
13598 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
13599 +} while (0)
13601 +#else
13602 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
13603 +# define mutex_lock_interruptible_nested(l, s) \
13604 +                                       _mutex_lock_interruptible(l)
13605 +# define mutex_lock_killable_nested(l, s) \
13606 +                                       _mutex_lock_killable(l)
13607 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
13608 +#endif
13610 +# define mutex_init(mutex)                             \
13611 +do {                                                   \
13612 +       static struct lock_class_key __key;             \
13613 +                                                       \
13614 +       rt_mutex_init(&(mutex)->lock);                  \
13615 +       __mutex_do_init((mutex), #mutex, &__key);       \
13616 +} while (0)
13618 +# define __mutex_init(mutex, name, key)                        \
13619 +do {                                                   \
13620 +       rt_mutex_init(&(mutex)->lock);                  \
13621 +       __mutex_do_init((mutex), name, key);            \
13622 +} while (0)
13624 +#endif
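With PREEMPT_RT_FULL, struct mutex becomes a thin wrapper around rt_mutex, so every existing mutex user transparently gains priority inheritance. The API is unchanged, as this hypothetical sketch shows (demo_mutex/demo_critical are illustrative only).

#include <linux/errno.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_mutex);        /* rt_mutex-backed on PREEMPT_RT_FULL */

static int demo_critical(void)
{
        int ret;

        ret = mutex_lock_interruptible(&demo_mutex);
        if (ret)
                return ret;
        /* ... sleeping is allowed here; PI boosting applies on RT ... */
        mutex_unlock(&demo_mutex);
        return 0;
}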
13625 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13626 index fc54049e8286..5a5588a57cad 100644
13627 --- a/include/linux/netdevice.h
13628 +++ b/include/linux/netdevice.h
13629 @@ -390,7 +390,19 @@ typedef enum rx_handler_result rx_handler_result_t;
13630  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
13632  void __napi_schedule(struct napi_struct *n);
13634 +/*
13635 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
13636 + * run as threads, and they can also be preempted (without PREEMPT_RT,
13637 + * interrupt threads cannot be preempted). This means that calling
13638 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
13639 + * and can corrupt the napi->poll_list.
13640 + */
13641 +#ifdef CONFIG_PREEMPT_RT_FULL
13642 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
13643 +#else
13644  void __napi_schedule_irqoff(struct napi_struct *n);
13645 +#endif
13647  static inline bool napi_disable_pending(struct napi_struct *n)
13649 @@ -2288,11 +2300,20 @@ void netdev_freemem(struct net_device *dev);
13650  void synchronize_net(void);
13651  int init_dummy_netdev(struct net_device *dev);
13653 +#ifdef CONFIG_PREEMPT_RT_FULL
13654 +static inline int dev_recursion_level(void)
13656 +       return current->xmit_recursion;
13659 +#else
13661  DECLARE_PER_CPU(int, xmit_recursion);
13662  static inline int dev_recursion_level(void)
13664         return this_cpu_read(xmit_recursion);
13666 +#endif
13668  struct net_device *dev_get_by_index(struct net *net, int ifindex);
13669  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13670 @@ -2610,6 +2631,7 @@ struct softnet_data {
13671         unsigned int            dropped;
13672         struct sk_buff_head     input_pkt_queue;
13673         struct napi_struct      backlog;
13674 +       struct sk_buff_head     tofree_queue;
13676  };
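On RT the "irqoff" napi scheduling shortcut is unsafe because the threaded, preemptible handler no longer runs with interrupts hard-disabled, so the macro above falls back to __napi_schedule(), which does its own local_irq_save(). A hypothetical NIC handler illustrating that callers need not care:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

static irqreturn_t demo_nic_irq(int irq, void *data)
{
        struct napi_struct *napi = data;

        /* !RT: assumes hard irqs are off.  RT: plain __napi_schedule(). */
        __napi_schedule_irqoff(napi);
        return IRQ_HANDLED;
}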
13678 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
13679 index 04078e8a4803..a61c9609e32f 100644
13680 --- a/include/linux/netfilter/x_tables.h
13681 +++ b/include/linux/netfilter/x_tables.h
13682 @@ -4,6 +4,7 @@
13684  #include <linux/netdevice.h>
13685  #include <linux/static_key.h>
13686 +#include <linux/locallock.h>
13687  #include <uapi/linux/netfilter/x_tables.h>
13689  /**
13690 @@ -289,6 +290,8 @@ void xt_free_table_info(struct xt_table_info *info);
13691   */
13692  DECLARE_PER_CPU(seqcount_t, xt_recseq);
13694 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13696  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13697   *
13698   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13699 @@ -309,6 +312,9 @@ static inline unsigned int xt_write_recseq_begin(void)
13701         unsigned int addend;
13703 +       /* RT protection */
13704 +       local_lock(xt_write_lock);
13706         /*
13707          * Low order bit of sequence is set if we already
13708          * called xt_write_recseq_begin().
13709 @@ -339,6 +345,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
13710         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13711         smp_wmb();
13712         __this_cpu_add(xt_recseq.sequence, addend);
13713 +       local_unlock(xt_write_lock);
13716  /*
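xt_write_recseq_begin()/end() bump a per-CPU seqcount around counter updates; on !RT the caller's context already prevents migration, while on RT the new xt_write_lock local lock provides that guarantee. The writer-side pattern looks like this; demo_count_packet() is hypothetical.

#include <linux/netfilter/x_tables.h>

static void demo_count_packet(void)
{
        unsigned int addend;

        /* local_lock(xt_write_lock): preempt_disable() on !RT, per-CPU lock on RT. */
        addend = xt_write_recseq_begin();
        /* ... update this CPU's xt counters ... */
        xt_write_recseq_end(addend);    /* releases xt_write_lock on RT */
}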
13717 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
13718 index d14a4c362465..2e4414a0c1c4 100644
13719 --- a/include/linux/notifier.h
13720 +++ b/include/linux/notifier.h
13721 @@ -6,7 +6,7 @@
13722   *
13723   *                             Alan Cox <Alan.Cox@linux.org>
13724   */
13727  #ifndef _LINUX_NOTIFIER_H
13728  #define _LINUX_NOTIFIER_H
13729  #include <linux/errno.h>
13730 @@ -42,9 +42,7 @@
13731   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13732   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13733   * SRCU notifier chains should be used when the chain will be called very
13734 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
13735 - * chains are slightly more difficult to use because they require special
13736 - * runtime initialization.
13737 + * often but notifier_blocks will seldom be removed.
13738   */
13740  typedef        int (*notifier_fn_t)(struct notifier_block *nb,
13741 @@ -88,7 +86,7 @@ struct srcu_notifier_head {
13742                 (name)->head = NULL;            \
13743         } while (0)
13745 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13746 +/* srcu_notifier_heads must be cleaned up dynamically */
13747  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13748  #define srcu_cleanup_notifier_head(name)       \
13749                 cleanup_srcu_struct(&(name)->srcu);
13750 @@ -101,7 +99,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13751                 .head = NULL }
13752  #define RAW_NOTIFIER_INIT(name)        {                               \
13753                 .head = NULL }
13754 -/* srcu_notifier_heads cannot be initialized statically */
13756 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
13757 +       {                                                       \
13758 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
13759 +               .head = NULL,                                   \
13760 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
13761 +       }
13763  #define ATOMIC_NOTIFIER_HEAD(name)                             \
13764         struct atomic_notifier_head name =                      \
13765 @@ -113,6 +117,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13766         struct raw_notifier_head name =                         \
13767                 RAW_NOTIFIER_INIT(name)
13769 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
13770 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
13771 +                       name##_head_srcu_array);                \
13772 +       mod struct srcu_notifier_head name =                    \
13773 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
13775 +#define SRCU_NOTIFIER_HEAD(name)                               \
13776 +       _SRCU_NOTIFIER_HEAD(name, )
13778 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
13779 +       _SRCU_NOTIFIER_HEAD(name, static)
13781  #ifdef __KERNEL__
13783  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13784 @@ -182,12 +198,12 @@ static inline int notifier_to_errno(int ret)
13786  /*
13787   *     Declared notifiers so far. I can imagine quite a few more chains
13788 - *     over time (eg laptop power reset chains, reboot chain (to clean 
13789 + *     over time (eg laptop power reset chains, reboot chain (to clean
13790   *     device units up), device [un]mount chain, module load/unload chain,
13791 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
13792 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
13793   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13794   */
13797  /* CPU notifiers are defined in include/linux/cpu.h. */
13799  /* netdevice notifiers are defined in include/linux/netdevice.h */
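SRCU notifier heads previously had to be initialized at runtime with srcu_init_notifier_head(); the new SRCU_NOTIFIER_HEAD{,_STATIC}() macros make a fully static definition possible, which other parts of this patch rely on. A hypothetical chain using the new macro might look like the following (demo_* names are illustrative only).

#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD_STATIC(demo_chain);  /* no srcu_init_notifier_head() needed */

static int demo_cb(struct notifier_block *nb, unsigned long event, void *data)
{
        return NOTIFY_OK;
}

static struct notifier_block demo_nb = { .notifier_call = demo_cb };

static void demo_notify(void)
{
        srcu_notifier_chain_register(&demo_chain, &demo_nb);
        srcu_notifier_call_chain(&demo_chain, 0, NULL);
        srcu_notifier_chain_unregister(&demo_chain, &demo_nb);
}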
13800 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
13801 index caebf2a758dc..4ecc057b6e27 100644
13802 --- a/include/linux/percpu.h
13803 +++ b/include/linux/percpu.h
13804 @@ -24,6 +24,35 @@
13805          PERCPU_MODULE_RESERVE)
13806  #endif
13808 +#ifdef CONFIG_PREEMPT_RT_FULL
13810 +#define get_local_var(var) (*({                \
13811 +              migrate_disable();       \
13812 +              this_cpu_ptr(&var);      }))
13814 +#define put_local_var(var) do {        \
13815 +       (void)&(var);           \
13816 +       migrate_enable();       \
13817 +} while (0)
13819 +# define get_local_ptr(var) ({         \
13820 +               migrate_disable();      \
13821 +               this_cpu_ptr(var);      })
13823 +# define put_local_ptr(var) do {       \
13824 +       (void)(var);                    \
13825 +       migrate_enable();               \
13826 +} while (0)
13828 +#else
13830 +#define get_local_var(var)     get_cpu_var(var)
13831 +#define put_local_var(var)     put_cpu_var(var)
13832 +#define get_local_ptr(var)     get_cpu_ptr(var)
13833 +#define put_local_ptr(var)     put_cpu_ptr(var)
13835 +#endif
13837  /* minimum unit size, also is the maximum supported allocation size */
13838  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
13840 @@ -116,6 +145,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
13841  #endif
13843  extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
13844 +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
13845  extern bool is_kernel_percpu_address(unsigned long addr);
13847  #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
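get_local_var()/put_local_var() are the RT-friendly counterparts of get_cpu_var()/put_cpu_var(): on !RT they are the same thing, on RT they only pin the task to its current CPU via migrate_disable(), so the section remains preemptible. A sketch with a hypothetical per-CPU counter:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, demo_count);

static void demo_bump(void)
{
        /* !RT: get_cpu_var() (preemption off).  RT: migrate_disable() only. */
        get_local_var(demo_count)++;
        put_local_var(demo_count);
}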
13848 diff --git a/include/linux/pid.h b/include/linux/pid.h
13849 index 97b745ddece5..01a5460a0c85 100644
13850 --- a/include/linux/pid.h
13851 +++ b/include/linux/pid.h
13852 @@ -2,6 +2,7 @@
13853  #define _LINUX_PID_H
13855  #include <linux/rcupdate.h>
13856 +#include <linux/atomic.h>
13858  enum pid_type
13860 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
13861 index 7eeceac52dea..f97c54265904 100644
13862 --- a/include/linux/preempt.h
13863 +++ b/include/linux/preempt.h
13864 @@ -50,7 +50,11 @@
13865  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13866  #define NMI_OFFSET     (1UL << NMI_SHIFT)
13868 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13869 +#ifndef CONFIG_PREEMPT_RT_FULL
13870 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
13871 +#else
13872 +# define SOFTIRQ_DISABLE_OFFSET                (0)
13873 +#endif
13875  /* We use the MSB mostly because its available */
13876  #define PREEMPT_NEED_RESCHED   0x80000000
13877 @@ -59,9 +63,15 @@
13878  #include <asm/preempt.h>
13880  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
13881 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
13882  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13883                                  | NMI_MASK))
13884 +#ifndef CONFIG_PREEMPT_RT_FULL
13885 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
13886 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
13887 +#else
13888 +# define softirq_count()       (0UL)
13889 +extern int in_serving_softirq(void);
13890 +#endif
13892  /*
13893   * Are we doing bottom half or hardware interrupt processing?
13894 @@ -79,7 +89,6 @@
13895  #define in_irq()               (hardirq_count())
13896  #define in_softirq()           (softirq_count())
13897  #define in_interrupt()         (irq_count())
13898 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
13899  #define in_nmi()               (preempt_count() & NMI_MASK)
13900  #define in_task()              (!(preempt_count() & \
13901                                    (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
13902 @@ -96,7 +105,11 @@
13903  /*
13904   * The preempt_count offset after spin_lock()
13905   */
13906 +#if !defined(CONFIG_PREEMPT_RT_FULL)
13907  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
13908 +#else
13909 +#define PREEMPT_LOCK_OFFSET    0
13910 +#endif
13912  /*
13913   * The preempt_count offset needed for things like:
13914 @@ -145,6 +158,20 @@ extern void preempt_count_sub(int val);
13915  #define preempt_count_inc() preempt_count_add(1)
13916  #define preempt_count_dec() preempt_count_sub(1)
13918 +#ifdef CONFIG_PREEMPT_LAZY
13919 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
13920 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
13921 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
13922 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
13923 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
13924 +#else
13925 +#define add_preempt_lazy_count(val)    do { } while (0)
13926 +#define sub_preempt_lazy_count(val)    do { } while (0)
13927 +#define inc_preempt_lazy_count()       do { } while (0)
13928 +#define dec_preempt_lazy_count()       do { } while (0)
13929 +#define preempt_lazy_count()           (0)
13930 +#endif
13932  #ifdef CONFIG_PREEMPT_COUNT
13934  #define preempt_disable() \
13935 @@ -153,13 +180,25 @@ do { \
13936         barrier(); \
13937  } while (0)
13939 +#define preempt_lazy_disable() \
13940 +do { \
13941 +       inc_preempt_lazy_count(); \
13942 +       barrier(); \
13943 +} while (0)
13945  #define sched_preempt_enable_no_resched() \
13946  do { \
13947         barrier(); \
13948         preempt_count_dec(); \
13949  } while (0)
13951 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13952 +#ifdef CONFIG_PREEMPT_RT_BASE
13953 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13954 +# define preempt_check_resched_rt() preempt_check_resched()
13955 +#else
13956 +# define preempt_enable_no_resched() preempt_enable()
13957 +# define preempt_check_resched_rt() barrier();
13958 +#endif
13960  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
13962 @@ -184,6 +223,13 @@ do { \
13963                 __preempt_schedule(); \
13964  } while (0)
13966 +#define preempt_lazy_enable() \
13967 +do { \
13968 +       dec_preempt_lazy_count(); \
13969 +       barrier(); \
13970 +       preempt_check_resched(); \
13971 +} while (0)
13973  #else /* !CONFIG_PREEMPT */
13974  #define preempt_enable() \
13975  do { \
13976 @@ -229,6 +275,7 @@ do { \
13977  #define preempt_disable_notrace()              barrier()
13978  #define preempt_enable_no_resched_notrace()    barrier()
13979  #define preempt_enable_notrace()               barrier()
13980 +#define preempt_check_resched_rt()             barrier()
13981  #define preemptible()                          0
13983  #endif /* CONFIG_PREEMPT_COUNT */
13984 @@ -249,10 +296,31 @@ do { \
13985  } while (0)
13986  #define preempt_fold_need_resched() \
13987  do { \
13988 -       if (tif_need_resched()) \
13989 +       if (tif_need_resched_now()) \
13990                 set_preempt_need_resched(); \
13991  } while (0)
13993 +#ifdef CONFIG_PREEMPT_RT_FULL
13994 +# define preempt_disable_rt()          preempt_disable()
13995 +# define preempt_enable_rt()           preempt_enable()
13996 +# define preempt_disable_nort()                barrier()
13997 +# define preempt_enable_nort()         barrier()
13998 +# ifdef CONFIG_SMP
13999 +   extern void migrate_disable(void);
14000 +   extern void migrate_enable(void);
14001 +# else /* CONFIG_SMP */
14002 +#  define migrate_disable()            barrier()
14003 +#  define migrate_enable()             barrier()
14004 +# endif /* CONFIG_SMP */
14005 +#else
14006 +# define preempt_disable_rt()          barrier()
14007 +# define preempt_enable_rt()           barrier()
14008 +# define preempt_disable_nort()                preempt_disable()
14009 +# define preempt_enable_nort()         preempt_enable()
14010 +# define migrate_disable()             preempt_disable()
14011 +# define migrate_enable()              preempt_enable()
14012 +#endif
14014  #ifdef CONFIG_PREEMPT_NOTIFIERS
14016  struct preempt_notifier;
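migrate_disable()/migrate_enable() are the central new primitives here: on !RT they are simply preempt_disable()/preempt_enable(), while on PREEMPT_RT_FULL with SMP they pin the task to its CPU without disabling preemption. Typical use, with a hypothetical helper:

#include <linux/preempt.h>
#include <linux/smp.h>

static void demo_stay_on_cpu(void)
{
        int cpu;

        migrate_disable();              /* preempt_disable() on !RT */
        cpu = smp_processor_id();
        /* ... per-CPU work bound to 'cpu'; still preemptible on RT ... */
        migrate_enable();
}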
14017 diff --git a/include/linux/printk.h b/include/linux/printk.h
14018 index 9729565c25ff..9cdca696b718 100644
14019 --- a/include/linux/printk.h
14020 +++ b/include/linux/printk.h
14021 @@ -117,9 +117,11 @@ int no_printk(const char *fmt, ...)
14022  #ifdef CONFIG_EARLY_PRINTK
14023  extern asmlinkage __printf(1, 2)
14024  void early_printk(const char *fmt, ...);
14025 +extern void printk_kill(void);
14026  #else
14027  static inline __printf(1, 2) __cold
14028  void early_printk(const char *s, ...) { }
14029 +static inline void printk_kill(void) { }
14030  #endif
14032  typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
14033 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
14034 index 5d5174b59802..327dddaf4c8f 100644
14035 --- a/include/linux/radix-tree.h
14036 +++ b/include/linux/radix-tree.h
14037 @@ -279,6 +279,8 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
14038                         unsigned long first_index, unsigned int max_items);
14039  int radix_tree_preload(gfp_t gfp_mask);
14040  int radix_tree_maybe_preload(gfp_t gfp_mask);
14041 +void radix_tree_preload_end(void);
14043  void radix_tree_init(void);
14044  void *radix_tree_tag_set(struct radix_tree_root *root,
14045                         unsigned long index, unsigned int tag);
14046 @@ -301,11 +303,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
14047  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
14048  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
14050 -static inline void radix_tree_preload_end(void)
14052 -       preempt_enable();
14055  /**
14056   * struct radix_tree_iter - radix tree iterator state
14057   *
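radix_tree_preload_end() moves out of line, presumably because on RT it must release more than a bare preempt_enable() for the per-CPU preload pool. The caller-side pattern is unchanged; a hypothetical insert helper (demo_tree, demo_tree_lock and demo_insert are not from the patch):

#include <linux/gfp.h>
#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(demo_tree, GFP_ATOMIC);
static DEFINE_SPINLOCK(demo_tree_lock);

static int demo_insert(unsigned long index, void *item)
{
        int err;

        err = radix_tree_preload(GFP_KERNEL);   /* may sleep */
        if (err)
                return err;

        spin_lock(&demo_tree_lock);
        err = radix_tree_insert(&demo_tree, index, item);
        spin_unlock(&demo_tree_lock);

        radix_tree_preload_end();               /* no longer inline, see above */
        return err;
}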
14058 diff --git a/include/linux/random.h b/include/linux/random.h
14059 index 9c29122037f9..e7f2f8604918 100644
14060 --- a/include/linux/random.h
14061 +++ b/include/linux/random.h
14062 @@ -20,7 +20,7 @@ struct random_ready_callback {
14063  extern void add_device_randomness(const void *, unsigned int);
14064  extern void add_input_randomness(unsigned int type, unsigned int code,
14065                                  unsigned int value);
14066 -extern void add_interrupt_randomness(int irq, int irq_flags);
14067 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip);
14069  extern void get_random_bytes(void *buf, int nbytes);
14070  extern int add_random_ready_callback(struct random_ready_callback *rdy);
14071 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
14072 index a5aa7ae671f4..24ddffd25492 100644
14073 --- a/include/linux/rbtree.h
14074 +++ b/include/linux/rbtree.h
14075 @@ -31,7 +31,6 @@
14077  #include <linux/kernel.h>
14078  #include <linux/stddef.h>
14079 -#include <linux/rcupdate.h>
14081  struct rb_node {
14082         unsigned long  __rb_parent_color;
14083 @@ -86,14 +85,8 @@ static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
14084         *rb_link = node;
14087 -static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
14088 -                                   struct rb_node **rb_link)
14090 -       node->__rb_parent_color = (unsigned long)parent;
14091 -       node->rb_left = node->rb_right = NULL;
14093 -       rcu_assign_pointer(*rb_link, node);
14095 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
14096 +                     struct rb_node **rb_link);
14098  #define rb_entry_safe(ptr, type, member) \
14099         ({ typeof(ptr) ____ptr = (ptr); \
14100 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
14101 index a0189ba67fde..c2f5f955163d 100644
14102 --- a/include/linux/rcupdate.h
14103 +++ b/include/linux/rcupdate.h
14104 @@ -169,6 +169,9 @@ void call_rcu(struct rcu_head *head,
14106  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14108 +#ifdef CONFIG_PREEMPT_RT_FULL
14109 +#define call_rcu_bh    call_rcu
14110 +#else
14111  /**
14112   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
14113   * @head: structure to be used for queueing the RCU updates.
14114 @@ -192,6 +195,7 @@ void call_rcu(struct rcu_head *head,
14115   */
14116  void call_rcu_bh(struct rcu_head *head,
14117                  rcu_callback_t func);
14118 +#endif
14120  /**
14121   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
14122 @@ -292,6 +296,11 @@ void synchronize_rcu(void);
14123   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
14124   */
14125  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
14126 +#ifndef CONFIG_PREEMPT_RT_FULL
14127 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
14128 +#else
14129 +static inline int sched_rcu_preempt_depth(void) { return 0; }
14130 +#endif
14132  #else /* #ifdef CONFIG_PREEMPT_RCU */
14134 @@ -317,6 +326,8 @@ static inline int rcu_preempt_depth(void)
14135         return 0;
14138 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
14140  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14142  /* Internal to kernel */
14143 @@ -489,7 +500,14 @@ extern struct lockdep_map rcu_callback_map;
14144  int debug_lockdep_rcu_enabled(void);
14146  int rcu_read_lock_held(void);
14147 +#ifdef CONFIG_PREEMPT_RT_FULL
14148 +static inline int rcu_read_lock_bh_held(void)
14150 +       return rcu_read_lock_held();
14152 +#else
14153  int rcu_read_lock_bh_held(void);
14154 +#endif
14156  /**
14157   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
14158 @@ -937,10 +955,14 @@ static inline void rcu_read_unlock(void)
14159  static inline void rcu_read_lock_bh(void)
14161         local_bh_disable();
14162 +#ifdef CONFIG_PREEMPT_RT_FULL
14163 +       rcu_read_lock();
14164 +#else
14165         __acquire(RCU_BH);
14166         rcu_lock_acquire(&rcu_bh_lock_map);
14167         RCU_LOCKDEP_WARN(!rcu_is_watching(),
14168                          "rcu_read_lock_bh() used illegally while idle");
14169 +#endif
14172  /*
14173 @@ -950,10 +972,14 @@ static inline void rcu_read_lock_bh(void)
14174   */
14175  static inline void rcu_read_unlock_bh(void)
14177 +#ifdef CONFIG_PREEMPT_RT_FULL
14178 +       rcu_read_unlock();
14179 +#else
14180         RCU_LOCKDEP_WARN(!rcu_is_watching(),
14181                          "rcu_read_unlock_bh() used illegally while idle");
14182         rcu_lock_release(&rcu_bh_lock_map);
14183         __release(RCU_BH);
14184 +#endif
14185         local_bh_enable();
14188 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
14189 index 60d15a080d7c..436c9e62bfc6 100644
14190 --- a/include/linux/rcutree.h
14191 +++ b/include/linux/rcutree.h
14192 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
14193         rcu_note_context_switch();
14196 +#ifdef CONFIG_PREEMPT_RT_FULL
14197 +# define synchronize_rcu_bh    synchronize_rcu
14198 +#else
14199  void synchronize_rcu_bh(void);
14200 +#endif
14201  void synchronize_sched_expedited(void);
14202  void synchronize_rcu_expedited(void);
14204 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
14207  void rcu_barrier(void);
14208 +#ifdef CONFIG_PREEMPT_RT_FULL
14209 +# define rcu_barrier_bh                rcu_barrier
14210 +#else
14211  void rcu_barrier_bh(void);
14212 +#endif
14213  void rcu_barrier_sched(void);
14214  unsigned long get_state_synchronize_rcu(void);
14215  void cond_synchronize_rcu(unsigned long oldstate);
14216 @@ -85,12 +93,10 @@ unsigned long rcu_batches_started(void);
14217  unsigned long rcu_batches_started_bh(void);
14218  unsigned long rcu_batches_started_sched(void);
14219  unsigned long rcu_batches_completed(void);
14220 -unsigned long rcu_batches_completed_bh(void);
14221  unsigned long rcu_batches_completed_sched(void);
14222  void show_rcu_gp_kthreads(void);
14224  void rcu_force_quiescent_state(void);
14225 -void rcu_bh_force_quiescent_state(void);
14226  void rcu_sched_force_quiescent_state(void);
14228  void rcu_idle_enter(void);
14229 @@ -105,6 +111,14 @@ extern int rcu_scheduler_active __read_mostly;
14231  bool rcu_is_watching(void);
14233 +#ifndef CONFIG_PREEMPT_RT_FULL
14234 +void rcu_bh_force_quiescent_state(void);
14235 +unsigned long rcu_batches_completed_bh(void);
14236 +#else
14237 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
14238 +# define rcu_batches_completed_bh      rcu_batches_completed
14239 +#endif
14241  void rcu_all_qs(void);
14243  #endif /* __LINUX_RCUTREE_H */
14244 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
14245 index 1abba5ce2a2f..30211c627511 100644
14246 --- a/include/linux/rtmutex.h
14247 +++ b/include/linux/rtmutex.h
14248 @@ -13,11 +13,15 @@
14249  #define __LINUX_RT_MUTEX_H
14251  #include <linux/linkage.h>
14252 +#include <linux/spinlock_types_raw.h>
14253  #include <linux/rbtree.h>
14254 -#include <linux/spinlock_types.h>
14256  extern int max_lock_depth; /* for sysctl */
14258 +#ifdef CONFIG_DEBUG_MUTEXES
14259 +#include <linux/debug_locks.h>
14260 +#endif
14262  /**
14263   * The rt_mutex structure
14264   *
14265 @@ -31,8 +35,8 @@ struct rt_mutex {
14266         struct rb_root          waiters;
14267         struct rb_node          *waiters_leftmost;
14268         struct task_struct      *owner;
14269 -#ifdef CONFIG_DEBUG_RT_MUTEXES
14270         int                     save_state;
14271 +#ifdef CONFIG_DEBUG_RT_MUTEXES
14272         const char              *name, *file;
14273         int                     line;
14274         void                    *magic;
14275 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
14276  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
14277  #endif
14279 +# define rt_mutex_init(mutex)                                  \
14280 +       do {                                                    \
14281 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
14282 +               __rt_mutex_init(mutex, #mutex);                 \
14283 +       } while (0)
14285  #ifdef CONFIG_DEBUG_RT_MUTEXES
14286  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
14287         , .name = #mutexname, .file = __FILE__, .line = __LINE__
14288 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
14289   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
14290  #else
14291  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
14292 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
14293  # define rt_mutex_debug_task_free(t)                   do { } while (0)
14294  #endif
14296 -#define __RT_MUTEX_INITIALIZER(mutexname) \
14297 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14298 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14299 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14300         , .waiters = RB_ROOT \
14301         , .owner = NULL \
14302 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
14303 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
14305 +#define __RT_MUTEX_INITIALIZER(mutexname) \
14306 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
14308 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
14309 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
14310 +       , .save_state = 1 }
14312  #define DEFINE_RT_MUTEX(mutexname) \
14313         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
14314 @@ -91,6 +106,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock);
14316  extern void rt_mutex_lock(struct rt_mutex *lock);
14317  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
14318 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
14319  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
14320                                struct hrtimer_sleeper *timeout);
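With this hunk rt_mutex_init() always initializes the embedded wait_lock before calling __rt_mutex_init(), and rt_mutex_lock_killable() becomes part of the public interface. A short sketch of the intended use (example_lock and example_op() are made up for illustration):

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(example_lock);

static int example_op(void)
{
        int ret;

        /* Give up instead of blocking forever if the task is SIGKILLed. */
        ret = rt_mutex_lock_killable(&example_lock);
        if (ret)
                return ret;

        /* ... critical section ... */

        rt_mutex_unlock(&example_lock);
        return 0;
}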
14322 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
14323 new file mode 100644
14324 index 000000000000..49ed2d45d3be
14325 --- /dev/null
14326 +++ b/include/linux/rwlock_rt.h
14327 @@ -0,0 +1,99 @@
14328 +#ifndef __LINUX_RWLOCK_RT_H
14329 +#define __LINUX_RWLOCK_RT_H
14331 +#ifndef __LINUX_SPINLOCK_H
14332 +#error Do not include directly. Use spinlock.h
14333 +#endif
14335 +#define rwlock_init(rwl)                               \
14336 +do {                                                   \
14337 +       static struct lock_class_key __key;             \
14338 +                                                       \
14339 +       rt_mutex_init(&(rwl)->lock);                    \
14340 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
14341 +} while (0)
14343 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
14344 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
14345 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
14346 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
14347 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
14348 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
14349 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
14350 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
14351 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
14352 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
14354 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
14355 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
14357 +#define write_trylock_irqsave(lock, flags)     \
14358 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
14360 +#define read_lock_irqsave(lock, flags)                 \
14361 +       do {                                            \
14362 +               typecheck(unsigned long, flags);        \
14363 +               flags = rt_read_lock_irqsave(lock);     \
14364 +       } while (0)
14366 +#define write_lock_irqsave(lock, flags)                        \
14367 +       do {                                            \
14368 +               typecheck(unsigned long, flags);        \
14369 +               flags = rt_write_lock_irqsave(lock);    \
14370 +       } while (0)
14372 +#define read_lock(lock)                rt_read_lock(lock)
14374 +#define read_lock_bh(lock)                             \
14375 +       do {                                            \
14376 +               local_bh_disable();                     \
14377 +               rt_read_lock(lock);                     \
14378 +       } while (0)
14380 +#define read_lock_irq(lock)    read_lock(lock)
14382 +#define write_lock(lock)       rt_write_lock(lock)
14384 +#define write_lock_bh(lock)                            \
14385 +       do {                                            \
14386 +               local_bh_disable();                     \
14387 +               rt_write_lock(lock);                    \
14388 +       } while (0)
14390 +#define write_lock_irq(lock)   write_lock(lock)
14392 +#define read_unlock(lock)      rt_read_unlock(lock)
14394 +#define read_unlock_bh(lock)                           \
14395 +       do {                                            \
14396 +               rt_read_unlock(lock);                   \
14397 +               local_bh_enable();                      \
14398 +       } while (0)
14400 +#define read_unlock_irq(lock)  read_unlock(lock)
14402 +#define write_unlock(lock)     rt_write_unlock(lock)
14404 +#define write_unlock_bh(lock)                          \
14405 +       do {                                            \
14406 +               rt_write_unlock(lock);                  \
14407 +               local_bh_enable();                      \
14408 +       } while (0)
14410 +#define write_unlock_irq(lock) write_unlock(lock)
14412 +#define read_unlock_irqrestore(lock, flags)            \
14413 +       do {                                            \
14414 +               typecheck(unsigned long, flags);        \
14415 +               (void) flags;                           \
14416 +               rt_read_unlock(lock);                   \
14417 +       } while (0)
14419 +#define write_unlock_irqrestore(lock, flags) \
14420 +       do {                                            \
14421 +               typecheck(unsigned long, flags);        \
14422 +               (void) flags;                           \
14423 +               rt_write_unlock(lock);                  \
14424 +       } while (0)
14426 +#endif
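For callers nothing changes syntactically: the standard rwlock API is redirected to the rt_* sleeping-lock implementations above, and the *_irqsave variants do not hard-disable interrupts (flags is only carried for API compatibility). A usage sketch with a hypothetical example_rwlock:

#include <linux/spinlock.h>     /* pulls in rwlock_rt.h on PREEMPT_RT_FULL */

static DEFINE_RWLOCK(example_rwlock);
static int example_value;

static int example_read(void)
{
        unsigned long flags;
        int v;

        read_lock_irqsave(&example_rwlock, flags);      /* -> rt_read_lock() */
        v = example_value;
        read_unlock_irqrestore(&example_rwlock, flags);
        return v;
}

static void example_write(int v)
{
        write_lock(&example_rwlock);                    /* -> rt_write_lock() */
        example_value = v;
        write_unlock(&example_rwlock);
}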
14427 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
14428 index cc0072e93e36..d0da966ad7a0 100644
14429 --- a/include/linux/rwlock_types.h
14430 +++ b/include/linux/rwlock_types.h
14431 @@ -1,6 +1,10 @@
14432  #ifndef __LINUX_RWLOCK_TYPES_H
14433  #define __LINUX_RWLOCK_TYPES_H
14435 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
14436 +# error "Do not include directly, include spinlock_types.h"
14437 +#endif
14439  /*
14440   * include/linux/rwlock_types.h - generic rwlock type definitions
14441   *                               and initializers
14442 @@ -43,6 +47,7 @@ typedef struct {
14443                                 RW_DEP_MAP_INIT(lockname) }
14444  #endif
14446 -#define DEFINE_RWLOCK(x)       rwlock_t x = __RW_LOCK_UNLOCKED(x)
14447 +#define DEFINE_RWLOCK(name) \
14448 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14450  #endif /* __LINUX_RWLOCK_TYPES_H */
14451 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
14452 new file mode 100644
14453 index 000000000000..b13832119591
14454 --- /dev/null
14455 +++ b/include/linux/rwlock_types_rt.h
14456 @@ -0,0 +1,33 @@
14457 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
14458 +#define __LINUX_RWLOCK_TYPES_RT_H
14460 +#ifndef __LINUX_SPINLOCK_TYPES_H
14461 +#error "Do not include directly. Include spinlock_types.h instead"
14462 +#endif
14465 + * rwlocks - rtmutex which allows single reader recursion
14466 + */
14467 +typedef struct {
14468 +       struct rt_mutex         lock;
14469 +       int                     read_depth;
14470 +       unsigned int            break_lock;
14471 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14472 +       struct lockdep_map      dep_map;
14473 +#endif
14474 +} rwlock_t;
14476 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14477 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
14478 +#else
14479 +# define RW_DEP_MAP_INIT(lockname)
14480 +#endif
14482 +#define __RW_LOCK_UNLOCKED(name) \
14483 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
14484 +         RW_DEP_MAP_INIT(name) }
14486 +#define DEFINE_RWLOCK(name) \
14487 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14489 +#endif
14490 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
14491 index 8f498cdde280..2b2148431f14 100644
14492 --- a/include/linux/rwsem.h
14493 +++ b/include/linux/rwsem.h
14494 @@ -18,6 +18,10 @@
14495  #include <linux/osq_lock.h>
14496  #endif
14498 +#ifdef CONFIG_PREEMPT_RT_FULL
14499 +#include <linux/rwsem_rt.h>
14500 +#else /* PREEMPT_RT_FULL */
14502  struct rw_semaphore;
14504  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14505 @@ -177,4 +181,6 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
14506  # define up_read_non_owner(sem)                        up_read(sem)
14507  #endif
14509 +#endif /* !PREEMPT_RT_FULL */
14511  #endif /* _LINUX_RWSEM_H */
14512 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
14513 new file mode 100644
14514 index 000000000000..f97860b2e2a4
14515 --- /dev/null
14516 +++ b/include/linux/rwsem_rt.h
14517 @@ -0,0 +1,152 @@
14518 +#ifndef _LINUX_RWSEM_RT_H
14519 +#define _LINUX_RWSEM_RT_H
14521 +#ifndef _LINUX_RWSEM_H
14522 +#error "Include rwsem.h"
14523 +#endif
14526 + * RW-semaphores are a spinlock plus a reader-depth count.
14527 + *
14528 + * Note that the semantics are different from the usual
14529 + * Linux rw-sems, in PREEMPT_RT mode we do not allow
14530 + * multiple readers to hold the lock at once, we only allow
14531 + * a read-lock owner to read-lock recursively. This is
14532 + * better for latency, makes the implementation inherently
14533 + * fair and makes it simpler as well.
14534 + */
14536 +#include <linux/rtmutex.h>
14538 +struct rw_semaphore {
14539 +       struct rt_mutex         lock;
14540 +       int                     read_depth;
14541 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14542 +       struct lockdep_map      dep_map;
14543 +#endif
14546 +#define __RWSEM_INITIALIZER(name) \
14547 +       { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
14548 +         RW_DEP_MAP_INIT(name) }
14550 +#define DECLARE_RWSEM(lockname) \
14551 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14553 +extern void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
14554 +                                    struct lock_class_key *key);
14556 +#define __rt_init_rwsem(sem, name, key)                        \
14557 +       do {                                            \
14558 +               rt_mutex_init(&(sem)->lock);            \
14559 +               __rt_rwsem_init((sem), (name), (key));\
14560 +       } while (0)
14562 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
14564 +# define rt_init_rwsem(sem)                            \
14565 +do {                                                   \
14566 +       static struct lock_class_key __key;             \
14567 +                                                       \
14568 +       __rt_init_rwsem((sem), #sem, &__key);           \
14569 +} while (0)
14571 +extern void rt_down_write(struct rw_semaphore *rwsem);
14572 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
14573 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
14574 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
14575 +                                     struct lockdep_map *nest);
14576 +extern void rt__down_read(struct rw_semaphore *rwsem);
14577 +extern void rt_down_read(struct rw_semaphore *rwsem);
14578 +extern int  rt_down_write_trylock(struct rw_semaphore *rwsem);
14579 +extern int  rt__down_read_trylock(struct rw_semaphore *rwsem);
14580 +extern int  rt_down_read_trylock(struct rw_semaphore *rwsem);
14581 +extern void __rt_up_read(struct rw_semaphore *rwsem);
14582 +extern void rt_up_read(struct rw_semaphore *rwsem);
14583 +extern void rt_up_write(struct rw_semaphore *rwsem);
14584 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
14586 +#define init_rwsem(sem)                rt_init_rwsem(sem)
14587 +#define rwsem_is_locked(s)     rt_mutex_is_locked(&(s)->lock)
14589 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
14591 +       /* rt_mutex_has_waiters() */
14592 +       return !RB_EMPTY_ROOT(&sem->lock.waiters);
14595 +static inline void __down_read(struct rw_semaphore *sem)
14597 +       rt__down_read(sem);
14600 +static inline void down_read(struct rw_semaphore *sem)
14602 +       rt_down_read(sem);
14605 +static inline int __down_read_trylock(struct rw_semaphore *sem)
14607 +       return rt__down_read_trylock(sem);
14610 +static inline int down_read_trylock(struct rw_semaphore *sem)
14612 +       return rt_down_read_trylock(sem);
14615 +static inline void down_write(struct rw_semaphore *sem)
14617 +       rt_down_write(sem);
14620 +static inline int down_write_trylock(struct rw_semaphore *sem)
14622 +       return rt_down_write_trylock(sem);
14625 +static inline void __up_read(struct rw_semaphore *sem)
14627 +       __rt_up_read(sem);
14630 +static inline void up_read(struct rw_semaphore *sem)
14632 +       rt_up_read(sem);
14635 +static inline void up_write(struct rw_semaphore *sem)
14637 +       rt_up_write(sem);
14640 +static inline void downgrade_write(struct rw_semaphore *sem)
14642 +       rt_downgrade_write(sem);
14645 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
14647 +       return rt_down_read_nested(sem, subclass);
14650 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
14652 +       rt_down_write_nested(sem, subclass);
14654 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14655 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14656 +               struct rw_semaphore *nest_lock)
14658 +       rt_down_write_nested_lock(sem, &nest_lock->dep_map);
14661 +#else
14663 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14664 +               struct rw_semaphore *nest_lock)
14666 +       rt_down_write_nested_lock(sem, NULL);
14668 +#endif
14669 +#endif
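As the comment at the top of the file states, the RT rw_semaphore admits only a single reader at a time but lets that reader take the lock recursively; the generic down_*()/up_*() calls keep their usual form. A sketch of that RT-only recursion with a hypothetical example_sem (on a non-RT rwsem such nesting is not safe):

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_sem);

static void example_reader(void)
{
        down_read(&example_sem);        /* only one reader at a time on RT ... */
        down_read(&example_sem);        /* ... but the owner may nest read locks */

        /* ... read-side work ... */

        up_read(&example_sem);
        up_read(&example_sem);
}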
14670 diff --git a/include/linux/sched.h b/include/linux/sched.h
14671 index e887c8d6f395..f37654adf12a 100644
14672 --- a/include/linux/sched.h
14673 +++ b/include/linux/sched.h
14674 @@ -26,6 +26,7 @@ struct sched_param {
14675  #include <linux/nodemask.h>
14676  #include <linux/mm_types.h>
14677  #include <linux/preempt.h>
14678 +#include <asm/kmap_types.h>
14680  #include <asm/page.h>
14681  #include <asm/ptrace.h>
14682 @@ -182,8 +183,6 @@ extern void update_cpu_load_nohz(void);
14683  static inline void update_cpu_load_nohz(void) { }
14684  #endif
14686 -extern unsigned long get_parent_ip(unsigned long addr);
14688  extern void dump_cpu_task(int cpu);
14690  struct seq_file;
14691 @@ -235,17 +234,13 @@ extern char ___assert_task_state[1 - 2*!!(
14693  /* Convenience macros for the sake of wake_up */
14694  #define TASK_NORMAL            (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
14695 -#define TASK_ALL               (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
14697  /* get_task_state() */
14698  #define TASK_REPORT            (TASK_RUNNING | TASK_INTERRUPTIBLE | \
14699                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
14700                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
14702 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
14703  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
14704 -#define task_is_stopped_or_traced(task)        \
14705 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14706  #define task_contributes_to_load(task) \
14707                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14708                                  (task->flags & PF_FROZEN) == 0 && \
14709 @@ -311,6 +306,11 @@ extern char ___assert_task_state[1 - 2*!!(
14711  #endif
14713 +#define __set_current_state_no_track(state_value)      \
14714 +       do { current->state = (state_value); } while (0)
14715 +#define set_current_state_no_track(state_value)                \
14716 +       set_mb(current->state, (state_value))
14718  /* Task command name length */
14719  #define TASK_COMM_LEN 16
14721 @@ -979,9 +979,31 @@ struct wake_q_head {
14722  #define WAKE_Q(name)                                   \
14723         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
14725 -extern void wake_q_add(struct wake_q_head *head,
14726 -                      struct task_struct *task);
14727 -extern void wake_up_q(struct wake_q_head *head);
14728 +extern void __wake_q_add(struct wake_q_head *head,
14729 +                        struct task_struct *task, bool sleeper);
14730 +static inline void wake_q_add(struct wake_q_head *head,
14731 +                             struct task_struct *task)
14733 +       __wake_q_add(head, task, false);
14736 +static inline void wake_q_add_sleeper(struct wake_q_head *head,
14737 +                                     struct task_struct *task)
14739 +       __wake_q_add(head, task, true);
14742 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
14744 +static inline void wake_up_q(struct wake_q_head *head)
14746 +       __wake_up_q(head, false);
14749 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
14751 +       __wake_up_q(head, true);
14754  /*
14755   * sched-domains (multiprocessor balancing) declarations:
14756 @@ -1389,6 +1411,7 @@ struct tlbflush_unmap_batch {
14758  struct task_struct {
14759         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
14760 +       volatile long saved_state;      /* saved state for "spinlock sleepers" */
14761         void *stack;
14762         atomic_t usage;
14763         unsigned int flags;     /* per process flags, defined below */
14764 @@ -1425,6 +1448,13 @@ struct task_struct {
14765  #endif
14767         unsigned int policy;
14768 +#ifdef CONFIG_PREEMPT_RT_FULL
14769 +       int migrate_disable;
14770 +       int migrate_disable_update;
14771 +# ifdef CONFIG_SCHED_DEBUG
14772 +       int migrate_disable_atomic;
14773 +# endif
14774 +#endif
14775         int nr_cpus_allowed;
14776         cpumask_t cpus_allowed;
14778 @@ -1536,11 +1566,14 @@ struct task_struct {
14779         cputime_t gtime;
14780         struct prev_cputime prev_cputime;
14781  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14782 -       seqlock_t vtime_seqlock;
14783 +       seqcount_t vtime_seqcount;
14784         unsigned long long vtime_snap;
14785         enum {
14786 -               VTIME_SLEEPING = 0,
14787 +               /* Task is sleeping or running in a CPU with VTIME inactive */
14788 +               VTIME_INACTIVE = 0,
14789 +               /* Task runs in userspace in a CPU with VTIME active */
14790                 VTIME_USER,
14791 +               /* Task runs in kernelspace in a CPU with VTIME active */
14792                 VTIME_SYS,
14793         } vtime_snap_whence;
14794  #endif
14795 @@ -1552,6 +1585,9 @@ struct task_struct {
14797         struct task_cputime cputime_expires;
14798         struct list_head cpu_timers[3];
14799 +#ifdef CONFIG_PREEMPT_RT_BASE
14800 +       struct task_struct *posix_timer_list;
14801 +#endif
14803  /* process credentials */
14804         const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
14805 @@ -1583,10 +1619,15 @@ struct task_struct {
14806  /* signal handlers */
14807         struct signal_struct *signal;
14808         struct sighand_struct *sighand;
14809 +       struct sigqueue *sigqueue_cache;
14811         sigset_t blocked, real_blocked;
14812         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
14813         struct sigpending pending;
14814 +#ifdef CONFIG_PREEMPT_RT_FULL
14815 +       /* TODO: move me into ->restart_block ? */
14816 +       struct siginfo forced_info;
14817 +#endif
14819         unsigned long sas_ss_sp;
14820         size_t sas_ss_size;
14821 @@ -1611,6 +1652,7 @@ struct task_struct {
14822         raw_spinlock_t pi_lock;
14824         struct wake_q_node wake_q;
14825 +       struct wake_q_node wake_q_sleeper;
14827  #ifdef CONFIG_RT_MUTEXES
14828         /* PI waiters blocked on a rt_mutex held by this task */
14829 @@ -1810,6 +1852,12 @@ struct task_struct {
14830         unsigned long trace;
14831         /* bitmask and counter of trace recursion */
14832         unsigned long trace_recursion;
14833 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
14834 +       u64 preempt_timestamp_hist;
14835 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
14836 +       long timer_offset;
14837 +#endif
14838 +#endif
14839  #endif /* CONFIG_TRACING */
14840  #ifdef CONFIG_MEMCG
14841         struct mem_cgroup *memcg_in_oom;
14842 @@ -1826,8 +1874,22 @@ struct task_struct {
14843         unsigned int    sequential_io;
14844         unsigned int    sequential_io_avg;
14845  #endif
14846 +#ifdef CONFIG_PREEMPT_RT_BASE
14847 +       struct rcu_head put_rcu;
14848 +       int softirq_nestcnt;
14849 +       unsigned int softirqs_raised;
14850 +#endif
14851 +#ifdef CONFIG_PREEMPT_RT_FULL
14852 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
14853 +       int kmap_idx;
14854 +       pte_t kmap_pte[KM_TYPE_NR];
14855 +# endif
14856 +#endif
14857  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
14858         unsigned long   task_state_change;
14859 +#endif
14860 +#ifdef CONFIG_PREEMPT_RT_FULL
14861 +       int xmit_recursion;
14862  #endif
14863         int pagefault_disabled;
14864  /* CPU-specific state of this task */
14865 @@ -1846,9 +1908,6 @@ extern int arch_task_struct_size __read_mostly;
14866  # define arch_task_struct_size (sizeof(struct task_struct))
14867  #endif
14869 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
14870 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
14872  #define TNF_MIGRATED   0x01
14873  #define TNF_NO_GROUP   0x02
14874  #define TNF_SHARED     0x04
14875 @@ -2042,6 +2101,15 @@ extern struct pid *cad_pid;
14876  extern void free_task(struct task_struct *tsk);
14877  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
14879 +#ifdef CONFIG_PREEMPT_RT_BASE
14880 +extern void __put_task_struct_cb(struct rcu_head *rhp);
14882 +static inline void put_task_struct(struct task_struct *t)
14884 +       if (atomic_dec_and_test(&t->usage))
14885 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
14887 +#else
14888  extern void __put_task_struct(struct task_struct *t);
14890  static inline void put_task_struct(struct task_struct *t)
14891 @@ -2049,6 +2117,7 @@ static inline void put_task_struct(struct task_struct *t)
14892         if (atomic_dec_and_test(&t->usage))
14893                 __put_task_struct(t);
14895 +#endif
14897  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14898  extern void task_cputime(struct task_struct *t,
14899 @@ -2087,6 +2156,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
14900  /*
14901   * Per process flags
14902   */
14903 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
14904  #define PF_EXITING     0x00000004      /* getting shut down */
14905  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
14906  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
14907 @@ -2251,6 +2321,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
14909  extern int set_cpus_allowed_ptr(struct task_struct *p,
14910                                 const struct cpumask *new_mask);
14911 +int migrate_me(void);
14912 +void tell_sched_cpu_down_begin(int cpu);
14913 +void tell_sched_cpu_down_done(int cpu);
14915  #else
14916  static inline void do_set_cpus_allowed(struct task_struct *p,
14917                                       const struct cpumask *new_mask)
14918 @@ -2263,6 +2337,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
14919                 return -EINVAL;
14920         return 0;
14922 +static inline int migrate_me(void) { return 0; }
14923 +static inline void tell_sched_cpu_down_begin(int cpu) { }
14924 +static inline void tell_sched_cpu_down_done(int cpu) { }
14925  #endif
14927  #ifdef CONFIG_NO_HZ_COMMON
14928 @@ -2472,6 +2549,7 @@ extern void xtime_update(unsigned long ticks);
14930  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
14931  extern int wake_up_process(struct task_struct *tsk);
14932 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
14933  extern void wake_up_new_task(struct task_struct *tsk);
14934  #ifdef CONFIG_SMP
14935   extern void kick_process(struct task_struct *tsk);
14936 @@ -2595,12 +2673,24 @@ extern struct mm_struct * mm_alloc(void);
14938  /* mmdrop drops the mm and the page tables */
14939  extern void __mmdrop(struct mm_struct *);
14941  static inline void mmdrop(struct mm_struct * mm)
14943         if (unlikely(atomic_dec_and_test(&mm->mm_count)))
14944                 __mmdrop(mm);
14947 +#ifdef CONFIG_PREEMPT_RT_BASE
14948 +extern void __mmdrop_delayed(struct rcu_head *rhp);
14949 +static inline void mmdrop_delayed(struct mm_struct *mm)
14951 +       if (atomic_dec_and_test(&mm->mm_count))
14952 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
14954 +#else
14955 +# define mmdrop_delayed(mm)    mmdrop(mm)
14956 +#endif
14958  /* mmput gets rid of the mappings and all user-space */
14959  extern void mmput(struct mm_struct *);
14960  /* Grab a reference to a task's mm, if it is not already going away */
14961 @@ -2910,6 +3000,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
14962         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
14965 +#ifdef CONFIG_PREEMPT_LAZY
14966 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
14968 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14971 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
14973 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14976 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
14978 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
14981 +static inline int need_resched_lazy(void)
14983 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
14986 +static inline int need_resched_now(void)
14988 +       return test_thread_flag(TIF_NEED_RESCHED);
14991 +#else
14992 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
14993 +static inline int need_resched_lazy(void) { return 0; }
14995 +static inline int need_resched_now(void)
14997 +       return test_thread_flag(TIF_NEED_RESCHED);
15000 +#endif
15002  static inline int restart_syscall(void)
15004         set_tsk_thread_flag(current, TIF_SIGPENDING);
15005 @@ -2941,6 +3068,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
15006         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
15009 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
15011 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
15012 +               return true;
15013 +#ifdef CONFIG_PREEMPT_RT_FULL
15014 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
15015 +               return true;
15016 +#endif
15017 +       return false;
15020 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
15022 +       bool traced_stopped;
15024 +#ifdef CONFIG_PREEMPT_RT_FULL
15025 +       unsigned long flags;
15027 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
15028 +       traced_stopped = __task_is_stopped_or_traced(task);
15029 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15030 +#else
15031 +       traced_stopped = __task_is_stopped_or_traced(task);
15032 +#endif
15033 +       return traced_stopped;
15036 +static inline bool task_is_traced(struct task_struct *task)
15038 +       bool traced = false;
15040 +       if (task->state & __TASK_TRACED)
15041 +               return true;
15042 +#ifdef CONFIG_PREEMPT_RT_FULL
15043 +       /* in case the task is sleeping on tasklist_lock */
15044 +       raw_spin_lock_irq(&task->pi_lock);
15045 +       if (task->state & __TASK_TRACED)
15046 +               traced = true;
15047 +       else if (task->saved_state & __TASK_TRACED)
15048 +               traced = true;
15049 +       raw_spin_unlock_irq(&task->pi_lock);
15050 +#endif
15051 +       return traced;
15054  /*
15055   * cond_resched() and cond_resched_lock(): latency reduction via
15056   * explicit rescheduling in places that are safe. The return
15057 @@ -2962,12 +3134,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
15058         __cond_resched_lock(lock);                              \
15059  })
15061 +#ifndef CONFIG_PREEMPT_RT_FULL
15062  extern int __cond_resched_softirq(void);
15064  #define cond_resched_softirq() ({                                      \
15065         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
15066         __cond_resched_softirq();                                       \
15067  })
15068 +#else
15069 +# define cond_resched_softirq()                cond_resched()
15070 +#endif
15072  static inline void cond_resched_rcu(void)
15074 @@ -3129,6 +3305,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
15076  #endif /* CONFIG_SMP */
15078 +static inline int __migrate_disabled(struct task_struct *p)
15080 +#ifdef CONFIG_PREEMPT_RT_FULL
15081 +       return p->migrate_disable;
15082 +#else
15083 +       return 0;
15084 +#endif
15087 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
15088 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
15090 +       if (__migrate_disabled(p))
15091 +               return cpumask_of(task_cpu(p));
15093 +       return &p->cpus_allowed;
15096 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
15098 +       if (__migrate_disabled(p))
15099 +               return 1;
15100 +       return p->nr_cpus_allowed;
15103  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
15104  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
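The new tsk_cpus_allowed()/tsk_nr_cpus_allowed() accessors report only the temporarily pinned CPU while migration is disabled, instead of dereferencing ->cpus_allowed directly. A sketch of the intended semantics (example_pin_briefly() is illustrative only; migrate_disable()/migrate_enable() come from the RT preempt changes elsewhere in this series):

#include <linux/sched.h>
#include <linux/preempt.h>

static void example_pin_briefly(void)
{
        const struct cpumask *mask;

        migrate_disable();
        /*
         * With CONFIG_PREEMPT_RT_FULL this is cpumask_of(task_cpu(current))
         * and tsk_nr_cpus_allowed(current) == 1; without RT it falls back to
         * ->cpus_allowed and ->nr_cpus_allowed.
         */
        mask = tsk_cpus_allowed(current);
        (void)mask;
        migrate_enable();
}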
15106 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
15107 index e0582106ef4f..b14f4d2368aa 100644
15108 --- a/include/linux/seqlock.h
15109 +++ b/include/linux/seqlock.h
15110 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
15111         return __read_seqcount_retry(s, start);
15116 -static inline void raw_write_seqcount_begin(seqcount_t *s)
15117 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
15119         s->sequence++;
15120         smp_wmb();
15123 -static inline void raw_write_seqcount_end(seqcount_t *s)
15124 +static inline void raw_write_seqcount_begin(seqcount_t *s)
15126 +       preempt_disable_rt();
15127 +       __raw_write_seqcount_begin(s);
15130 +static inline void __raw_write_seqcount_end(seqcount_t *s)
15132         smp_wmb();
15133         s->sequence++;
15136 +static inline void raw_write_seqcount_end(seqcount_t *s)
15138 +       __raw_write_seqcount_end(s);
15139 +       preempt_enable_rt();
15142  /**
15143   * raw_write_seqcount_barrier - do a seq write barrier
15144   * @s: pointer to seqcount_t
15145 @@ -425,10 +435,32 @@ typedef struct {
15146  /*
15147   * Read side functions for starting and finalizing a read side section.
15148   */
15149 +#ifndef CONFIG_PREEMPT_RT_FULL
15150  static inline unsigned read_seqbegin(const seqlock_t *sl)
15152         return read_seqcount_begin(&sl->seqcount);
15154 +#else
15156 + * Starvation safe read side for RT
15157 + */
15158 +static inline unsigned read_seqbegin(seqlock_t *sl)
15160 +       unsigned ret;
15162 +repeat:
15163 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
15164 +       if (unlikely(ret & 1)) {
15165 +               /*
15166 +                * Take the lock and let the writer proceed (i.e. possibly
15167 +                * boost it), otherwise we could loop here forever.
15168 +                */
15169 +               spin_unlock_wait(&sl->lock);
15170 +               goto repeat;
15171 +       }
15172 +       return ret;
15174 +#endif
15176  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
15178 @@ -443,36 +475,36 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
15179  static inline void write_seqlock(seqlock_t *sl)
15181         spin_lock(&sl->lock);
15182 -       write_seqcount_begin(&sl->seqcount);
15183 +       __raw_write_seqcount_begin(&sl->seqcount);
15186  static inline void write_sequnlock(seqlock_t *sl)
15188 -       write_seqcount_end(&sl->seqcount);
15189 +       __raw_write_seqcount_end(&sl->seqcount);
15190         spin_unlock(&sl->lock);
15193  static inline void write_seqlock_bh(seqlock_t *sl)
15195         spin_lock_bh(&sl->lock);
15196 -       write_seqcount_begin(&sl->seqcount);
15197 +       __raw_write_seqcount_begin(&sl->seqcount);
15200  static inline void write_sequnlock_bh(seqlock_t *sl)
15202 -       write_seqcount_end(&sl->seqcount);
15203 +       __raw_write_seqcount_end(&sl->seqcount);
15204         spin_unlock_bh(&sl->lock);
15207  static inline void write_seqlock_irq(seqlock_t *sl)
15209         spin_lock_irq(&sl->lock);
15210 -       write_seqcount_begin(&sl->seqcount);
15211 +       __raw_write_seqcount_begin(&sl->seqcount);
15214  static inline void write_sequnlock_irq(seqlock_t *sl)
15216 -       write_seqcount_end(&sl->seqcount);
15217 +       __raw_write_seqcount_end(&sl->seqcount);
15218         spin_unlock_irq(&sl->lock);
15221 @@ -481,7 +513,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
15222         unsigned long flags;
15224         spin_lock_irqsave(&sl->lock, flags);
15225 -       write_seqcount_begin(&sl->seqcount);
15226 +       __raw_write_seqcount_begin(&sl->seqcount);
15227         return flags;
15230 @@ -491,7 +523,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
15231  static inline void
15232  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
15234 -       write_seqcount_end(&sl->seqcount);
15235 +       __raw_write_seqcount_end(&sl->seqcount);
15236         spin_unlock_irqrestore(&sl->lock, flags);
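The writer-side helpers switch to the __raw_* variants because the enclosing spin_lock() already serializes writers (and on RT must stay preemptible), while the RT read_seqbegin() waits on the lock instead of spinning so a preempted writer can be boosted. The caller-side pattern is unchanged; a sketch with a hypothetical example_seqlock:

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(example_seqlock);
static u64 example_counter;

static void example_update(void)
{
        write_seqlock(&example_seqlock);
        example_counter++;
        write_sequnlock(&example_seqlock);
}

static u64 example_read(void)
{
        unsigned int seq;
        u64 val;

        do {
                seq = read_seqbegin(&example_seqlock);
                val = example_counter;
        } while (read_seqretry(&example_seqlock, seq));

        return val;
}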
15239 diff --git a/include/linux/signal.h b/include/linux/signal.h
15240 index d80259afb9e5..ddd1e6866a54 100644
15241 --- a/include/linux/signal.h
15242 +++ b/include/linux/signal.h
15243 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
15246  extern void flush_sigqueue(struct sigpending *queue);
15247 +extern void flush_task_sigqueue(struct task_struct *tsk);
15249  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
15250  static inline int valid_signal(unsigned long sig)
15251 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
15252 index b5421f6f155a..b05be76fc504 100644
15253 --- a/include/linux/skbuff.h
15254 +++ b/include/linux/skbuff.h
15255 @@ -203,6 +203,7 @@ struct sk_buff_head {
15257         __u32           qlen;
15258         spinlock_t      lock;
15259 +       raw_spinlock_t  raw_lock;
15260  };
15262  struct sk_buff;
15263 @@ -1462,6 +1463,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
15264         __skb_queue_head_init(list);
15267 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
15269 +       raw_spin_lock_init(&list->raw_lock);
15270 +       __skb_queue_head_init(list);
15273  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
15274                 struct lock_class_key *class)
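skb_queue_head_init_raw() initializes the new raw_lock (plus the list head) for queues whose users take the raw lock instead of the sleeping ->lock on RT. A minimal sketch with a hypothetical example_q:

#include <linux/skbuff.h>

static struct sk_buff_head example_q;

static void example_init(void)
{
        /* Sets up ->raw_lock and the list head; ->lock is left untouched. */
        skb_queue_head_init_raw(&example_q);
}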
15276 diff --git a/include/linux/smp.h b/include/linux/smp.h
15277 index c4414074bd88..e6ab36aeaaab 100644
15278 --- a/include/linux/smp.h
15279 +++ b/include/linux/smp.h
15280 @@ -185,6 +185,9 @@ static inline void smp_init(void) { }
15281  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
15282  #define put_cpu()              preempt_enable()
15284 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
15285 +#define put_cpu_light()                migrate_enable()
15287  /*
15288   * Callback to arch code if there's nosmp or maxcpus=0 on the
15289   * boot command line:
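get_cpu_light()/put_cpu_light() pin the task to its CPU with migrate_disable() instead of disabling preemption, so the protected section remains preemptible on RT. A sketch with a hypothetical per-CPU counter:

#include <linux/smp.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_counter);

static void example_bump(void)
{
        int cpu = get_cpu_light();      /* migrate_disable() + smp_processor_id() */

        per_cpu(example_counter, cpu)++;
        put_cpu_light();                /* migrate_enable() */
}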
15290 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
15291 index 47dd0cebd204..02928fa5499d 100644
15292 --- a/include/linux/spinlock.h
15293 +++ b/include/linux/spinlock.h
15294 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
15295  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
15297  /* Include rwlock functions */
15298 -#include <linux/rwlock.h>
15299 +#ifdef CONFIG_PREEMPT_RT_FULL
15300 +# include <linux/rwlock_rt.h>
15301 +#else
15302 +# include <linux/rwlock.h>
15303 +#endif
15305  /*
15306   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
15307 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
15308  # include <linux/spinlock_api_up.h>
15309  #endif
15311 +#ifdef CONFIG_PREEMPT_RT_FULL
15312 +# include <linux/spinlock_rt.h>
15313 +#else /* PREEMPT_RT_FULL */
15315  /*
15316   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
15317   */
15318 @@ -347,6 +355,12 @@ static __always_inline void spin_unlock(spinlock_t *lock)
15319         raw_spin_unlock(&lock->rlock);
15322 +static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
15324 +       raw_spin_unlock(&lock->rlock);
15325 +       return 0;
15328  static __always_inline void spin_unlock_bh(spinlock_t *lock)
15330         raw_spin_unlock_bh(&lock->rlock);
15331 @@ -416,4 +430,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
15332  #define atomic_dec_and_lock(atomic, lock) \
15333                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
15335 +#endif /* !PREEMPT_RT_FULL */
15337  #endif /* __LINUX_SPINLOCK_H */
15338 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
15339 index 5344268e6e62..043263f30e81 100644
15340 --- a/include/linux/spinlock_api_smp.h
15341 +++ b/include/linux/spinlock_api_smp.h
15342 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
15343         return 0;
15346 -#include <linux/rwlock_api_smp.h>
15347 +#ifndef CONFIG_PREEMPT_RT_FULL
15348 +# include <linux/rwlock_api_smp.h>
15349 +#endif
15351  #endif /* __LINUX_SPINLOCK_API_SMP_H */
15352 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
15353 new file mode 100644
15354 index 000000000000..7eb87584e843
15355 --- /dev/null
15356 +++ b/include/linux/spinlock_rt.h
15357 @@ -0,0 +1,165 @@
15358 +#ifndef __LINUX_SPINLOCK_RT_H
15359 +#define __LINUX_SPINLOCK_RT_H
15361 +#ifndef __LINUX_SPINLOCK_H
15362 +#error Do not include directly. Use spinlock.h
15363 +#endif
15365 +#include <linux/bug.h>
15367 +extern void
15368 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
15370 +#define spin_lock_init(slock)                          \
15371 +do {                                                   \
15372 +       static struct lock_class_key __key;             \
15373 +                                                       \
15374 +       rt_mutex_init(&(slock)->lock);                  \
15375 +       __rt_spin_lock_init(slock, #slock, &__key);     \
15376 +} while (0)
15378 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
15379 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
15380 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
15382 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
15383 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
15384 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
15385 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
15386 +extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
15387 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
15388 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
15389 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
15390 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
15391 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
15394 + * lockdep-less calls, for derived types like rwlock:
15395 + * (for trylock they can use rt_mutex_trylock() directly.)
15396 + */
15397 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
15398 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
15399 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
15400 +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
15402 +#define spin_lock(lock)                        rt_spin_lock(lock)
15404 +#define spin_lock_bh(lock)                     \
15405 +       do {                                    \
15406 +               local_bh_disable();             \
15407 +               rt_spin_lock(lock);             \
15408 +       } while (0)
15410 +#define spin_lock_irq(lock)            spin_lock(lock)
15412 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
15414 +#define spin_trylock(lock)                     \
15415 +({                                             \
15416 +       int __locked;                           \
15417 +       __locked = spin_do_trylock(lock);       \
15418 +       __locked;                               \
15421 +#ifdef CONFIG_LOCKDEP
15422 +# define spin_lock_nested(lock, subclass)              \
15423 +       do {                                            \
15424 +               rt_spin_lock_nested(lock, subclass);    \
15425 +       } while (0)
15427 +#define spin_lock_bh_nested(lock, subclass)            \
15428 +       do {                                            \
15429 +               local_bh_disable();                     \
15430 +               rt_spin_lock_nested(lock, subclass);    \
15431 +       } while (0)
15433 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15434 +       do {                                             \
15435 +               typecheck(unsigned long, flags);         \
15436 +               flags = 0;                               \
15437 +               rt_spin_lock_nested(lock, subclass);     \
15438 +       } while (0)
15439 +#else
15440 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
15441 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
15443 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15444 +       do {                                             \
15445 +               typecheck(unsigned long, flags);         \
15446 +               flags = 0;                               \
15447 +               spin_lock(lock);                         \
15448 +       } while (0)
15449 +#endif
15451 +#define spin_lock_irqsave(lock, flags)                  \
15452 +       do {                                             \
15453 +               typecheck(unsigned long, flags);         \
15454 +               flags = 0;                               \
15455 +               spin_lock(lock);                         \
15456 +       } while (0)
15458 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
15460 +       unsigned long flags = 0;
15461 +#ifdef CONFIG_TRACE_IRQFLAGS
15462 +       flags = rt_spin_lock_trace_flags(lock);
15463 +#else
15464 +       spin_lock(lock); /* lock_local */
15465 +#endif
15466 +       return flags;
15469 +/* FIXME: we need rt_spin_lock_nest_lock */
15470 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15472 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
15473 +#define spin_unlock_no_deboost(lock)           rt_spin_unlock_no_deboost(lock)
15475 +#define spin_unlock_bh(lock)                           \
15476 +       do {                                            \
15477 +               rt_spin_unlock(lock);                   \
15478 +               local_bh_enable();                      \
15479 +       } while (0)
15481 +#define spin_unlock_irq(lock)          spin_unlock(lock)
15483 +#define spin_unlock_irqrestore(lock, flags)            \
15484 +       do {                                            \
15485 +               typecheck(unsigned long, flags);        \
15486 +               (void) flags;                           \
15487 +               spin_unlock(lock);                      \
15488 +       } while (0)
15490 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
15491 +#define spin_trylock_irq(lock) spin_trylock(lock)
15493 +#define spin_trylock_irqsave(lock, flags)      \
15494 +       rt_spin_trylock_irqsave(lock, &(flags))
15496 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
15498 +#ifdef CONFIG_GENERIC_LOCKBREAK
15499 +# define spin_is_contended(lock)       ((lock)->break_lock)
15500 +#else
15501 +# define spin_is_contended(lock)       (((void)(lock), 0))
15502 +#endif
15504 +static inline int spin_can_lock(spinlock_t *lock)
15506 +       return !rt_mutex_is_locked(&lock->lock);
15509 +static inline int spin_is_locked(spinlock_t *lock)
15511 +       return rt_mutex_is_locked(&lock->lock);
15514 +static inline void assert_spin_locked(spinlock_t *lock)
15516 +       BUG_ON(!spin_is_locked(lock));
15519 +#define atomic_dec_and_lock(atomic, lock) \
15520 +       atomic_dec_and_spin_lock(atomic, lock)
15522 +#endif
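On PREEMPT_RT_FULL a spinlock_t becomes a sleeping rt_mutex-based lock: spin_lock() may block, and spin_lock_irqsave() neither disables interrupts nor saves any state (flags is set to 0 purely for API compatibility). Existing callers compile unchanged; a sketch with a hypothetical example_lock:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static int example_state;

static void example_set(int v)
{
        unsigned long flags;

        spin_lock_irqsave(&example_lock, flags);        /* may sleep on RT */
        example_state = v;
        spin_unlock_irqrestore(&example_lock, flags);
}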
15523 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
15524 index 73548eb13a5d..10bac715ea96 100644
15525 --- a/include/linux/spinlock_types.h
15526 +++ b/include/linux/spinlock_types.h
15527 @@ -9,80 +9,15 @@
15528   * Released under the General Public License (GPL).
15529   */
15531 -#if defined(CONFIG_SMP)
15532 -# include <asm/spinlock_types.h>
15533 -#else
15534 -# include <linux/spinlock_types_up.h>
15535 -#endif
15537 -#include <linux/lockdep.h>
15539 -typedef struct raw_spinlock {
15540 -       arch_spinlock_t raw_lock;
15541 -#ifdef CONFIG_GENERIC_LOCKBREAK
15542 -       unsigned int break_lock;
15543 -#endif
15544 -#ifdef CONFIG_DEBUG_SPINLOCK
15545 -       unsigned int magic, owner_cpu;
15546 -       void *owner;
15547 -#endif
15548 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15549 -       struct lockdep_map dep_map;
15550 -#endif
15551 -} raw_spinlock_t;
15553 -#define SPINLOCK_MAGIC         0xdead4ead
15555 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15557 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15558 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15559 -#else
15560 -# define SPIN_DEP_MAP_INIT(lockname)
15561 -#endif
15562 +#include <linux/spinlock_types_raw.h>
15564 -#ifdef CONFIG_DEBUG_SPINLOCK
15565 -# define SPIN_DEBUG_INIT(lockname)             \
15566 -       .magic = SPINLOCK_MAGIC,                \
15567 -       .owner_cpu = -1,                        \
15568 -       .owner = SPINLOCK_OWNER_INIT,
15569 +#ifndef CONFIG_PREEMPT_RT_FULL
15570 +# include <linux/spinlock_types_nort.h>
15571 +# include <linux/rwlock_types.h>
15572  #else
15573 -# define SPIN_DEBUG_INIT(lockname)
15574 +# include <linux/rtmutex.h>
15575 +# include <linux/spinlock_types_rt.h>
15576 +# include <linux/rwlock_types_rt.h>
15577  #endif
15579 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15580 -       {                                       \
15581 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15582 -       SPIN_DEBUG_INIT(lockname)               \
15583 -       SPIN_DEP_MAP_INIT(lockname) }
15585 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15586 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15588 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15590 -typedef struct spinlock {
15591 -       union {
15592 -               struct raw_spinlock rlock;
15594 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15595 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15596 -               struct {
15597 -                       u8 __padding[LOCK_PADSIZE];
15598 -                       struct lockdep_map dep_map;
15599 -               };
15600 -#endif
15601 -       };
15602 -} spinlock_t;
15604 -#define __SPIN_LOCK_INITIALIZER(lockname) \
15605 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15607 -#define __SPIN_LOCK_UNLOCKED(lockname) \
15608 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15610 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15612 -#include <linux/rwlock_types.h>
15614  #endif /* __LINUX_SPINLOCK_TYPES_H */
15615 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
15616 new file mode 100644
15617 index 000000000000..f1dac1fb1d6a
15618 --- /dev/null
15619 +++ b/include/linux/spinlock_types_nort.h
15620 @@ -0,0 +1,33 @@
15621 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15622 +#define __LINUX_SPINLOCK_TYPES_NORT_H
15624 +#ifndef __LINUX_SPINLOCK_TYPES_H
15625 +#error "Do not include directly. Include spinlock_types.h instead"
15626 +#endif
15629 + * The non RT version maps spinlocks to raw_spinlocks
15630 + */
15631 +typedef struct spinlock {
15632 +       union {
15633 +               struct raw_spinlock rlock;
15635 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15636 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15637 +               struct {
15638 +                       u8 __padding[LOCK_PADSIZE];
15639 +                       struct lockdep_map dep_map;
15640 +               };
15641 +#endif
15642 +       };
15643 +} spinlock_t;
15645 +#define __SPIN_LOCK_INITIALIZER(lockname) \
15646 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15648 +#define __SPIN_LOCK_UNLOCKED(lockname) \
15649 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15651 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15653 +#endif
15654 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
15655 new file mode 100644
15656 index 000000000000..edffc4d53fc9
15657 --- /dev/null
15658 +++ b/include/linux/spinlock_types_raw.h
15659 @@ -0,0 +1,56 @@
15660 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15661 +#define __LINUX_SPINLOCK_TYPES_RAW_H
15663 +#if defined(CONFIG_SMP)
15664 +# include <asm/spinlock_types.h>
15665 +#else
15666 +# include <linux/spinlock_types_up.h>
15667 +#endif
15669 +#include <linux/lockdep.h>
15671 +typedef struct raw_spinlock {
15672 +       arch_spinlock_t raw_lock;
15673 +#ifdef CONFIG_GENERIC_LOCKBREAK
15674 +       unsigned int break_lock;
15675 +#endif
15676 +#ifdef CONFIG_DEBUG_SPINLOCK
15677 +       unsigned int magic, owner_cpu;
15678 +       void *owner;
15679 +#endif
15680 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15681 +       struct lockdep_map dep_map;
15682 +#endif
15683 +} raw_spinlock_t;
15685 +#define SPINLOCK_MAGIC         0xdead4ead
15687 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15689 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15690 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15691 +#else
15692 +# define SPIN_DEP_MAP_INIT(lockname)
15693 +#endif
15695 +#ifdef CONFIG_DEBUG_SPINLOCK
15696 +# define SPIN_DEBUG_INIT(lockname)             \
15697 +       .magic = SPINLOCK_MAGIC,                \
15698 +       .owner_cpu = -1,                        \
15699 +       .owner = SPINLOCK_OWNER_INIT,
15700 +#else
15701 +# define SPIN_DEBUG_INIT(lockname)
15702 +#endif
15704 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15705 +       {                                       \
15706 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15707 +       SPIN_DEBUG_INIT(lockname)               \
15708 +       SPIN_DEP_MAP_INIT(lockname) }
15710 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15711 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15713 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15715 +#endif
15716 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
15717 new file mode 100644
15718 index 000000000000..9fd431967abc
15719 --- /dev/null
15720 +++ b/include/linux/spinlock_types_rt.h
15721 @@ -0,0 +1,51 @@
15722 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15723 +#define __LINUX_SPINLOCK_TYPES_RT_H
15725 +#ifndef __LINUX_SPINLOCK_TYPES_H
15726 +#error "Do not include directly. Include spinlock_types.h instead"
15727 +#endif
15729 +#include <linux/cache.h>
15732 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15733 + */
15734 +typedef struct spinlock {
15735 +       struct rt_mutex         lock;
15736 +       unsigned int            break_lock;
15737 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15738 +       struct lockdep_map      dep_map;
15739 +#endif
15740 +} spinlock_t;
15742 +#ifdef CONFIG_DEBUG_RT_MUTEXES
15743 +# define __RT_SPIN_INITIALIZER(name) \
15744 +       { \
15745 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15746 +       .save_state = 1, \
15747 +       .file = __FILE__, \
15748 +       .line = __LINE__ , \
15749 +       }
15750 +#else
15751 +# define __RT_SPIN_INITIALIZER(name) \
15752 +       {                                                               \
15753 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
15754 +       .save_state = 1, \
15755 +       }
15756 +#endif
15759 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15762 +#define __SPIN_LOCK_UNLOCKED(name)                     \
15763 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
15764 +         SPIN_DEP_MAP_INIT(name) }
15766 +#define __DEFINE_SPINLOCK(name) \
15767 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15769 +#define DEFINE_SPINLOCK(name) \
15770 +       spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name)
15772 +#endif
15773 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
15774 index f5f80c5643ac..ec1a8f01563c 100644
15775 --- a/include/linux/srcu.h
15776 +++ b/include/linux/srcu.h
15777 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
15779  void process_srcu(struct work_struct *work);
15781 -#define __SRCU_STRUCT_INIT(name)                                       \
15782 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
15783         {                                                               \
15784                 .completed = -300,                                      \
15785 -               .per_cpu_ref = &name##_srcu_array,                      \
15786 +               .per_cpu_ref = &pcpu_name,                              \
15787                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
15788                 .running = false,                                       \
15789                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
15790 @@ -104,7 +104,7 @@ void process_srcu(struct work_struct *work);
15791   */
15792  #define __DEFINE_SRCU(name, is_static)                                 \
15793         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
15794 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15795 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
15796  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
15797  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
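The extra pcpu_name parameter lets __SRCU_STRUCT_INIT() name the per-CPU array explicitly instead of pasting name##_srcu_array inside the initializer. Roughly, DEFINE_SRCU(foo) now expands along these lines (a sketch, not the literal preprocessor output):

static DEFINE_PER_CPU(struct srcu_struct_array, foo_srcu_array);
struct srcu_struct foo = __SRCU_STRUCT_INIT(foo, foo_srcu_array);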
15799 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
15800 index 8b6ec7ef0854..9b77d4cc929f 100644
15801 --- a/include/linux/suspend.h
15802 +++ b/include/linux/suspend.h
15803 @@ -194,6 +194,12 @@ struct platform_freeze_ops {
15804         void (*end)(void);
15805  };
15807 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
15808 +extern bool pm_in_action;
15809 +#else
15810 +# define pm_in_action false
15811 +#endif
15813  #ifdef CONFIG_SUSPEND
15814  /**
15815   * suspend_set_ops - set platform dependent suspend operations
15816 diff --git a/include/linux/swait.h b/include/linux/swait.h
15817 new file mode 100644
15818 index 000000000000..83f004a72320
15819 --- /dev/null
15820 +++ b/include/linux/swait.h
15821 @@ -0,0 +1,173 @@
15822 +#ifndef _LINUX_SWAIT_H
15823 +#define _LINUX_SWAIT_H
15825 +#include <linux/list.h>
15826 +#include <linux/stddef.h>
15827 +#include <linux/spinlock.h>
15828 +#include <asm/current.h>
15831 + * Simple wait queues
15832 + *
15833 + * While these are very similar to the other/complex wait queues (wait.h) the
15834 + * most important difference is that the simple waitqueue allows for
15835 + * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
15836 + * times.
15837 + *
15838 + * In order to make this so, we had to drop a fair number of features of the
15839 + * other waitqueue code; notably:
15840 + *
15841 + *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
15842 + *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
15843 + *    sleeper state.
15844 + *
15845 + *  - the exclusive mode; because this requires preserving the list order
15846 + *    and this is hard.
15847 + *
15848 + *  - custom wake functions; because you cannot give any guarantees about
15849 + *    random code.
15850 + *
15851 + * As a side effect of this; the data structures are slimmer.
15852 + *
15853 + * One would recommend using this wait queue where possible.
15854 + */
15856 +struct task_struct;
15858 +struct swait_queue_head {
15859 +       raw_spinlock_t          lock;
15860 +       struct list_head        task_list;
15863 +struct swait_queue {
15864 +       struct task_struct      *task;
15865 +       struct list_head        task_list;
15868 +#define __SWAITQUEUE_INITIALIZER(name) {                               \
15869 +       .task           = current,                                      \
15870 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15873 +#define DECLARE_SWAITQUEUE(name)                                       \
15874 +       struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
15876 +#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {                         \
15877 +       .lock           = __RAW_SPIN_LOCK_UNLOCKED(name.lock),          \
15878 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15881 +#define DECLARE_SWAIT_QUEUE_HEAD(name)                                 \
15882 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
15884 +extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
15885 +                                   struct lock_class_key *key);
15887 +#define init_swait_queue_head(q)                               \
15888 +       do {                                                    \
15889 +               static struct lock_class_key __key;             \
15890 +               __init_swait_queue_head((q), #q, &__key);       \
15891 +       } while (0)
15893 +#ifdef CONFIG_LOCKDEP
15894 +# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)                 \
15895 +       ({ init_swait_queue_head(&name); name; })
15896 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15897 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
15898 +#else
15899 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15900 +       DECLARE_SWAIT_QUEUE_HEAD(name)
15901 +#endif
15903 +static inline int swait_active(struct swait_queue_head *q)
15905 +       return !list_empty(&q->task_list);
15908 +extern void swake_up(struct swait_queue_head *q);
15909 +extern void swake_up_all(struct swait_queue_head *q);
15910 +extern void swake_up_locked(struct swait_queue_head *q);
15911 +extern void swake_up_all_locked(struct swait_queue_head *q);
15913 +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
15914 +extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
15915 +extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
15917 +extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15918 +extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15920 +/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
15921 +#define ___swait_event(wq, condition, state, ret, cmd)                 \
15922 +({                                                                     \
15923 +       struct swait_queue __wait;                                      \
15924 +       long __ret = ret;                                               \
15925 +                                                                       \
15926 +       INIT_LIST_HEAD(&__wait.task_list);                              \
15927 +       for (;;) {                                                      \
15928 +               long __int = prepare_to_swait_event(&wq, &__wait, state);\
15929 +                                                                       \
15930 +               if (condition)                                          \
15931 +                       break;                                          \
15932 +                                                                       \
15933 +               if (___wait_is_interruptible(state) && __int) {         \
15934 +                       __ret = __int;                                  \
15935 +                       break;                                          \
15936 +               }                                                       \
15937 +                                                                       \
15938 +               cmd;                                                    \
15939 +       }                                                               \
15940 +       finish_swait(&wq, &__wait);                                     \
15941 +       __ret;                                                          \
15944 +#define __swait_event(wq, condition)                                   \
15945 +       (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,    \
15946 +                           schedule())
15948 +#define swait_event(wq, condition)                                     \
15949 +do {                                                                   \
15950 +       if (condition)                                                  \
15951 +               break;                                                  \
15952 +       __swait_event(wq, condition);                                   \
15953 +} while (0)
15955 +#define __swait_event_timeout(wq, condition, timeout)                  \
15956 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15957 +                     TASK_UNINTERRUPTIBLE, timeout,                    \
15958 +                     __ret = schedule_timeout(__ret))
15960 +#define swait_event_timeout(wq, condition, timeout)                    \
15961 +({                                                                     \
15962 +       long __ret = timeout;                                           \
15963 +       if (!___wait_cond_timeout(condition))                           \
15964 +               __ret = __swait_event_timeout(wq, condition, timeout);  \
15965 +       __ret;                                                          \
15968 +#define __swait_event_interruptible(wq, condition)                     \
15969 +       ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,            \
15970 +                     schedule())
15972 +#define swait_event_interruptible(wq, condition)                       \
15973 +({                                                                     \
15974 +       int __ret = 0;                                                  \
15975 +       if (!(condition))                                               \
15976 +               __ret = __swait_event_interruptible(wq, condition);     \
15977 +       __ret;                                                          \
15980 +#define __swait_event_interruptible_timeout(wq, condition, timeout)    \
15981 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15982 +                     TASK_INTERRUPTIBLE, timeout,                      \
15983 +                     __ret = schedule_timeout(__ret))
15985 +#define swait_event_interruptible_timeout(wq, condition, timeout)      \
15986 +({                                                                     \
15987 +       long __ret = timeout;                                           \
15988 +       if (!___wait_cond_timeout(condition))                           \
15989 +               __ret = __swait_event_interruptible_timeout(wq,         \
15990 +                                               condition, timeout);    \
15991 +       __ret;                                                          \
15994 +#endif /* _LINUX_SWAIT_H */
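To illustrate the API declared above, a minimal hypothetical waiter/waker pair (all names are invented):

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(example_wq);	/* hypothetical queue */
static bool example_done;

static int example_waiter(void *unused)
{
	/* Sleeps in TASK_INTERRUPTIBLE until example_done is set;
	 * returns -ERESTARTSYS if interrupted by a signal. */
	return swait_event_interruptible(example_wq, READ_ONCE(example_done));
}

static void example_waker(void)
{
	WRITE_ONCE(example_done, true);
	swake_up(&example_wq);		/* wakes one waiter; swake_up_all() wakes them all */
}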
15995 diff --git a/include/linux/swap.h b/include/linux/swap.h
15996 index d8ca2eaa3a8b..19e038054914 100644
15997 --- a/include/linux/swap.h
15998 +++ b/include/linux/swap.h
15999 @@ -11,6 +11,7 @@
16000  #include <linux/fs.h>
16001  #include <linux/atomic.h>
16002  #include <linux/page-flags.h>
16003 +#include <linux/locallock.h>
16004  #include <asm/page.h>
16006  struct notifier_block;
16007 @@ -252,7 +253,8 @@ struct swap_info_struct {
16008  void *workingset_eviction(struct address_space *mapping, struct page *page);
16009  bool workingset_refault(void *shadow);
16010  void workingset_activation(struct page *page);
16011 -extern struct list_lru workingset_shadow_nodes;
16012 +extern struct list_lru __workingset_shadow_nodes;
16013 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
16015  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
16017 @@ -298,6 +300,7 @@ extern unsigned long nr_free_pagecache_pages(void);
16020  /* linux/mm/swap.c */
16021 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
16022  extern void lru_cache_add(struct page *);
16023  extern void lru_cache_add_anon(struct page *page);
16024  extern void lru_cache_add_file(struct page *page);
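DECLARE_LOCAL_IRQ_LOCK() comes from the linux/locallock.h header added earlier in this patch. A rough sketch of how mm/ code is expected to take such a lock in place of a bare local_irq_save() section (the helper is hypothetical):

#include <linux/locallock.h>

static void example_drain_this_cpu(void)
{
	unsigned long flags;

	/* On !RT this behaves like local_irq_save(); on RT it takes a
	 * per-CPU spinlock so the section stays preemptible. */
	local_lock_irqsave(swapvec_lock, flags);
	/* ... operate on this CPU's LRU pagevecs ... */
	local_unlock_irqrestore(swapvec_lock, flags);
}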
16025 diff --git a/include/linux/swork.h b/include/linux/swork.h
16026 new file mode 100644
16027 index 000000000000..f175fa9a6016
16028 --- /dev/null
16029 +++ b/include/linux/swork.h
16030 @@ -0,0 +1,24 @@
16031 +#ifndef _LINUX_SWORK_H
16032 +#define _LINUX_SWORK_H
16034 +#include <linux/list.h>
16036 +struct swork_event {
16037 +       struct list_head item;
16038 +       unsigned long flags;
16039 +       void (*func)(struct swork_event *);
16042 +static inline void INIT_SWORK(struct swork_event *event,
16043 +                             void (*func)(struct swork_event *))
16045 +       event->flags = 0;
16046 +       event->func = func;
16049 +bool swork_queue(struct swork_event *sev);
16051 +int swork_get(void);
16052 +void swork_put(void);
16054 +#endif /* _LINUX_SWORK_H */
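A hypothetical user of the interface above, mirroring the kernel/cgroup.c hunk further down: swork_get() references the worker thread, INIT_SWORK() binds the callback, and swork_queue() defers execution to that thread.

#include <linux/swork.h>

static struct swork_event example_event;	/* invented example */

static void example_event_fn(struct swork_event *sev)
{
	/* runs in the swork kernel thread, fully preemptible */
}

static int __init example_setup(void)
{
	int ret = swork_get();		/* take a reference on the worker thread */

	if (ret)
		return ret;
	INIT_SWORK(&example_event, example_event_fn);
	return 0;
}

static void example_trigger(void)
{
	swork_queue(&example_event);	/* meant to be callable where sleeping is not allowed */
}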
16055 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
16056 index ff307b548ed3..be9f9dc6a4e1 100644
16057 --- a/include/linux/thread_info.h
16058 +++ b/include/linux/thread_info.h
16059 @@ -102,7 +102,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
16060  #define test_thread_flag(flag) \
16061         test_ti_thread_flag(current_thread_info(), flag)
16063 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
16064 +#ifdef CONFIG_PREEMPT_LAZY
16065 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
16066 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
16067 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
16068 +#define tif_need_resched_lazy()        test_thread_flag(TIF_NEED_RESCHED_LAZY)
16070 +#else
16071 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
16072 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
16073 +#define tif_need_resched_lazy()        0
16074 +#endif
16076  #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
16077  /*
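A small sketch of how callers are expected to consult the split flags: with PREEMPT_LAZY, in-kernel preemption points honour only the immediate flag, while return-to-user also honours the lazy one (the helper below is invented):

#include <linux/thread_info.h>

static inline bool example_needs_resched(bool hard_preempt_point)
{
	if (hard_preempt_point)
		return tif_need_resched_now();	/* only TIF_NEED_RESCHED */
	/* lazy path: either flag forces a reschedule before returning to user */
	return tif_need_resched();
}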
16078 diff --git a/include/linux/timer.h b/include/linux/timer.h
16079 index 61aa61dc410c..299d2b78591f 100644
16080 --- a/include/linux/timer.h
16081 +++ b/include/linux/timer.h
16082 @@ -225,7 +225,7 @@ extern void add_timer(struct timer_list *timer);
16084  extern int try_to_del_timer_sync(struct timer_list *timer);
16086 -#ifdef CONFIG_SMP
16087 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
16088    extern int del_timer_sync(struct timer_list *timer);
16089  #else
16090  # define del_timer_sync(t)             del_timer(t)
16091 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
16092 index 311176f290b2..62d4bff5f477 100644
16093 --- a/include/linux/trace_events.h
16094 +++ b/include/linux/trace_events.h
16095 @@ -66,6 +66,9 @@ struct trace_entry {
16096         unsigned char           flags;
16097         unsigned char           preempt_count;
16098         int                     pid;
16099 +       unsigned short          migrate_disable;
16100 +       unsigned short          padding;
16101 +       unsigned char           preempt_lazy_count;
16102  };
16104  #define TRACE_EVENT_TYPE_MAX                                           \
16105 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
16106 index 558129af828a..cf5c472bbc79 100644
16107 --- a/include/linux/uaccess.h
16108 +++ b/include/linux/uaccess.h
16109 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
16110   */
16111  static inline void pagefault_disable(void)
16113 +       migrate_disable();
16114         pagefault_disabled_inc();
16115         /*
16116          * make sure to have issued the store before a pagefault
16117 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
16118          */
16119         barrier();
16120         pagefault_disabled_dec();
16121 +       migrate_enable();
16124  /*
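With the change above, a pagefault-disabled section is also migration-disabled on RT, so the fault path sees a stable CPU. A sketch of the usual calling pattern (the helper is hypothetical):

#include <linux/uaccess.h>

static unsigned long example_peek_user(const unsigned long __user *uaddr)
{
	unsigned long val = 0;

	pagefault_disable();		/* now also migrate_disable() */
	if (__copy_from_user_inatomic(&val, uaddr, sizeof(val)))
		val = 0;		/* faulted; caller must cope */
	pagefault_enable();		/* now also migrate_enable() */
	return val;
}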
16125 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
16126 index 4a29c75b146e..0a294e950df8 100644
16127 --- a/include/linux/uprobes.h
16128 +++ b/include/linux/uprobes.h
16129 @@ -27,6 +27,7 @@
16130  #include <linux/errno.h>
16131  #include <linux/rbtree.h>
16132  #include <linux/types.h>
16133 +#include <linux/wait.h>
16135  struct vm_area_struct;
16136  struct mm_struct;
16137 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
16138 index 3e5d9075960f..7eaa847cd5a5 100644
16139 --- a/include/linux/vmstat.h
16140 +++ b/include/linux/vmstat.h
16141 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
16142   */
16143  static inline void __count_vm_event(enum vm_event_item item)
16145 +       preempt_disable_rt();
16146         raw_cpu_inc(vm_event_states.event[item]);
16147 +       preempt_enable_rt();
16150  static inline void count_vm_event(enum vm_event_item item)
16151 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
16153  static inline void __count_vm_events(enum vm_event_item item, long delta)
16155 +       preempt_disable_rt();
16156         raw_cpu_add(vm_event_states.event[item], delta);
16157 +       preempt_enable_rt();
16160  static inline void count_vm_events(enum vm_event_item item, long delta)
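preempt_disable_rt()/preempt_enable_rt() are provided by the linux/preempt.h hunk elsewhere in this patch; as an assumption of their shape there, roughly:

/* Assumed definition (see the preempt.h hunk of this patch): */
#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_disable_rt()		preempt_disable()
# define preempt_enable_rt()		preempt_enable()
#else
# define preempt_disable_rt()		barrier()
# define preempt_enable_rt()		barrier()
#endif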
16161 diff --git a/include/linux/wait.h b/include/linux/wait.h
16162 index 513b36f04dfd..981c8a840f96 100644
16163 --- a/include/linux/wait.h
16164 +++ b/include/linux/wait.h
16165 @@ -8,6 +8,7 @@
16166  #include <linux/spinlock.h>
16167  #include <asm/current.h>
16168  #include <uapi/linux/wait.h>
16169 +#include <linux/atomic.h>
16171  typedef struct __wait_queue wait_queue_t;
16172  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
16173 diff --git a/include/net/dst.h b/include/net/dst.h
16174 index e4f450617919..16cd3ef62202 100644
16175 --- a/include/net/dst.h
16176 +++ b/include/net/dst.h
16177 @@ -443,7 +443,7 @@ static inline void dst_confirm(struct dst_entry *dst)
16178  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
16179                                    struct sk_buff *skb)
16181 -       const struct hh_cache *hh;
16182 +       struct hh_cache *hh;
16184         if (dst->pending_confirm) {
16185                 unsigned long now = jiffies;
16186 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
16187 index 8b683841e574..bf656008f6e7 100644
16188 --- a/include/net/neighbour.h
16189 +++ b/include/net/neighbour.h
16190 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
16192  #endif
16194 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
16195 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
16197         unsigned int seq;
16198         int hh_len;
16199 @@ -501,7 +501,7 @@ struct neighbour_cb {
16201  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
16203 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
16204 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
16205                                      const struct net_device *dev)
16207         unsigned int seq;
16208 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
16209 index c68926b4899c..dd0751e76065 100644
16210 --- a/include/net/netns/ipv4.h
16211 +++ b/include/net/netns/ipv4.h
16212 @@ -70,6 +70,7 @@ struct netns_ipv4 {
16214         int sysctl_icmp_echo_ignore_all;
16215         int sysctl_icmp_echo_ignore_broadcasts;
16216 +       int sysctl_icmp_echo_sysrq;
16217         int sysctl_icmp_ignore_bogus_error_responses;
16218         int sysctl_icmp_ratelimit;
16219         int sysctl_icmp_ratemask;
16220 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
16221 new file mode 100644
16222 index 000000000000..f7710de1b1f3
16223 --- /dev/null
16224 +++ b/include/trace/events/hist.h
16225 @@ -0,0 +1,73 @@
16226 +#undef TRACE_SYSTEM
16227 +#define TRACE_SYSTEM hist
16229 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
16230 +#define _TRACE_HIST_H
16232 +#include "latency_hist.h"
16233 +#include <linux/tracepoint.h>
16235 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
16236 +#define trace_preemptirqsoff_hist(a, b)
16237 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
16238 +#else
16239 +TRACE_EVENT(preemptirqsoff_hist,
16241 +       TP_PROTO(int reason, int starthist),
16243 +       TP_ARGS(reason, starthist),
16245 +       TP_STRUCT__entry(
16246 +               __field(int,    reason)
16247 +               __field(int,    starthist)
16248 +       ),
16250 +       TP_fast_assign(
16251 +               __entry->reason         = reason;
16252 +               __entry->starthist      = starthist;
16253 +       ),
16255 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
16256 +                 __entry->starthist ? "start" : "stop")
16258 +#endif
16260 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
16261 +#define trace_hrtimer_interrupt(a, b, c, d)
16262 +#else
16263 +TRACE_EVENT(hrtimer_interrupt,
16265 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
16266 +               struct task_struct *task),
16268 +       TP_ARGS(cpu, offset, curr, task),
16270 +       TP_STRUCT__entry(
16271 +               __field(int,            cpu)
16272 +               __field(long long,      offset)
16273 +               __array(char,           ccomm,  TASK_COMM_LEN)
16274 +               __field(int,            cprio)
16275 +               __array(char,           tcomm,  TASK_COMM_LEN)
16276 +               __field(int,            tprio)
16277 +       ),
16279 +       TP_fast_assign(
16280 +               __entry->cpu    = cpu;
16281 +               __entry->offset = offset;
16282 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
16283 +               __entry->cprio  = curr->prio;
16284 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
16285 +                       task != NULL ? TASK_COMM_LEN : 7);
16286 +               __entry->tprio  = task != NULL ? task->prio : -1;
16287 +       ),
16289 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
16290 +               __entry->cpu, __entry->offset, __entry->ccomm,
16291 +               __entry->cprio, __entry->tcomm, __entry->tprio)
16293 +#endif
16295 +#endif /* _TRACE_HIST_H */
16297 +/* This part must be outside protection */
16298 +#include <trace/define_trace.h>
16299 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
16300 new file mode 100644
16301 index 000000000000..d3f2fbd560b1
16302 --- /dev/null
16303 +++ b/include/trace/events/latency_hist.h
16304 @@ -0,0 +1,29 @@
16305 +#ifndef _LATENCY_HIST_H
16306 +#define _LATENCY_HIST_H
16308 +enum hist_action {
16309 +       IRQS_ON,
16310 +       PREEMPT_ON,
16311 +       TRACE_STOP,
16312 +       IRQS_OFF,
16313 +       PREEMPT_OFF,
16314 +       TRACE_START,
16317 +static char *actions[] = {
16318 +       "IRQS_ON",
16319 +       "PREEMPT_ON",
16320 +       "TRACE_STOP",
16321 +       "IRQS_OFF",
16322 +       "PREEMPT_OFF",
16323 +       "TRACE_START",
16326 +static inline char *getaction(int action)
16328 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
16329 +               return actions[action];
16330 +       return "unknown";
16333 +#endif /* _LATENCY_HIST_H */
16334 diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
16335 index fff846b512e6..73614ce1d204 100644
16336 --- a/include/trace/events/writeback.h
16337 +++ b/include/trace/events/writeback.h
16338 @@ -134,58 +134,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
16339  #ifdef CREATE_TRACE_POINTS
16340  #ifdef CONFIG_CGROUP_WRITEBACK
16342 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
16343 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
16345 -       return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
16346 +       return wb->memcg_css->cgroup->kn->ino;
16349 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
16351 -       struct cgroup *cgrp = wb->memcg_css->cgroup;
16352 -       char *path;
16354 -       path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
16355 -       WARN_ON_ONCE(path != buf);
16358 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
16360 -       if (wbc->wb)
16361 -               return __trace_wb_cgroup_size(wbc->wb);
16362 -       else
16363 -               return 2;
16366 -static inline void __trace_wbc_assign_cgroup(char *buf,
16367 -                                            struct writeback_control *wbc)
16368 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
16370         if (wbc->wb)
16371 -               __trace_wb_assign_cgroup(buf, wbc->wb);
16372 +               return __trace_wb_assign_cgroup(wbc->wb);
16373         else
16374 -               strcpy(buf, "/");
16375 +               return -1U;
16378  #else  /* CONFIG_CGROUP_WRITEBACK */
16380 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
16382 -       return 2;
16385 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
16387 -       strcpy(buf, "/");
16390 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
16391 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
16393 -       return 2;
16394 +       return -1U;
16397 -static inline void __trace_wbc_assign_cgroup(char *buf,
16398 -                                            struct writeback_control *wbc)
16399 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
16401 -       strcpy(buf, "/");
16402 +       return -1U;
16405  #endif /* CONFIG_CGROUP_WRITEBACK */
16406 @@ -201,7 +171,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
16407                 __array(char, name, 32)
16408                 __field(unsigned long, ino)
16409                 __field(int, sync_mode)
16410 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16411 +               __field(unsigned int, cgroup_ino)
16412         ),
16414         TP_fast_assign(
16415 @@ -209,14 +179,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
16416                         dev_name(inode_to_bdi(inode)->dev), 32);
16417                 __entry->ino            = inode->i_ino;
16418                 __entry->sync_mode      = wbc->sync_mode;
16419 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16420 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16421         ),
16423 -       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
16424 +       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u",
16425                 __entry->name,
16426                 __entry->ino,
16427                 __entry->sync_mode,
16428 -               __get_str(cgroup)
16429 +               __entry->cgroup_ino
16430         )
16431  );
16433 @@ -246,7 +216,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16434                 __field(int, range_cyclic)
16435                 __field(int, for_background)
16436                 __field(int, reason)
16437 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16438 +               __field(unsigned int, cgroup_ino)
16439         ),
16440         TP_fast_assign(
16441                 strncpy(__entry->name,
16442 @@ -258,10 +228,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16443                 __entry->range_cyclic = work->range_cyclic;
16444                 __entry->for_background = work->for_background;
16445                 __entry->reason = work->reason;
16446 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16447 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
16448         ),
16449         TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
16450 -                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
16451 +                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u",
16452                   __entry->name,
16453                   MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
16454                   __entry->nr_pages,
16455 @@ -270,7 +240,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16456                   __entry->range_cyclic,
16457                   __entry->for_background,
16458                   __print_symbolic(__entry->reason, WB_WORK_REASON),
16459 -                 __get_str(cgroup)
16460 +                 __entry->cgroup_ino
16461         )
16462  );
16463  #define DEFINE_WRITEBACK_WORK_EVENT(name) \
16464 @@ -300,15 +270,15 @@ DECLARE_EVENT_CLASS(writeback_class,
16465         TP_ARGS(wb),
16466         TP_STRUCT__entry(
16467                 __array(char, name, 32)
16468 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16469 +               __field(unsigned int, cgroup_ino)
16470         ),
16471         TP_fast_assign(
16472                 strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
16473 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16474 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
16475         ),
16476 -       TP_printk("bdi %s: cgroup=%s",
16477 +       TP_printk("bdi %s: cgroup_ino=%u",
16478                   __entry->name,
16479 -                 __get_str(cgroup)
16480 +                 __entry->cgroup_ino
16481         )
16482  );
16483  #define DEFINE_WRITEBACK_EVENT(name) \
16484 @@ -347,7 +317,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16485                 __field(int, range_cyclic)
16486                 __field(long, range_start)
16487                 __field(long, range_end)
16488 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16489 +               __field(unsigned int, cgroup_ino)
16490         ),
16492         TP_fast_assign(
16493 @@ -361,12 +331,12 @@ DECLARE_EVENT_CLASS(wbc_class,
16494                 __entry->range_cyclic   = wbc->range_cyclic;
16495                 __entry->range_start    = (long)wbc->range_start;
16496                 __entry->range_end      = (long)wbc->range_end;
16497 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16498 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16499         ),
16501         TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
16502                 "bgrd=%d reclm=%d cyclic=%d "
16503 -               "start=0x%lx end=0x%lx cgroup=%s",
16504 +               "start=0x%lx end=0x%lx cgroup_ino=%u",
16505                 __entry->name,
16506                 __entry->nr_to_write,
16507                 __entry->pages_skipped,
16508 @@ -377,7 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16509                 __entry->range_cyclic,
16510                 __entry->range_start,
16511                 __entry->range_end,
16512 -               __get_str(cgroup)
16513 +               __entry->cgroup_ino
16514         )
16517 @@ -398,7 +368,7 @@ TRACE_EVENT(writeback_queue_io,
16518                 __field(long,           age)
16519                 __field(int,            moved)
16520                 __field(int,            reason)
16521 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16522 +               __field(unsigned int,   cgroup_ino)
16523         ),
16524         TP_fast_assign(
16525                 unsigned long *older_than_this = work->older_than_this;
16526 @@ -408,15 +378,15 @@ TRACE_EVENT(writeback_queue_io,
16527                                   (jiffies - *older_than_this) * 1000 / HZ : -1;
16528                 __entry->moved  = moved;
16529                 __entry->reason = work->reason;
16530 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16531 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16532         ),
16533 -       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
16534 +       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u",
16535                 __entry->name,
16536                 __entry->older, /* older_than_this in jiffies */
16537                 __entry->age,   /* older_than_this in relative milliseconds */
16538                 __entry->moved,
16539                 __print_symbolic(__entry->reason, WB_WORK_REASON),
16540 -               __get_str(cgroup)
16541 +               __entry->cgroup_ino
16542         )
16543  );
16545 @@ -484,7 +454,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16546                 __field(unsigned long,  dirty_ratelimit)
16547                 __field(unsigned long,  task_ratelimit)
16548                 __field(unsigned long,  balanced_dirty_ratelimit)
16549 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16550 +               __field(unsigned int,   cgroup_ino)
16551         ),
16553         TP_fast_assign(
16554 @@ -496,13 +466,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16555                 __entry->task_ratelimit = KBps(task_ratelimit);
16556                 __entry->balanced_dirty_ratelimit =
16557                                         KBps(wb->balanced_dirty_ratelimit);
16558 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16559 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16560         ),
16562         TP_printk("bdi %s: "
16563                   "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
16564                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16565 -                 "balanced_dirty_ratelimit=%lu cgroup=%s",
16566 +                 "balanced_dirty_ratelimit=%lu cgroup_ino=%u",
16567                   __entry->bdi,
16568                   __entry->write_bw,            /* write bandwidth */
16569                   __entry->avg_write_bw,        /* avg write bandwidth */
16570 @@ -510,7 +480,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16571                   __entry->dirty_ratelimit,     /* base ratelimit */
16572                   __entry->task_ratelimit, /* ratelimit with position control */
16573                   __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
16574 -                 __get_str(cgroup)
16575 +                 __entry->cgroup_ino
16576         )
16577  );
16579 @@ -548,7 +518,7 @@ TRACE_EVENT(balance_dirty_pages,
16580                 __field(         long,  pause)
16581                 __field(unsigned long,  period)
16582                 __field(         long,  think)
16583 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16584 +               __field(unsigned int,   cgroup_ino)
16585         ),
16587         TP_fast_assign(
16588 @@ -571,7 +541,7 @@ TRACE_EVENT(balance_dirty_pages,
16589                 __entry->period         = period * 1000 / HZ;
16590                 __entry->pause          = pause * 1000 / HZ;
16591                 __entry->paused         = (jiffies - start_time) * 1000 / HZ;
16592 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16593 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16594         ),
16597 @@ -580,7 +550,7 @@ TRACE_EVENT(balance_dirty_pages,
16598                   "bdi_setpoint=%lu bdi_dirty=%lu "
16599                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16600                   "dirtied=%u dirtied_pause=%u "
16601 -                 "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
16602 +                 "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u",
16603                   __entry->bdi,
16604                   __entry->limit,
16605                   __entry->setpoint,
16606 @@ -595,7 +565,7 @@ TRACE_EVENT(balance_dirty_pages,
16607                   __entry->pause,       /* ms */
16608                   __entry->period,      /* ms */
16609                   __entry->think,       /* ms */
16610 -                 __get_str(cgroup)
16611 +                 __entry->cgroup_ino
16612           )
16613  );
16615 @@ -609,8 +579,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16616                 __field(unsigned long, ino)
16617                 __field(unsigned long, state)
16618                 __field(unsigned long, dirtied_when)
16619 -               __dynamic_array(char, cgroup,
16620 -                               __trace_wb_cgroup_size(inode_to_wb(inode)))
16621 +               __field(unsigned int, cgroup_ino)
16622         ),
16624         TP_fast_assign(
16625 @@ -619,16 +588,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16626                 __entry->ino            = inode->i_ino;
16627                 __entry->state          = inode->i_state;
16628                 __entry->dirtied_when   = inode->dirtied_when;
16629 -               __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
16630 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(inode_to_wb(inode));
16631         ),
16633 -       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
16634 +       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u",
16635                   __entry->name,
16636                   __entry->ino,
16637                   show_inode_state(__entry->state),
16638                   __entry->dirtied_when,
16639                   (jiffies - __entry->dirtied_when) / HZ,
16640 -                 __get_str(cgroup)
16641 +                 __entry->cgroup_ino
16642         )
16643  );
16645 @@ -684,7 +653,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16646                 __field(unsigned long, writeback_index)
16647                 __field(long, nr_to_write)
16648                 __field(unsigned long, wrote)
16649 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16650 +               __field(unsigned int, cgroup_ino)
16651         ),
16653         TP_fast_assign(
16654 @@ -696,11 +665,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16655                 __entry->writeback_index = inode->i_mapping->writeback_index;
16656                 __entry->nr_to_write    = nr_to_write;
16657                 __entry->wrote          = nr_to_write - wbc->nr_to_write;
16658 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16659 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16660         ),
16662         TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
16663 -                 "index=%lu to_write=%ld wrote=%lu cgroup=%s",
16664 +                 "index=%lu to_write=%ld wrote=%lu cgroup_ino=%u",
16665                   __entry->name,
16666                   __entry->ino,
16667                   show_inode_state(__entry->state),
16668 @@ -709,7 +678,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16669                   __entry->writeback_index,
16670                   __entry->nr_to_write,
16671                   __entry->wrote,
16672 -                 __get_str(cgroup)
16673 +                 __entry->cgroup_ino
16674         )
16675  );
16677 diff --git a/init/Kconfig b/init/Kconfig
16678 index 235c7a2c0d20..a7c81c0911da 100644
16679 --- a/init/Kconfig
16680 +++ b/init/Kconfig
16681 @@ -498,7 +498,7 @@ config TINY_RCU
16683  config RCU_EXPERT
16684         bool "Make expert-level adjustments to RCU configuration"
16685 -       default n
16686 +       default y if PREEMPT_RT_FULL
16687         help
16688           This option needs to be enabled if you wish to make
16689           expert-level adjustments to RCU configuration.  By default,
16690 @@ -614,7 +614,7 @@ config RCU_FANOUT_LEAF
16692  config RCU_FAST_NO_HZ
16693         bool "Accelerate last non-dyntick-idle CPU's grace periods"
16694 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
16695 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
16696         default n
16697         help
16698           This option permits CPUs to enter dynticks-idle state even if
16699 @@ -641,7 +641,7 @@ config TREE_RCU_TRACE
16700  config RCU_BOOST
16701         bool "Enable RCU priority boosting"
16702         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
16703 -       default n
16704 +       default y if PREEMPT_RT_FULL
16705         help
16706           This option boosts the priority of preempted RCU readers that
16707           block the current preemptible RCU grace period for too long.
16708 @@ -1106,6 +1106,7 @@ config CFS_BANDWIDTH
16709  config RT_GROUP_SCHED
16710         bool "Group scheduling for SCHED_RR/FIFO"
16711         depends on CGROUP_SCHED
16712 +       depends on !PREEMPT_RT_FULL
16713         default n
16714         help
16715           This feature lets you explicitly allocate real CPU bandwidth
16716 @@ -1719,6 +1720,7 @@ choice
16718  config SLAB
16719         bool "SLAB"
16720 +       depends on !PREEMPT_RT_FULL
16721         help
16722           The regular slab allocator that is established and known to work
16723           well in all environments. It organizes cache hot objects in
16724 @@ -1737,6 +1739,7 @@ config SLUB
16725  config SLOB
16726         depends on EXPERT
16727         bool "SLOB (Simple Allocator)"
16728 +       depends on !PREEMPT_RT_FULL
16729         help
16730            SLOB replaces the stock allocator with a drastically simpler
16731            allocator. SLOB is generally more space efficient but
16732 @@ -1746,7 +1749,7 @@ endchoice
16734  config SLUB_CPU_PARTIAL
16735         default y
16736 -       depends on SLUB && SMP
16737 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
16738         bool "SLUB per cpu partial cache"
16739         help
16740           Per cpu partial caches accellerate objects allocation and freeing
16741 diff --git a/init/Makefile b/init/Makefile
16742 index 7bc47ee31c36..88cf473554e0 100644
16743 --- a/init/Makefile
16744 +++ b/init/Makefile
16745 @@ -33,4 +33,4 @@ $(obj)/version.o: include/generated/compile.h
16746  include/generated/compile.h: FORCE
16747         @$($(quiet)chk_compile.h)
16748         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16749 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16750 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16751 diff --git a/init/main.c b/init/main.c
16752 index 49926d95442f..995cbb8d09b6 100644
16753 --- a/init/main.c
16754 +++ b/init/main.c
16755 @@ -532,6 +532,7 @@ asmlinkage __visible void __init start_kernel(void)
16756         setup_command_line(command_line);
16757         setup_nr_cpu_ids();
16758         setup_per_cpu_areas();
16759 +       softirq_early_init();
16760         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16762         build_all_zonelists(NULL, NULL);
16763 diff --git a/ipc/msg.c b/ipc/msg.c
16764 index c6521c205cb4..996d89023552 100644
16765 --- a/ipc/msg.c
16766 +++ b/ipc/msg.c
16767 @@ -183,20 +183,14 @@ static void ss_wakeup(struct list_head *h, int kill)
16768         }
16771 -static void expunge_all(struct msg_queue *msq, int res)
16772 +static void expunge_all(struct msg_queue *msq, int res,
16773 +                       struct wake_q_head *wake_q)
16775         struct msg_receiver *msr, *t;
16777         list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
16778 -               msr->r_msg = NULL; /* initialize expunge ordering */
16779 -               wake_up_process(msr->r_tsk);
16780 -               /*
16781 -                * Ensure that the wakeup is visible before setting r_msg as
16782 -                * the receiving end depends on it: either spinning on a nil,
16783 -                * or dealing with -EAGAIN cases. See lockless receive part 1
16784 -                * and 2 in do_msgrcv().
16785 -                */
16786 -               smp_wmb(); /* barrier (B) */
16788 +               wake_q_add(wake_q, msr->r_tsk);
16789                 msr->r_msg = ERR_PTR(res);
16790         }
16792 @@ -213,11 +207,13 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
16794         struct msg_msg *msg, *t;
16795         struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
16796 +       WAKE_Q(wake_q);
16798 -       expunge_all(msq, -EIDRM);
16799 +       expunge_all(msq, -EIDRM, &wake_q);
16800         ss_wakeup(&msq->q_senders, 1);
16801         msg_rmid(ns, msq);
16802         ipc_unlock_object(&msq->q_perm);
16803 +       wake_up_q(&wake_q);
16804         rcu_read_unlock();
16806         list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
16807 @@ -342,6 +338,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16808         struct kern_ipc_perm *ipcp;
16809         struct msqid64_ds uninitialized_var(msqid64);
16810         struct msg_queue *msq;
16811 +       WAKE_Q(wake_q);
16812         int err;
16814         if (cmd == IPC_SET) {
16815 @@ -389,7 +386,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16816                 /* sleeping receivers might be excluded by
16817                  * stricter permissions.
16818                  */
16819 -               expunge_all(msq, -EAGAIN);
16820 +               expunge_all(msq, -EAGAIN, &wake_q);
16821                 /* sleeping senders might be able to send
16822                  * due to a larger queue size.
16823                  */
16824 @@ -402,6 +399,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16826  out_unlock0:
16827         ipc_unlock_object(&msq->q_perm);
16828 +       wake_up_q(&wake_q);
16829  out_unlock1:
16830         rcu_read_unlock();
16831  out_up:
16832 @@ -566,7 +564,8 @@ static int testmsg(struct msg_msg *msg, long type, int mode)
16833         return 0;
16836 -static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16837 +static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg,
16838 +                                struct wake_q_head *wake_q)
16840         struct msg_receiver *msr, *t;
16842 @@ -577,27 +576,13 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16844                         list_del(&msr->r_list);
16845                         if (msr->r_maxsize < msg->m_ts) {
16846 -                               /* initialize pipelined send ordering */
16847 -                               msr->r_msg = NULL;
16848 -                               wake_up_process(msr->r_tsk);
16849 -                               /* barrier (B) see barrier comment below */
16850 -                               smp_wmb();
16851 +                               wake_q_add(wake_q, msr->r_tsk);
16852                                 msr->r_msg = ERR_PTR(-E2BIG);
16853                         } else {
16854 -                               msr->r_msg = NULL;
16855                                 msq->q_lrpid = task_pid_vnr(msr->r_tsk);
16856                                 msq->q_rtime = get_seconds();
16857 -                               wake_up_process(msr->r_tsk);
16858 -                               /*
16859 -                                * Ensure that the wakeup is visible before
16860 -                                * setting r_msg, as the receiving can otherwise
16861 -                                * exit - once r_msg is set, the receiver can
16862 -                                * continue. See lockless receive part 1 and 2
16863 -                                * in do_msgrcv(). Barrier (B).
16864 -                                */
16865 -                               smp_wmb();
16866 +                               wake_q_add(wake_q, msr->r_tsk);
16867                                 msr->r_msg = msg;
16869                                 return 1;
16870                         }
16871                 }
16872 @@ -613,6 +598,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16873         struct msg_msg *msg;
16874         int err;
16875         struct ipc_namespace *ns;
16876 +       WAKE_Q(wake_q);
16878         ns = current->nsproxy->ipc_ns;
16880 @@ -698,7 +684,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16881         msq->q_lspid = task_tgid_vnr(current);
16882         msq->q_stime = get_seconds();
16884 -       if (!pipelined_send(msq, msg)) {
16885 +       if (!pipelined_send(msq, msg, &wake_q)) {
16886                 /* no one is waiting for this message, enqueue it */
16887                 list_add_tail(&msg->m_list, &msq->q_messages);
16888                 msq->q_cbytes += msgsz;
16889 @@ -712,6 +698,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16891  out_unlock0:
16892         ipc_unlock_object(&msq->q_perm);
16893 +       wake_up_q(&wake_q);
16894  out_unlock1:
16895         rcu_read_unlock();
16896         if (msg != NULL)
16897 @@ -932,57 +919,25 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
16898                 rcu_read_lock();
16900                 /* Lockless receive, part 2:
16901 -                * Wait until pipelined_send or expunge_all are outside of
16902 -                * wake_up_process(). There is a race with exit(), see
16903 -                * ipc/mqueue.c for the details. The correct serialization
16904 -                * ensures that a receiver cannot continue without the wakeup
16905 -                * being visibible _before_ setting r_msg:
16906 +                * The work in pipelined_send() and expunge_all():
16907 +                * - Set pointer to message
16908 +                * - Queue the receiver task for later wakeup
16909 +                * - Wake up the process after the lock is dropped.
16910                  *
16911 -                * CPU 0                             CPU 1
16912 -                * <loop receiver>
16913 -                *   smp_rmb(); (A) <-- pair -.      <waker thread>
16914 -                *   <load ->r_msg>           |        msr->r_msg = NULL;
16915 -                *                            |        wake_up_process();
16916 -                * <continue>                 `------> smp_wmb(); (B)
16917 -                *                                     msr->r_msg = msg;
16918 -                *
16919 -                * Where (A) orders the message value read and where (B) orders
16920 -                * the write to the r_msg -- done in both pipelined_send and
16921 -                * expunge_all.
16922 +                * Should the process wake up before this wakeup (due to a
16923 +                * signal) it will either see the message and continue …
16924                  */
16925 -               for (;;) {
16926 -                       /*
16927 -                        * Pairs with writer barrier in pipelined_send
16928 -                        * or expunge_all.
16929 -                        */
16930 -                       smp_rmb(); /* barrier (A) */
16931 -                       msg = (struct msg_msg *)msr_d.r_msg;
16932 -                       if (msg)
16933 -                               break;
16935 -                       /*
16936 -                        * The cpu_relax() call is a compiler barrier
16937 -                        * which forces everything in this loop to be
16938 -                        * re-loaded.
16939 -                        */
16940 -                       cpu_relax();
16941 -               }
16943 -               /* Lockless receive, part 3:
16944 -                * If there is a message or an error then accept it without
16945 -                * locking.
16946 -                */
16947 +               msg = (struct msg_msg *)msr_d.r_msg;
16948                 if (msg != ERR_PTR(-EAGAIN))
16949                         goto out_unlock1;
16951 -               /* Lockless receive, part 3:
16952 -                * Acquire the queue spinlock.
16953 -                */
16954 +                /*
16955 +                 * … or see -EAGAIN, acquire the lock to check the message
16956 +                 * again.
16957 +                 */
16958                 ipc_lock_object(&msq->q_perm);
16960 -               /* Lockless receive, part 4:
16961 -                * Repeat test after acquiring the spinlock.
16962 -                */
16963                 msg = (struct msg_msg *)msr_d.r_msg;
16964                 if (msg != ERR_PTR(-EAGAIN))
16965                         goto out_unlock0;
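A sketch of the wake_q pattern the hunk above converts ipc/msg.c to: wakeups are queued while the object lock is held and issued only after it is dropped, so a woken receiver never bounces straight back off that lock (the helper is hypothetical):

#include <linux/sched.h>
#include <linux/spinlock.h>

static void example_wake_one(spinlock_t *lock, struct task_struct *waiter)
{
	WAKE_Q(wake_q);			/* on-stack wake queue */

	spin_lock(lock);
	wake_q_add(&wake_q, waiter);	/* takes a reference on the task */
	spin_unlock(lock);
	wake_up_q(&wake_q);		/* performs the wake_up_process() calls */
}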
16966 diff --git a/ipc/sem.c b/ipc/sem.c
16967 index 9862c3d1c26d..ef34d7376697 100644
16968 --- a/ipc/sem.c
16969 +++ b/ipc/sem.c
16970 @@ -708,6 +708,13 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
16971  static void wake_up_sem_queue_prepare(struct list_head *pt,
16972                                 struct sem_queue *q, int error)
16974 +#ifdef CONFIG_PREEMPT_RT_BASE
16975 +       struct task_struct *p = q->sleeper;
16976 +       get_task_struct(p);
16977 +       q->status = error;
16978 +       wake_up_process(p);
16979 +       put_task_struct(p);
16980 +#else
16981         if (list_empty(pt)) {
16982                 /*
16983                  * Hold preempt off so that we don't get preempted and have the
16984 @@ -719,6 +726,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16985         q->pid = error;
16987         list_add_tail(&q->list, pt);
16988 +#endif
16991  /**
16992 @@ -732,6 +740,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16993   */
16994  static void wake_up_sem_queue_do(struct list_head *pt)
16996 +#ifndef CONFIG_PREEMPT_RT_BASE
16997         struct sem_queue *q, *t;
16998         int did_something;
17000 @@ -744,6 +753,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
17001         }
17002         if (did_something)
17003                 preempt_enable();
17004 +#endif
17007  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
17008 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
17009 index ebdb0043203a..b9e6aa7e5aa6 100644
17010 --- a/kernel/Kconfig.locks
17011 +++ b/kernel/Kconfig.locks
17012 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
17014  config MUTEX_SPIN_ON_OWNER
17015         def_bool y
17016 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
17017 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
17019  config RWSEM_SPIN_ON_OWNER
17020         def_bool y
17021 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
17022 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
17024  config LOCK_SPIN_ON_OWNER
17025         def_bool y
17026 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
17027 index 3f9c97419f02..11dbe26a8279 100644
17028 --- a/kernel/Kconfig.preempt
17029 +++ b/kernel/Kconfig.preempt
17030 @@ -1,3 +1,16 @@
17031 +config PREEMPT
17032 +       bool
17033 +       select PREEMPT_COUNT
17035 +config PREEMPT_RT_BASE
17036 +       bool
17037 +       select PREEMPT
17039 +config HAVE_PREEMPT_LAZY
17040 +       bool
17042 +config PREEMPT_LAZY
17043 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
17045  choice
17046         prompt "Preemption Model"
17047 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
17049           Select this if you are building a kernel for a desktop system.
17051 -config PREEMPT
17052 +config PREEMPT__LL
17053         bool "Preemptible Kernel (Low-Latency Desktop)"
17054 -       select PREEMPT_COUNT
17055 +       select PREEMPT
17056         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
17057         help
17058           This option reduces the latency of the kernel by making
17059 @@ -52,6 +65,22 @@ config PREEMPT
17060           embedded system with latency requirements in the milliseconds
17061           range.
17063 +config PREEMPT_RTB
17064 +       bool "Preemptible Kernel (Basic RT)"
17065 +       select PREEMPT_RT_BASE
17066 +       help
17067 +         This option is basically the same as (Low-Latency Desktop) but
17068 +         enables changes which are preliminary for the full preemptible
17069 +         RT kernel.
17071 +config PREEMPT_RT_FULL
17072 +       bool "Fully Preemptible Kernel (RT)"
17073 +       depends on IRQ_FORCED_THREADING
17074 +       select PREEMPT_RT_BASE
17075 +       select PREEMPT_RCU
17076 +       help
17077 +         All and everything
17079  endchoice
17081  config PREEMPT_COUNT
17082 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
17083 index 4cb94b678e9f..8c41ee8a6fee 100644
17084 --- a/kernel/cgroup.c
17085 +++ b/kernel/cgroup.c
17086 @@ -4741,10 +4741,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
17087         queue_work(cgroup_destroy_wq, &css->destroy_work);
17090 -static void css_release_work_fn(struct work_struct *work)
17091 +static void css_release_work_fn(struct swork_event *sev)
17093         struct cgroup_subsys_state *css =
17094 -               container_of(work, struct cgroup_subsys_state, destroy_work);
17095 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
17096         struct cgroup_subsys *ss = css->ss;
17097         struct cgroup *cgrp = css->cgroup;
17099 @@ -4783,8 +4783,8 @@ static void css_release(struct percpu_ref *ref)
17100         struct cgroup_subsys_state *css =
17101                 container_of(ref, struct cgroup_subsys_state, refcnt);
17103 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
17104 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
17105 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
17106 +       swork_queue(&css->destroy_swork);
17109  static void init_and_link_css(struct cgroup_subsys_state *css,
17110 @@ -5401,6 +5401,7 @@ static int __init cgroup_wq_init(void)
17111          */
17112         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
17113         BUG_ON(!cgroup_destroy_wq);
17114 +       BUG_ON(swork_get());
17116         /*
17117          * Used to destroy pidlists and separate to serve as flush domain.
17118 diff --git a/kernel/cpu.c b/kernel/cpu.c
17119 index 40d20bf5de28..0be18c1684d8 100644
17120 --- a/kernel/cpu.c
17121 +++ b/kernel/cpu.c
17122 @@ -75,8 +75,8 @@ static struct {
17123  #endif
17124  } cpu_hotplug = {
17125         .active_writer = NULL,
17126 -       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
17127         .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
17128 +       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
17129  #ifdef CONFIG_DEBUG_LOCK_ALLOC
17130         .dep_map = {.name = "cpu_hotplug.lock" },
17131  #endif
17132 @@ -89,6 +89,289 @@ static struct {
17133  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
17134  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
17136 +/**
17137 + * hotplug_pcp - per cpu hotplug descriptor
17138 + * @unplug:    set when pin_current_cpu() needs to sync tasks
17139 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
17140 + * @refcount:  counter of tasks in pinned sections
17141 + * @grab_lock: set when the tasks entering pinned sections should wait
17142 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
17143 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
17144 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
17145 + *
17146 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
17147 + * is used as a flag and still exists after @sync_tsk has exited and
17148 + * @sync_tsk set to NULL.
17149 + */
17150 +struct hotplug_pcp {
17151 +       struct task_struct *unplug;
17152 +       struct task_struct *sync_tsk;
17153 +       int refcount;
17154 +       int grab_lock;
17155 +       struct completion synced;
17156 +       struct completion unplug_wait;
17157 +#ifdef CONFIG_PREEMPT_RT_FULL
17158 +       /*
17159 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
17160 +        * the task, otherwise the mutex will cause the task to fail
17161 +        * to sleep when required. (Because it's called from migrate_disable())
17162 +        *
17163 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
17164 +        * state.
17165 +        */
17166 +       spinlock_t lock;
17167 +#else
17168 +       struct mutex mutex;
17169 +#endif
17170 +       int mutex_init;
17173 +#ifdef CONFIG_PREEMPT_RT_FULL
17174 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
17175 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
17176 +#else
17177 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
17178 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
17179 +#endif
17181 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
17183 +/**
17184 + * pin_current_cpu - Prevent the current cpu from being unplugged
17185 + *
17186 + * Lightweight version of get_online_cpus() to prevent cpu from being
17187 + * unplugged when code runs in a migration disabled region.
17188 + *
17189 + * Must be called with preemption disabled (preempt_count = 1)!
17190 + */
17191 +void pin_current_cpu(void)
17193 +       struct hotplug_pcp *hp;
17194 +       int force = 0;
17196 +retry:
17197 +       hp = this_cpu_ptr(&hotplug_pcp);
17199 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
17200 +           hp->unplug == current) {
17201 +               hp->refcount++;
17202 +               return;
17203 +       }
17204 +       if (hp->grab_lock) {
17205 +               preempt_enable();
17206 +               hotplug_lock(hp);
17207 +               hotplug_unlock(hp);
17208 +       } else {
17209 +               preempt_enable();
17210 +               /*
17211 +                * Try to push this task off of this CPU.
17212 +                */
17213 +               if (!migrate_me()) {
17214 +                       preempt_disable();
17215 +                       hp = this_cpu_ptr(&hotplug_pcp);
17216 +                       if (!hp->grab_lock) {
17217 +                               /*
17218 +                                * Just let it continue, it's already pinned
17219 +                                * or about to sleep.
17220 +                                */
17221 +                               force = 1;
17222 +                               goto retry;
17223 +                       }
17224 +                       preempt_enable();
17225 +               }
17226 +       }
17227 +       preempt_disable();
17228 +       goto retry;
17231 +/**
17232 + * unpin_current_cpu - Allow unplug of current cpu
17233 + *
17234 + * Must be called with preemption or interrupts disabled!
17235 + */
17236 +void unpin_current_cpu(void)
17238 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
17240 +       WARN_ON(hp->refcount <= 0);
17242 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
17243 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
17244 +               wake_up_process(hp->unplug);
17247 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
17249 +       set_current_state(TASK_UNINTERRUPTIBLE);
17250 +       while (hp->refcount) {
17251 +               schedule_preempt_disabled();
17252 +               set_current_state(TASK_UNINTERRUPTIBLE);
17253 +       }
17256 +static int sync_unplug_thread(void *data)
17258 +       struct hotplug_pcp *hp = data;
17260 +       wait_for_completion(&hp->unplug_wait);
17261 +       preempt_disable();
17262 +       hp->unplug = current;
17263 +       wait_for_pinned_cpus(hp);
17265 +       /*
17266 +        * This thread will synchronize the cpu_down() with threads
17267 +        * that have pinned the CPU. When the pinned CPU count reaches
17268 +        * zero, we inform the cpu_down code to continue to the next step.
17269 +        */
17270 +       set_current_state(TASK_UNINTERRUPTIBLE);
17271 +       preempt_enable();
17272 +       complete(&hp->synced);
17274 +       /*
17275 +        * If all succeeds, the next step will need tasks to wait till
17276 +        * the CPU is offline before continuing. To do this, the grab_lock
17277 +        * is set and tasks going into pin_current_cpu() will block on the
17278 +        * mutex. But we still need to wait for those that are already in
17279 +        * pinned CPU sections. If cpu_down() fails, kthread_should_stop()
17280 +        * will kick this thread out.
17281 +        */
17282 +       while (!hp->grab_lock && !kthread_should_stop()) {
17283 +               schedule();
17284 +               set_current_state(TASK_UNINTERRUPTIBLE);
17285 +       }
17287 +       /* Make sure grab_lock is seen before we see a stale completion */
17288 +       smp_mb();
17290 +       /*
17291 +        * Now just before cpu_down() enters stop machine, we need to make
17292 +        * sure all tasks that are in pinned CPU sections are out, and new
17293 +        * tasks will now grab the lock, keeping them from entering pinned
17294 +        * CPU sections.
17295 +        */
17296 +       if (!kthread_should_stop()) {
17297 +               preempt_disable();
17298 +               wait_for_pinned_cpus(hp);
17299 +               preempt_enable();
17300 +               complete(&hp->synced);
17301 +       }
17303 +       set_current_state(TASK_UNINTERRUPTIBLE);
17304 +       while (!kthread_should_stop()) {
17305 +               schedule();
17306 +               set_current_state(TASK_UNINTERRUPTIBLE);
17307 +       }
17308 +       set_current_state(TASK_RUNNING);
17310 +       /*
17311 +        * Force this thread off this CPU as it's going down and
17312 +        * we don't want any more work on this CPU.
17313 +        */
17314 +       current->flags &= ~PF_NO_SETAFFINITY;
17315 +       set_cpus_allowed_ptr(current, cpu_present_mask);
17316 +       migrate_me();
17317 +       return 0;
17320 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
17322 +       wake_up_process(hp->sync_tsk);
17323 +       wait_for_completion(&hp->synced);
17326 +static void __cpu_unplug_wait(unsigned int cpu)
17328 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
17330 +       complete(&hp->unplug_wait);
17331 +       wait_for_completion(&hp->synced);
17335 + * Start the sync_unplug_thread on the target cpu and wait for it to
17336 + * complete.
17337 + */
17338 +static int cpu_unplug_begin(unsigned int cpu)
17340 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
17341 +       int err;
17343 +       /* Protected by cpu_hotplug.lock */
17344 +       if (!hp->mutex_init) {
17345 +#ifdef CONFIG_PREEMPT_RT_FULL
17346 +               spin_lock_init(&hp->lock);
17347 +#else
17348 +               mutex_init(&hp->mutex);
17349 +#endif
17350 +               hp->mutex_init = 1;
17351 +       }
17353 +       /* Inform the scheduler to migrate tasks off this CPU */
17354 +       tell_sched_cpu_down_begin(cpu);
17356 +       init_completion(&hp->synced);
17357 +       init_completion(&hp->unplug_wait);
17359 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
17360 +       if (IS_ERR(hp->sync_tsk)) {
17361 +               err = PTR_ERR(hp->sync_tsk);
17362 +               hp->sync_tsk = NULL;
17363 +               return err;
17364 +       }
17365 +       kthread_bind(hp->sync_tsk, cpu);
17367 +       /*
17368 +        * Wait for tasks to get out of the pinned sections,
17369 +        * it's still OK if new tasks enter. Some CPU notifiers will
17370 +        * wait for tasks that are going to enter these sections and
17371 +        * we must not have them block.
17372 +        */
17373 +       wake_up_process(hp->sync_tsk);
17374 +       return 0;
17377 +static void cpu_unplug_sync(unsigned int cpu)
17379 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
17381 +       init_completion(&hp->synced);
17382 +       /* The completion needs to be initialized before setting grab_lock */
17383 +       smp_wmb();
17385 +       /* Grab the mutex before setting grab_lock */
17386 +       hotplug_lock(hp);
17387 +       hp->grab_lock = 1;
17389 +       /*
17390 +        * The CPU notifiers have been completed.
17391 +        * Wait for tasks to get out of pinned CPU sections and have new
17392 +        * tasks block until the CPU is completely down.
17393 +        */
17394 +       __cpu_unplug_sync(hp);
17396 +       /* All done with the sync thread */
17397 +       kthread_stop(hp->sync_tsk);
17398 +       hp->sync_tsk = NULL;
17401 +static void cpu_unplug_done(unsigned int cpu)
17403 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
17405 +       hp->unplug = NULL;
17406 +       /* Let all tasks know cpu unplug is finished before cleaning up */
17407 +       smp_wmb();
17409 +       if (hp->sync_tsk)
17410 +               kthread_stop(hp->sync_tsk);
17412 +       if (hp->grab_lock) {
17413 +               hotplug_unlock(hp);
17414 +               /* protected by cpu_hotplug.lock */
17415 +               hp->grab_lock = 0;
17416 +       }
17417 +       tell_sched_cpu_down_done(cpu);
17420  void get_online_cpus(void)
17422 @@ -338,13 +621,15 @@ static int take_cpu_down(void *_param)
17423  /* Requires cpu_add_remove_lock to be held */
17424  static int _cpu_down(unsigned int cpu, int tasks_frozen)
17426 -       int err, nr_calls = 0;
17427 +       int mycpu, err, nr_calls = 0;
17428         void *hcpu = (void *)(long)cpu;
17429         unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
17430         struct take_cpu_down_param tcd_param = {
17431                 .mod = mod,
17432                 .hcpu = hcpu,
17433         };
17434 +       cpumask_var_t cpumask;
17435 +       cpumask_var_t cpumask_org;
17437         if (num_online_cpus() == 1)
17438                 return -EBUSY;
17439 @@ -352,7 +637,34 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17440         if (!cpu_online(cpu))
17441                 return -EINVAL;
17443 +       /* Move the downtaker off the unplug cpu */
17444 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
17445 +               return -ENOMEM;
17446 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
17447 +               free_cpumask_var(cpumask);
17448 +               return -ENOMEM;
17449 +       }
17451 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
17452 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
17453 +       set_cpus_allowed_ptr(current, cpumask);
17454 +       free_cpumask_var(cpumask);
17455 +       migrate_disable();
17456 +       mycpu = smp_processor_id();
17457 +       if (mycpu == cpu) {
17458 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
17459 +               migrate_enable();
17460 +               err = -EBUSY;
17461 +               goto restore_cpus;
17462 +       }
17463 +       migrate_enable();
17465         cpu_hotplug_begin();
17466 +       err = cpu_unplug_begin(cpu);
17467 +       if (err) {
17468 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
17469 +               goto out_cancel;
17470 +       }
17472         err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
17473         if (err) {
17474 @@ -378,8 +690,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17475         else
17476                 synchronize_rcu();
17478 +       __cpu_unplug_wait(cpu);
17479         smpboot_park_threads(cpu);
17481 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
17482 +       cpu_unplug_sync(cpu);
17484         /*
17485          * Prevent irq alloc/free while the dying cpu reorganizes the
17486          * interrupt affinities.
17487 @@ -424,9 +740,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17488         check_for_tasks(cpu);
17490  out_release:
17491 +       cpu_unplug_done(cpu);
17492 +out_cancel:
17493         cpu_hotplug_done();
17494         if (!err)
17495                 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
17496 +restore_cpus:
17497 +       set_cpus_allowed_ptr(current, cpumask_org);
17498 +       free_cpumask_var(cpumask_org);
17499         return err;
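A minimal sketch, not part of the patch: pin_current_cpu()/unpin_current_cpu() above are not meant to be called directly by drivers; in the RT tree they are driven from migrate_disable()/migrate_enable(), which is the interface code uses to keep the current CPU from being unplugged underneath it. The helper name below is illustrative.

#include <linux/preempt.h>	/* migrate_disable()/migrate_enable() in this series */

static void example_per_cpu_work(void)
{
	/* migrate_disable() internally does preempt_disable(); pin_current_cpu(); ... */
	migrate_disable();

	/* per-CPU work that must not race with cpu_down() of this CPU */

	/* migrate_enable() internally does ... unpin_current_cpu(); preempt_enable(); */
	migrate_enable();
}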
17502 diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
17503 index 009cc9a17d95..10f4640f991e 100644
17504 --- a/kernel/cpu_pm.c
17505 +++ b/kernel/cpu_pm.c
17506 @@ -22,14 +22,13 @@
17507  #include <linux/spinlock.h>
17508  #include <linux/syscore_ops.h>
17510 -static DEFINE_RWLOCK(cpu_pm_notifier_lock);
17511 -static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
17512 +static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
17514  static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
17516         int ret;
17518 -       ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
17519 +       ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
17520                 nr_to_call, nr_calls);
17522         return notifier_to_errno(ret);
17523 @@ -47,14 +46,7 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
17524   */
17525  int cpu_pm_register_notifier(struct notifier_block *nb)
17527 -       unsigned long flags;
17528 -       int ret;
17530 -       write_lock_irqsave(&cpu_pm_notifier_lock, flags);
17531 -       ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
17532 -       write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
17534 -       return ret;
17535 +       return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
17537  EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
17539 @@ -69,14 +61,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
17540   */
17541  int cpu_pm_unregister_notifier(struct notifier_block *nb)
17543 -       unsigned long flags;
17544 -       int ret;
17546 -       write_lock_irqsave(&cpu_pm_notifier_lock, flags);
17547 -       ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
17548 -       write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
17550 -       return ret;
17551 +       return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
17553  EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
17555 @@ -100,7 +85,6 @@ int cpu_pm_enter(void)
17556         int nr_calls;
17557         int ret = 0;
17559 -       read_lock(&cpu_pm_notifier_lock);
17560         ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
17561         if (ret)
17562                 /*
17563 @@ -108,7 +92,6 @@ int cpu_pm_enter(void)
17564                  * PM entry who are notified earlier to prepare for it.
17565                  */
17566                 cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
17567 -       read_unlock(&cpu_pm_notifier_lock);
17569         return ret;
17571 @@ -128,13 +111,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
17572   */
17573  int cpu_pm_exit(void)
17575 -       int ret;
17577 -       read_lock(&cpu_pm_notifier_lock);
17578 -       ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
17579 -       read_unlock(&cpu_pm_notifier_lock);
17581 -       return ret;
17582 +       return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
17584  EXPORT_SYMBOL_GPL(cpu_pm_exit);
17586 @@ -159,7 +136,6 @@ int cpu_cluster_pm_enter(void)
17587         int nr_calls;
17588         int ret = 0;
17590 -       read_lock(&cpu_pm_notifier_lock);
17591         ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
17592         if (ret)
17593                 /*
17594 @@ -167,7 +143,6 @@ int cpu_cluster_pm_enter(void)
17595                  * PM entry who are notified earlier to prepare for it.
17596                  */
17597                 cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
17598 -       read_unlock(&cpu_pm_notifier_lock);
17600         return ret;
17602 @@ -190,13 +165,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
17603   */
17604  int cpu_cluster_pm_exit(void)
17606 -       int ret;
17608 -       read_lock(&cpu_pm_notifier_lock);
17609 -       ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
17610 -       read_unlock(&cpu_pm_notifier_lock);
17612 -       return ret;
17613 +       return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
17615  EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
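The cpu_pm conversion above to an atomic notifier chain does not change the registration API; callers still supply an ordinary notifier_block. A minimal sketch with illustrative names (not from the patch):

#include <linux/cpu_pm.h>
#include <linux/notifier.h>

static int example_cpu_pm_cb(struct notifier_block *nb,
			     unsigned long action, void *data)
{
	switch (action) {
	case CPU_PM_ENTER:
		/* save per-CPU hardware state before the low-power entry */
		break;
	case CPU_PM_ENTER_FAILED:
	case CPU_PM_EXIT:
		/* restore per-CPU hardware state */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_pm_nb = {
	.notifier_call = example_cpu_pm_cb,
};

/* e.g. from an initcall: cpu_pm_register_notifier(&example_cpu_pm_nb); */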
17617 diff --git a/kernel/cpuset.c b/kernel/cpuset.c
17618 index dd3ae6ee064d..8dc21a9ac292 100644
17619 --- a/kernel/cpuset.c
17620 +++ b/kernel/cpuset.c
17621 @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
17622   */
17624  static DEFINE_MUTEX(cpuset_mutex);
17625 -static DEFINE_SPINLOCK(callback_lock);
17626 +static DEFINE_RAW_SPINLOCK(callback_lock);
17628  static struct workqueue_struct *cpuset_migrate_mm_wq;
17630 @@ -907,9 +907,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
17631                         continue;
17632                 rcu_read_unlock();
17634 -               spin_lock_irq(&callback_lock);
17635 +               raw_spin_lock_irq(&callback_lock);
17636                 cpumask_copy(cp->effective_cpus, new_cpus);
17637 -               spin_unlock_irq(&callback_lock);
17638 +               raw_spin_unlock_irq(&callback_lock);
17640                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
17641                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
17642 @@ -974,9 +974,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
17643         if (retval < 0)
17644                 return retval;
17646 -       spin_lock_irq(&callback_lock);
17647 +       raw_spin_lock_irq(&callback_lock);
17648         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
17649 -       spin_unlock_irq(&callback_lock);
17650 +       raw_spin_unlock_irq(&callback_lock);
17652         /* use trialcs->cpus_allowed as a temp variable */
17653         update_cpumasks_hier(cs, trialcs->cpus_allowed);
17654 @@ -1185,9 +1185,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
17655                         continue;
17656                 rcu_read_unlock();
17658 -               spin_lock_irq(&callback_lock);
17659 +               raw_spin_lock_irq(&callback_lock);
17660                 cp->effective_mems = *new_mems;
17661 -               spin_unlock_irq(&callback_lock);
17662 +               raw_spin_unlock_irq(&callback_lock);
17664                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
17665                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
17666 @@ -1255,9 +1255,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
17667         if (retval < 0)
17668                 goto done;
17670 -       spin_lock_irq(&callback_lock);
17671 +       raw_spin_lock_irq(&callback_lock);
17672         cs->mems_allowed = trialcs->mems_allowed;
17673 -       spin_unlock_irq(&callback_lock);
17674 +       raw_spin_unlock_irq(&callback_lock);
17676         /* use trialcs->mems_allowed as a temp variable */
17677         update_nodemasks_hier(cs, &trialcs->mems_allowed);
17678 @@ -1348,9 +1348,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
17679         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
17680                         || (is_spread_page(cs) != is_spread_page(trialcs)));
17682 -       spin_lock_irq(&callback_lock);
17683 +       raw_spin_lock_irq(&callback_lock);
17684         cs->flags = trialcs->flags;
17685 -       spin_unlock_irq(&callback_lock);
17686 +       raw_spin_unlock_irq(&callback_lock);
17688         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
17689                 rebuild_sched_domains_locked();
17690 @@ -1762,7 +1762,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
17691         cpuset_filetype_t type = seq_cft(sf)->private;
17692         int ret = 0;
17694 -       spin_lock_irq(&callback_lock);
17695 +       raw_spin_lock_irq(&callback_lock);
17697         switch (type) {
17698         case FILE_CPULIST:
17699 @@ -1781,7 +1781,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
17700                 ret = -EINVAL;
17701         }
17703 -       spin_unlock_irq(&callback_lock);
17704 +       raw_spin_unlock_irq(&callback_lock);
17705         return ret;
17708 @@ -1996,12 +1996,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
17710         cpuset_inc();
17712 -       spin_lock_irq(&callback_lock);
17713 +       raw_spin_lock_irq(&callback_lock);
17714         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
17715                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
17716                 cs->effective_mems = parent->effective_mems;
17717         }
17718 -       spin_unlock_irq(&callback_lock);
17719 +       raw_spin_unlock_irq(&callback_lock);
17721         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
17722                 goto out_unlock;
17723 @@ -2028,12 +2028,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
17724         }
17725         rcu_read_unlock();
17727 -       spin_lock_irq(&callback_lock);
17728 +       raw_spin_lock_irq(&callback_lock);
17729         cs->mems_allowed = parent->mems_allowed;
17730         cs->effective_mems = parent->mems_allowed;
17731         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
17732         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
17733 -       spin_unlock_irq(&callback_lock);
17734 +       raw_spin_unlock_irq(&callback_lock);
17735  out_unlock:
17736         mutex_unlock(&cpuset_mutex);
17737         return 0;
17738 @@ -2072,7 +2072,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
17739  static void cpuset_bind(struct cgroup_subsys_state *root_css)
17741         mutex_lock(&cpuset_mutex);
17742 -       spin_lock_irq(&callback_lock);
17743 +       raw_spin_lock_irq(&callback_lock);
17745         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
17746                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
17747 @@ -2083,7 +2083,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
17748                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
17749         }
17751 -       spin_unlock_irq(&callback_lock);
17752 +       raw_spin_unlock_irq(&callback_lock);
17753         mutex_unlock(&cpuset_mutex);
17756 @@ -2184,12 +2184,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
17758         bool is_empty;
17760 -       spin_lock_irq(&callback_lock);
17761 +       raw_spin_lock_irq(&callback_lock);
17762         cpumask_copy(cs->cpus_allowed, new_cpus);
17763         cpumask_copy(cs->effective_cpus, new_cpus);
17764         cs->mems_allowed = *new_mems;
17765         cs->effective_mems = *new_mems;
17766 -       spin_unlock_irq(&callback_lock);
17767 +       raw_spin_unlock_irq(&callback_lock);
17769         /*
17770          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
17771 @@ -2226,10 +2226,10 @@ hotplug_update_tasks(struct cpuset *cs,
17772         if (nodes_empty(*new_mems))
17773                 *new_mems = parent_cs(cs)->effective_mems;
17775 -       spin_lock_irq(&callback_lock);
17776 +       raw_spin_lock_irq(&callback_lock);
17777         cpumask_copy(cs->effective_cpus, new_cpus);
17778         cs->effective_mems = *new_mems;
17779 -       spin_unlock_irq(&callback_lock);
17780 +       raw_spin_unlock_irq(&callback_lock);
17782         if (cpus_updated)
17783                 update_tasks_cpumask(cs);
17784 @@ -2322,21 +2322,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
17786         /* synchronize cpus_allowed to cpu_active_mask */
17787         if (cpus_updated) {
17788 -               spin_lock_irq(&callback_lock);
17789 +               raw_spin_lock_irq(&callback_lock);
17790                 if (!on_dfl)
17791                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
17792                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
17793 -               spin_unlock_irq(&callback_lock);
17794 +               raw_spin_unlock_irq(&callback_lock);
17795                 /* we don't mess with cpumasks of tasks in top_cpuset */
17796         }
17798         /* synchronize mems_allowed to N_MEMORY */
17799         if (mems_updated) {
17800 -               spin_lock_irq(&callback_lock);
17801 +               raw_spin_lock_irq(&callback_lock);
17802                 if (!on_dfl)
17803                         top_cpuset.mems_allowed = new_mems;
17804                 top_cpuset.effective_mems = new_mems;
17805 -               spin_unlock_irq(&callback_lock);
17806 +               raw_spin_unlock_irq(&callback_lock);
17807                 update_tasks_nodemask(&top_cpuset);
17808         }
17810 @@ -2441,11 +2441,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
17812         unsigned long flags;
17814 -       spin_lock_irqsave(&callback_lock, flags);
17815 +       raw_spin_lock_irqsave(&callback_lock, flags);
17816         rcu_read_lock();
17817         guarantee_online_cpus(task_cs(tsk), pmask);
17818         rcu_read_unlock();
17819 -       spin_unlock_irqrestore(&callback_lock, flags);
17820 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
17823  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
17824 @@ -2493,11 +2493,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
17825         nodemask_t mask;
17826         unsigned long flags;
17828 -       spin_lock_irqsave(&callback_lock, flags);
17829 +       raw_spin_lock_irqsave(&callback_lock, flags);
17830         rcu_read_lock();
17831         guarantee_online_mems(task_cs(tsk), &mask);
17832         rcu_read_unlock();
17833 -       spin_unlock_irqrestore(&callback_lock, flags);
17834 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
17836         return mask;
17838 @@ -2589,14 +2589,14 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
17839                 return 1;
17841         /* Not hardwall and node outside mems_allowed: scan up cpusets */
17842 -       spin_lock_irqsave(&callback_lock, flags);
17843 +       raw_spin_lock_irqsave(&callback_lock, flags);
17845         rcu_read_lock();
17846         cs = nearest_hardwall_ancestor(task_cs(current));
17847         allowed = node_isset(node, cs->mems_allowed);
17848         rcu_read_unlock();
17850 -       spin_unlock_irqrestore(&callback_lock, flags);
17851 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
17852         return allowed;
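Background for the cpuset hunks above, stated as a hedged note rather than patch text: on PREEMPT_RT a spinlock_t becomes a sleeping lock, so callback_lock is converted to raw_spinlock_t, which always spins and can safely be taken from contexts that must not sleep. The lock name below is illustrative.

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);

static void example_critical_section(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* keep this short: a raw spinlock never sleeps, even on RT */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}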
17855 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
17856 index 77777d918676..3203e9dee9f8 100644
17857 --- a/kernel/debug/kdb/kdb_io.c
17858 +++ b/kernel/debug/kdb/kdb_io.c
17859 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17860         int linecount;
17861         int colcount;
17862         int logging, saved_loglevel = 0;
17863 -       int saved_trap_printk;
17864         int got_printf_lock = 0;
17865         int retlen = 0;
17866         int fnd, len;
17867 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17868         unsigned long uninitialized_var(flags);
17870         preempt_disable();
17871 -       saved_trap_printk = kdb_trap_printk;
17872 -       kdb_trap_printk = 0;
17874         /* Serialize kdb_printf if multiple cpus try to write at once.
17875          * But if any cpu goes recursive in kdb, just print the output,
17876 @@ -855,7 +852,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17877         } else {
17878                 __release(kdb_printf_lock);
17879         }
17880 -       kdb_trap_printk = saved_trap_printk;
17881         preempt_enable();
17882         return retlen;
17884 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
17885         va_list ap;
17886         int r;
17888 +       kdb_trap_printk++;
17889         va_start(ap, fmt);
17890         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
17891         va_end(ap);
17892 +       kdb_trap_printk--;
17894         return r;
17896 diff --git a/kernel/events/core.c b/kernel/events/core.c
17897 index 8f75386e61a7..489cf0e1ac9c 100644
17898 --- a/kernel/events/core.c
17899 +++ b/kernel/events/core.c
17900 @@ -802,6 +802,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
17901         raw_spin_lock_init(&cpuctx->hrtimer_lock);
17902         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
17903         timer->function = perf_mux_hrtimer_handler;
17904 +       timer->irqsafe = 1;
17907  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
17908 @@ -7241,6 +7242,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
17910         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17911         hwc->hrtimer.function = perf_swevent_hrtimer;
17912 +       hwc->hrtimer.irqsafe = 1;
17914         /*
17915          * Since hrtimers have a fixed rate, we can do a static freq->period
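The two perf hunks above set the hrtimer "irqsafe" flag added earlier in this series: an irqsafe timer keeps expiring from hard interrupt context on RT instead of being deferred to the softirq-based expiry path. A hedged sketch of the same pattern with illustrative names:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
	/* short, non-sleeping work only: this runs in hard IRQ context */
	return HRTIMER_NORESTART;
}

static void example_init_irqsafe_timer(struct hrtimer *t)
{
	hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	t->function = example_timer_fn;
	t->irqsafe = 1;		/* field added by this RT series, not in mainline 4.4 */
}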
17916 diff --git a/kernel/exit.c b/kernel/exit.c
17917 index ffba5df4abd5..e199407f8831 100644
17918 --- a/kernel/exit.c
17919 +++ b/kernel/exit.c
17920 @@ -144,7 +144,7 @@ static void __exit_signal(struct task_struct *tsk)
17921          * Do this under ->siglock, we can race with another thread
17922          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
17923          */
17924 -       flush_sigqueue(&tsk->pending);
17925 +       flush_task_sigqueue(tsk);
17926         tsk->sighand = NULL;
17927         spin_unlock(&sighand->siglock);
17929 diff --git a/kernel/fork.c b/kernel/fork.c
17930 index ac00f14208b7..9859641947bd 100644
17931 --- a/kernel/fork.c
17932 +++ b/kernel/fork.c
17933 @@ -109,7 +109,7 @@ int max_threads;            /* tunable limit on nr_threads */
17935  DEFINE_PER_CPU(unsigned long, process_counts) = 0;
17937 -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
17938 +DEFINE_RWLOCK(tasklist_lock);  /* outer */
17940  #ifdef CONFIG_PROVE_RCU
17941  int lockdep_tasklist_lock_is_held(void)
17942 @@ -246,7 +246,9 @@ static inline void put_signal_struct(struct signal_struct *sig)
17943         if (atomic_dec_and_test(&sig->sigcnt))
17944                 free_signal_struct(sig);
17947 +#ifdef CONFIG_PREEMPT_RT_BASE
17948 +static
17949 +#endif
17950  void __put_task_struct(struct task_struct *tsk)
17952         WARN_ON(!tsk->exit_state);
17953 @@ -263,7 +265,18 @@ void __put_task_struct(struct task_struct *tsk)
17954         if (!profile_handoff_task(tsk))
17955                 free_task(tsk);
17957 +#ifndef CONFIG_PREEMPT_RT_BASE
17958  EXPORT_SYMBOL_GPL(__put_task_struct);
17959 +#else
17960 +void __put_task_struct_cb(struct rcu_head *rhp)
17962 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
17964 +       __put_task_struct(tsk);
17967 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
17968 +#endif
17970  void __init __weak arch_task_cache_init(void) { }
17972 @@ -388,6 +401,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
17973         tsk->splice_pipe = NULL;
17974         tsk->task_frag.page = NULL;
17975         tsk->wake_q.next = NULL;
17976 +       tsk->wake_q_sleeper.next = NULL;
17978         account_kernel_stack(ti, 1);
17980 @@ -699,6 +713,19 @@ void __mmdrop(struct mm_struct *mm)
17982  EXPORT_SYMBOL_GPL(__mmdrop);
17984 +#ifdef CONFIG_PREEMPT_RT_BASE
17986 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
17987 + * want another facility to make this work.
17988 + */
17989 +void __mmdrop_delayed(struct rcu_head *rhp)
17991 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
17993 +       __mmdrop(mm);
17995 +#endif
17997  /*
17998   * Decrement the use count and release all resources for an mm.
17999   */
18000 @@ -1249,6 +1276,9 @@ static void rt_mutex_init_task(struct task_struct *p)
18001   */
18002  static void posix_cpu_timers_init(struct task_struct *tsk)
18004 +#ifdef CONFIG_PREEMPT_RT_BASE
18005 +       tsk->posix_timer_list = NULL;
18006 +#endif
18007         tsk->cputime_expires.prof_exp = 0;
18008         tsk->cputime_expires.virt_exp = 0;
18009         tsk->cputime_expires.sched_exp = 0;
18010 @@ -1375,15 +1405,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
18011         spin_lock_init(&p->alloc_lock);
18013         init_sigpending(&p->pending);
18014 +       p->sigqueue_cache = NULL;
18016         p->utime = p->stime = p->gtime = 0;
18017         p->utimescaled = p->stimescaled = 0;
18018         prev_cputime_init(&p->prev_cputime);
18020  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
18021 -       seqlock_init(&p->vtime_seqlock);
18022 +       seqcount_init(&p->vtime_seqcount);
18023         p->vtime_snap = 0;
18024 -       p->vtime_snap_whence = VTIME_SLEEPING;
18025 +       p->vtime_snap_whence = VTIME_INACTIVE;
18026  #endif
18028  #if defined(SPLIT_RSS_COUNTING)
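A hedged reconstruction for context (the header side is not shown in this hunk): with CONFIG_PREEMPT_RT_BASE the final put of a task_struct is routed through RCU so the actual free runs where sleeping locks are allowed, which is why __put_task_struct() becomes static here and __put_task_struct_cb() is exported instead. The wrapper name below is illustrative; put_rcu and __put_task_struct_cb() are the names used by this series.

#include <linux/sched.h>
#include <linux/rcupdate.h>

static inline void example_put_task_struct(struct task_struct *t)
{
	if (atomic_dec_and_test(&t->usage)) {
#ifdef CONFIG_PREEMPT_RT_BASE
		call_rcu(&t->put_rcu, __put_task_struct_cb);
#else
		__put_task_struct(t);
#endif
	}
}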
18029 diff --git a/kernel/futex.c b/kernel/futex.c
18030 index fc68462801de..b577ac5dc4a0 100644
18031 --- a/kernel/futex.c
18032 +++ b/kernel/futex.c
18033 @@ -815,7 +815,9 @@ void exit_pi_state_list(struct task_struct *curr)
18034                  * task still owns the PI-state:
18035                  */
18036                 if (head->next != next) {
18037 +                       raw_spin_unlock_irq(&curr->pi_lock);
18038                         spin_unlock(&hb->lock);
18039 +                       raw_spin_lock_irq(&curr->pi_lock);
18040                         continue;
18041                 }
18043 @@ -1210,6 +1212,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
18044         struct futex_pi_state *pi_state = this->pi_state;
18045         u32 uninitialized_var(curval), newval;
18046         WAKE_Q(wake_q);
18047 +       WAKE_Q(wake_sleeper_q);
18048         bool deboost;
18049         int ret = 0;
18051 @@ -1223,7 +1226,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
18052         if (pi_state->owner != current)
18053                 return -EINVAL;
18055 -       raw_spin_lock(&pi_state->pi_mutex.wait_lock);
18056 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
18057         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
18059         /*
18060 @@ -1259,24 +1262,25 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
18061                         ret = -EINVAL;
18062         }
18063         if (ret) {
18064 -               raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
18065 +               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
18066                 return ret;
18067         }
18069 -       raw_spin_lock_irq(&pi_state->owner->pi_lock);
18070 +       raw_spin_lock(&pi_state->owner->pi_lock);
18071         WARN_ON(list_empty(&pi_state->list));
18072         list_del_init(&pi_state->list);
18073 -       raw_spin_unlock_irq(&pi_state->owner->pi_lock);
18074 +       raw_spin_unlock(&pi_state->owner->pi_lock);
18076 -       raw_spin_lock_irq(&new_owner->pi_lock);
18077 +       raw_spin_lock(&new_owner->pi_lock);
18078         WARN_ON(!list_empty(&pi_state->list));
18079         list_add(&pi_state->list, &new_owner->pi_state_list);
18080         pi_state->owner = new_owner;
18081 -       raw_spin_unlock_irq(&new_owner->pi_lock);
18082 +       raw_spin_unlock(&new_owner->pi_lock);
18084 -       raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
18085 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
18087 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
18088 +       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
18089 +                                       &wake_sleeper_q);
18091         /*
18092          * First unlock HB so the waiter does not spin on it once he got woken
18093 @@ -1284,8 +1288,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
18094          * deboost first (and lose our higher priority), then the task might get
18095          * scheduled away before the wake up can take place.
18096          */
18097 -       spin_unlock(&hb->lock);
18098 +       deboost |= spin_unlock_no_deboost(&hb->lock);
18099         wake_up_q(&wake_q);
18100 +       wake_up_q_sleeper(&wake_sleeper_q);
18101         if (deboost)
18102                 rt_mutex_adjust_prio(current);
18104 @@ -1822,6 +1827,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
18105                                 requeue_pi_wake_futex(this, &key2, hb2);
18106                                 drop_count++;
18107                                 continue;
18108 +                       } else if (ret == -EAGAIN) {
18109 +                               /*
18110 +                                * Waiter was woken by timeout or
18111 +                                * signal and has set pi_blocked_on to
18112 +                                * PI_WAKEUP_INPROGRESS before we
18113 +                                * tried to enqueue it on the rtmutex.
18114 +                                */
18115 +                               this->pi_state = NULL;
18116 +                               free_pi_state(pi_state);
18117 +                               continue;
18118                         } else if (ret) {
18119                                 /* -EDEADLK */
18120                                 this->pi_state = NULL;
18121 @@ -2143,11 +2158,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
18122                  * we returned due to timeout or signal without taking the
18123                  * rt_mutex. Too late.
18124                  */
18125 -               raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
18126 +               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
18127                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
18128                 if (!owner)
18129                         owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
18130 -               raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
18131 +               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
18132                 ret = fixup_pi_state_owner(uaddr, q, owner);
18133                 goto out;
18134         }
18135 @@ -2694,7 +2709,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
18137         struct hrtimer_sleeper timeout, *to = NULL;
18138         struct rt_mutex_waiter rt_waiter;
18139 -       struct futex_hash_bucket *hb;
18140 +       struct futex_hash_bucket *hb, *hb2;
18141         union futex_key key2 = FUTEX_KEY_INIT;
18142         struct futex_q q = futex_q_init;
18143         int res, ret;
18144 @@ -2719,10 +2734,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
18145          * The waiter is allocated on our stack, manipulated by the requeue
18146          * code while we sleep on uaddr.
18147          */
18148 -       debug_rt_mutex_init_waiter(&rt_waiter);
18149 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
18150 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
18151 -       rt_waiter.task = NULL;
18152 +       rt_mutex_init_waiter(&rt_waiter, false);
18154         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
18155         if (unlikely(ret != 0))
18156 @@ -2753,20 +2765,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
18157         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
18158         futex_wait_queue_me(hb, &q, to);
18160 -       spin_lock(&hb->lock);
18161 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
18162 -       spin_unlock(&hb->lock);
18163 -       if (ret)
18164 -               goto out_put_keys;
18165 +       /*
18166 +        * On RT we must avoid races with requeue and trying to block
18167 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
18168 +        * serializing access to pi_blocked_on with pi_lock.
18169 +        */
18170 +       raw_spin_lock_irq(&current->pi_lock);
18171 +       if (current->pi_blocked_on) {
18172 +               /*
18173 +                * We have been requeued or are in the process of
18174 +                * being requeued.
18175 +                */
18176 +               raw_spin_unlock_irq(&current->pi_lock);
18177 +       } else {
18178 +               /*
18179 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
18180 +                * prevents a concurrent requeue from moving us to the
18181 +                * uaddr2 rtmutex. After that we can safely acquire
18182 +                * (and possibly block on) hb->lock.
18183 +                */
18184 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
18185 +               raw_spin_unlock_irq(&current->pi_lock);
18187 +               spin_lock(&hb->lock);
18189 +               /*
18190 +                * Clean up pi_blocked_on. We might leak it otherwise
18191 +                * when we succeeded with the hb->lock in the fast
18192 +                * path.
18193 +                */
18194 +               raw_spin_lock_irq(&current->pi_lock);
18195 +               current->pi_blocked_on = NULL;
18196 +               raw_spin_unlock_irq(&current->pi_lock);
18198 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
18199 +               spin_unlock(&hb->lock);
18200 +               if (ret)
18201 +                       goto out_put_keys;
18202 +       }
18204         /*
18205 -        * In order for us to be here, we know our q.key == key2, and since
18206 -        * we took the hb->lock above, we also know that futex_requeue() has
18207 -        * completed and we no longer have to concern ourselves with a wakeup
18208 -        * race with the atomic proxy lock acquisition by the requeue code. The
18209 -        * futex_requeue dropped our key1 reference and incremented our key2
18210 -        * reference count.
18211 +        * In order to be here, we have either been requeued, are in
18212 +        * the process of being requeued, or requeue successfully
18213 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
18214 +        * non-null above, we may be racing with a requeue.  Do not
18215 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
18216 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
18217 +        * reference and incremented our key2 reference count.
18218          */
18219 +       hb2 = hash_futex(&key2);
18221         /* Check if the requeue code acquired the second futex for us. */
18222         if (!q.rt_waiter) {
18223 @@ -2775,7 +2822,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
18224                  * did a lock-steal - fix up the PI-state in that case.
18225                  */
18226                 if (q.pi_state && (q.pi_state->owner != current)) {
18227 -                       spin_lock(q.lock_ptr);
18228 +                       spin_lock(&hb2->lock);
18229 +                       BUG_ON(&hb2->lock != q.lock_ptr);
18230                         ret = fixup_pi_state_owner(uaddr2, &q, current);
18231                         if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
18232                                 rt_mutex_unlock(&q.pi_state->pi_mutex);
18233 @@ -2784,7 +2832,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
18234                          * the requeue_pi() code acquired for us.
18235                          */
18236                         free_pi_state(q.pi_state);
18237 -                       spin_unlock(q.lock_ptr);
18238 +                       spin_unlock(&hb2->lock);
18239                 }
18240         } else {
18241                 struct rt_mutex *pi_mutex;
18242 @@ -2799,7 +2847,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
18243                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
18244                 debug_rt_mutex_free_waiter(&rt_waiter);
18246 -               spin_lock(q.lock_ptr);
18247 +               spin_lock(&hb2->lock);
18248 +               BUG_ON(&hb2->lock != q.lock_ptr);
18249                 /*
18250                  * Fixup the pi_state owner and possibly acquire the lock if we
18251                  * haven't already.
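User-space context for the futex_wait_requeue_pi() changes above, hedged on the glibc condition-variable implementation of that era: waiting on a condvar whose mutex uses priority inheritance is what exercises the FUTEX_WAIT_REQUEUE_PI / FUTEX_CMP_REQUEUE_PI path patched here. Names below are illustrative.

#include <pthread.h>

static pthread_mutex_t example_m;
static pthread_cond_t example_cv = PTHREAD_COND_INITIALIZER;

static void example_pi_wait(void)
{
	pthread_mutexattr_t ma;

	pthread_mutexattr_init(&ma);
	pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&example_m, &ma);

	pthread_mutex_lock(&example_m);
	/* glibc implements this wait with FUTEX_WAIT_REQUEUE_PI on the
	 * condvar futex, requeueing the waiter onto the PI futex of the mutex */
	pthread_cond_wait(&example_cv, &example_m);
	pthread_mutex_unlock(&example_m);
}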
18252 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
18253 index 57bff7857e87..6c65c9252991 100644
18254 --- a/kernel/irq/handle.c
18255 +++ b/kernel/irq/handle.c
18256 @@ -134,6 +134,8 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
18258  irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
18260 +       struct pt_regs *regs = get_irq_regs();
18261 +       u64 ip = regs ? instruction_pointer(regs) : 0;
18262         irqreturn_t retval = IRQ_NONE;
18263         unsigned int flags = 0, irq = desc->irq_data.irq;
18264         struct irqaction *action = desc->action;
18265 @@ -176,7 +178,11 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
18266                 action = action->next;
18267         }
18269 -       add_interrupt_randomness(irq, flags);
18270 +#ifdef CONFIG_PREEMPT_RT_FULL
18271 +       desc->random_ip = ip;
18272 +#else
18273 +       add_interrupt_randomness(irq, flags, ip);
18274 +#endif
18276         if (!noirqdebug)
18277                 note_interrupt(desc, retval);
18278 diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
18279 index 239e2ae2c947..0b73349a42d5 100644
18280 --- a/kernel/irq/irqdesc.c
18281 +++ b/kernel/irq/irqdesc.c
18282 @@ -24,10 +24,27 @@
18283  static struct lock_class_key irq_desc_lock_class;
18285  #if defined(CONFIG_SMP)
18286 +static int __init irq_affinity_setup(char *str)
18288 +       zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
18289 +       cpulist_parse(str, irq_default_affinity);
18290 +       /*
18291 +        * Set at least the boot cpu. We don't want to end up with
18292 +        * bug reports caused by random command line masks
18293 +        */
18294 +       cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
18295 +       return 1;
18297 +__setup("irqaffinity=", irq_affinity_setup);
18299  static void __init init_irq_default_affinity(void)
18301 -       alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
18302 -       cpumask_setall(irq_default_affinity);
18303 +#ifdef CONFIG_CPUMASK_OFFSTACK
18304 +       if (!irq_default_affinity)
18305 +               zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
18306 +#endif
18307 +       if (cpumask_empty(irq_default_affinity))
18308 +               cpumask_setall(irq_default_affinity);
18310  #else
18311  static void __init init_irq_default_affinity(void)
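Usage note, not from the patch: the irqaffinity= early parameter added above sets the default affinity applied to newly requested interrupts, so it is typically combined with CPU isolation, e.g. booting with something like "isolcpus=2,3 irqaffinity=0-1" (illustrative values) to keep device interrupts off the CPUs reserved for RT threads.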
18312 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
18313 index a079ed14f230..2cf55df7ad0e 100644
18314 --- a/kernel/irq/manage.c
18315 +++ b/kernel/irq/manage.c
18316 @@ -22,6 +22,7 @@
18317  #include "internals.h"
18319  #ifdef CONFIG_IRQ_FORCED_THREADING
18320 +# ifndef CONFIG_PREEMPT_RT_BASE
18321  __read_mostly bool force_irqthreads;
18323  static int __init setup_forced_irqthreads(char *arg)
18324 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
18325         return 0;
18327  early_param("threadirqs", setup_forced_irqthreads);
18328 +# endif
18329  #endif
18331  static void __synchronize_hardirq(struct irq_desc *desc)
18332 @@ -181,6 +183,62 @@ static inline void
18333  irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
18334  #endif
18336 +#ifdef CONFIG_PREEMPT_RT_FULL
18337 +static void _irq_affinity_notify(struct irq_affinity_notify *notify);
18338 +static struct task_struct *set_affinity_helper;
18339 +static LIST_HEAD(affinity_list);
18340 +static DEFINE_RAW_SPINLOCK(affinity_list_lock);
18342 +static int set_affinity_thread(void *unused)
18344 +       while (1) {
18345 +               struct irq_affinity_notify *notify;
18346 +               int empty;
18348 +               set_current_state(TASK_INTERRUPTIBLE);
18350 +               raw_spin_lock_irq(&affinity_list_lock);
18351 +               empty = list_empty(&affinity_list);
18352 +               raw_spin_unlock_irq(&affinity_list_lock);
18354 +               if (empty)
18355 +                       schedule();
18356 +               if (kthread_should_stop())
18357 +                       break;
18358 +               set_current_state(TASK_RUNNING);
18359 +try_next:
18360 +               notify = NULL;
18362 +               raw_spin_lock_irq(&affinity_list_lock);
18363 +               if (!list_empty(&affinity_list)) {
18364 +                       notify = list_first_entry(&affinity_list,
18365 +                                       struct irq_affinity_notify, list);
18366 +                       list_del_init(&notify->list);
18367 +               }
18368 +               raw_spin_unlock_irq(&affinity_list_lock);
18370 +               if (!notify)
18371 +                       continue;
18372 +               _irq_affinity_notify(notify);
18373 +               goto try_next;
18374 +       }
18375 +       return 0;
18378 +static void init_helper_thread(void)
18380 +       if (set_affinity_helper)
18381 +               return;
18382 +       set_affinity_helper = kthread_run(set_affinity_thread, NULL,
18383 +                       "affinity-cb");
18384 +       WARN_ON(IS_ERR(set_affinity_helper));
18386 +#else
18388 +static inline void init_helper_thread(void) { }
18390 +#endif
18392  int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
18393                         bool force)
18395 @@ -220,7 +278,17 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
18397         if (desc->affinity_notify) {
18398                 kref_get(&desc->affinity_notify->kref);
18400 +#ifdef CONFIG_PREEMPT_RT_FULL
18401 +               raw_spin_lock(&affinity_list_lock);
18402 +               if (list_empty(&desc->affinity_notify->list))
18403 +                       list_add_tail(&affinity_list,
18404 +                                       &desc->affinity_notify->list);
18405 +               raw_spin_unlock(&affinity_list_lock);
18406 +               wake_up_process(set_affinity_helper);
18407 +#else
18408                 schedule_work(&desc->affinity_notify->work);
18409 +#endif
18410         }
18411         irqd_set(data, IRQD_AFFINITY_SET);
18413 @@ -258,10 +326,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
18415  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
18417 -static void irq_affinity_notify(struct work_struct *work)
18418 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
18420 -       struct irq_affinity_notify *notify =
18421 -               container_of(work, struct irq_affinity_notify, work);
18422         struct irq_desc *desc = irq_to_desc(notify->irq);
18423         cpumask_var_t cpumask;
18424         unsigned long flags;
18425 @@ -283,6 +349,13 @@ static void irq_affinity_notify(struct work_struct *work)
18426         kref_put(&notify->kref, notify->release);
18429 +static void irq_affinity_notify(struct work_struct *work)
18431 +       struct irq_affinity_notify *notify =
18432 +               container_of(work, struct irq_affinity_notify, work);
18433 +       _irq_affinity_notify(notify);
18436  /**
18437   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
18438   *     @irq:           Interrupt for which to enable/disable notification
18439 @@ -312,6 +385,8 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
18440                 notify->irq = irq;
18441                 kref_init(&notify->kref);
18442                 INIT_WORK(&notify->work, irq_affinity_notify);
18443 +               INIT_LIST_HEAD(&notify->list);
18444 +               init_helper_thread();
18445         }
18447         raw_spin_lock_irqsave(&desc->lock, flags);
18448 @@ -865,7 +940,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
18449         local_bh_disable();
18450         ret = action->thread_fn(action->irq, action->dev_id);
18451         irq_finalize_oneshot(desc, action);
18452 -       local_bh_enable();
18453 +       /*
18454 +        * Interrupts which have real time requirements can be set up
18455 +        * to avoid softirq processing in the thread handler. This is
18456 +        * safe as these interrupts do not raise soft interrupts.
18457 +        */
18458 +       if (irq_settings_no_softirq_call(desc))
18459 +               _local_bh_enable();
18460 +       else
18461 +               local_bh_enable();
18462         return ret;
18465 @@ -962,6 +1045,12 @@ static int irq_thread(void *data)
18466                 if (action_ret == IRQ_WAKE_THREAD)
18467                         irq_wake_secondary(desc, action);
18469 +#ifdef CONFIG_PREEMPT_RT_FULL
18470 +               migrate_disable();
18471 +               add_interrupt_randomness(action->irq, 0,
18472 +                                desc->random_ip ^ (unsigned long) action);
18473 +               migrate_enable();
18474 +#endif
18475                 wake_threads_waitq(desc);
18476         }
18478 @@ -1317,6 +1406,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
18479                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
18480                 }
18482 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
18483 +                       irq_settings_set_no_softirq_call(desc);
18485                 /* Set default affinity mask once everything is setup */
18486                 setup_affinity(desc, mask);
18488 @@ -1970,7 +2062,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
18489   *     This call sets the internal irqchip state of an interrupt,
18490   *     depending on the value of @which.
18491   *
18492 - *     This function should be called with preemption disabled if the
18493 + *     This function should be called with migration disabled if the
18494   *     interrupt controller has per-cpu registers.
18495   */
18496  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
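A hedged sketch with illustrative driver names of how the IRQF_NO_SOFTIRQ_CALL flag wired up above would be used: a latency-critical threaded handler that raises no softirqs itself can opt out of the softirq processing normally done when the handler returns.

#include <linux/interrupt.h>

static irqreturn_t example_thread_fn(int irq, void *dev_id)
{
	/* must not raise softirqs: the flag below skips softirq processing */
	return IRQ_HANDLED;
}

static int example_request_irq(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, NULL, example_thread_fn,
				    IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
				    "example-rt-dev", dev);
}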
18497 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
18498 index 320579d89091..2df2d4445b1e 100644
18499 --- a/kernel/irq/settings.h
18500 +++ b/kernel/irq/settings.h
18501 @@ -16,6 +16,7 @@ enum {
18502         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
18503         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
18504         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
18505 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
18506         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
18507  };
18509 @@ -30,6 +31,7 @@ enum {
18510  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
18511  #define IRQ_IS_POLLED          GOT_YOU_MORON
18512  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
18513 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
18514  #undef IRQF_MODIFY_MASK
18515  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
18517 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
18518         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
18521 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
18523 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
18526 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
18528 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
18531  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
18533         return desc->status_use_accessors & _IRQ_PER_CPU;
18534 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
18535 index 32144175458d..ed26f2554972 100644
18536 --- a/kernel/irq/spurious.c
18537 +++ b/kernel/irq/spurious.c
18538 @@ -444,6 +444,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
18540  static int __init irqfixup_setup(char *str)
18542 +#ifdef CONFIG_PREEMPT_RT_BASE
18543 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
18544 +       return 1;
18545 +#endif
18546         irqfixup = 1;
18547         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
18548         printk(KERN_WARNING "This may impact system performance.\n");
18549 @@ -456,6 +460,10 @@ module_param(irqfixup, int, 0644);
18551  static int __init irqpoll_setup(char *str)
18553 +#ifdef CONFIG_PREEMPT_RT_BASE
18554 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
18555 +       return 1;
18556 +#endif
18557         irqfixup = 2;
18558         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
18559                                 "enabled\n");
18560 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
18561 index bcf107ce0854..2899ba0d23d1 100644
18562 --- a/kernel/irq_work.c
18563 +++ b/kernel/irq_work.c
18564 @@ -17,6 +17,7 @@
18565  #include <linux/cpu.h>
18566  #include <linux/notifier.h>
18567  #include <linux/smp.h>
18568 +#include <linux/interrupt.h>
18569  #include <asm/processor.h>
18572 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
18573   */
18574  bool irq_work_queue_on(struct irq_work *work, int cpu)
18576 +       struct llist_head *list;
18578         /* All work should have been flushed before going offline */
18579         WARN_ON_ONCE(cpu_is_offline(cpu));
18581 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
18582         if (!irq_work_claim(work))
18583                 return false;
18585 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
18586 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
18587 +               list = &per_cpu(lazy_list, cpu);
18588 +       else
18589 +               list = &per_cpu(raised_list, cpu);
18591 +       if (llist_add(&work->llnode, list))
18592                 arch_send_call_function_single_ipi(cpu);
18594         return true;
18595 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
18596  /* Enqueue the irq work @work on the current CPU */
18597  bool irq_work_queue(struct irq_work *work)
18599 +       struct llist_head *list;
18600 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
18602         /* Only queue if not already pending */
18603         if (!irq_work_claim(work))
18604                 return false;
18605 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
18606         /* Queue the entry and raise the IPI if needed. */
18607         preempt_disable();
18609 -       /* If the work is "lazy", handle it from next tick if any */
18610 -       if (work->flags & IRQ_WORK_LAZY) {
18611 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
18612 -                   tick_nohz_tick_stopped())
18613 -                       arch_irq_work_raise();
18614 -       } else {
18615 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
18616 +       lazy_work = work->flags & IRQ_WORK_LAZY;
18618 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
18619 +               list = this_cpu_ptr(&lazy_list);
18620 +       else
18621 +               list = this_cpu_ptr(&raised_list);
18623 +       if (llist_add(&work->llnode, list)) {
18624 +               if (!lazy_work || tick_nohz_tick_stopped())
18625                         arch_irq_work_raise();
18626         }
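
Editorial note: on PREEMPT_RT_FULL, work queued via irq_work_queue()/irq_work_queue_on() is now diverted to the per-CPU lazy_list unless the item carries IRQ_WORK_HARD_IRQ, so most callbacks run later in softirq context instead of inside the hard interrupt. A rough sketch of an item that must keep running from the hard-irq path; the names are illustrative and IRQ_WORK_HARD_IRQ is assumed to be the flag bit this series adds to <linux/irq_work.h>:

    #include <linux/irq_work.h>

    static void foo_hardirq_cb(struct irq_work *work)
    {
            /* still invoked from irq_work_run(), i.e. hard-irq context, on RT */
    }

    static struct irq_work foo_work;

    static void foo_init(void)
    {
            init_irq_work(&foo_work, foo_hardirq_cb);
            foo_work.flags = IRQ_WORK_HARD_IRQ;     /* keep it off the lazy list */
    }

    /* safe to call from any context: */
    static void foo_kick(void)
    {
            irq_work_queue(&foo_work);
    }
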
18628 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
18629         raised = this_cpu_ptr(&raised_list);
18630         lazy = this_cpu_ptr(&lazy_list);
18632 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
18633 -               if (llist_empty(lazy))
18634 -                       return false;
18635 +       if (llist_empty(raised) && llist_empty(lazy))
18636 +               return false;
18638         /* All work should have been flushed before going offline */
18639         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
18640 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
18641         struct irq_work *work;
18642         struct llist_node *llnode;
18644 -       BUG_ON(!irqs_disabled());
18645 +       BUG_ON_NONRT(!irqs_disabled());
18647         if (llist_empty(list))
18648                 return;
18649 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
18650  void irq_work_run(void)
18652         irq_work_run_list(this_cpu_ptr(&raised_list));
18653 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
18654 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
18655 +               /*
18656 +                * NOTE: we raise softirq via IPI for safety,
18657 +                * and execute in irq_work_tick() to move the
18658 +                * overhead from hard to soft irq context.
18659 +                */
18660 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
18661 +                       raise_softirq(TIMER_SOFTIRQ);
18662 +       } else
18663 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
18665  EXPORT_SYMBOL_GPL(irq_work_run);
18667 @@ -179,8 +200,17 @@ void irq_work_tick(void)
18669         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
18670                 irq_work_run_list(raised);
18672 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
18673 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
18676 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
18677 +void irq_work_tick_soft(void)
18679         irq_work_run_list(this_cpu_ptr(&lazy_list));
18681 +#endif
18683  /*
18684   * Synchronize against the irq_work @entry, ensures the entry is not
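
Editorial note: irq_work_run() no longer drains the lazy list itself on RT; it only raises TIMER_SOFTIRQ, and the new irq_work_tick_soft() helper is meant to be called from the timer softirq so the draining happens in preemptible softirq context. The call site is not part of this hunk; a sketch of the expected hook (an assumption about kernel/time/timer.c, not a quote of the actual change):

    /* kernel/time/timer.c, RT side -- sketch only */
    static void run_timer_softirq(struct softirq_action *h)
    {
            irq_work_tick_soft();   /* drain the per-CPU lazy irq_work list */

            /* ... existing expired-timer processing ... */
    }
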
18685 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
18686 index e83b26464061..c0e08d1cf33e 100644
18687 --- a/kernel/ksysfs.c
18688 +++ b/kernel/ksysfs.c
18689 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
18691  #endif /* CONFIG_KEXEC_CORE */
18693 +#if defined(CONFIG_PREEMPT_RT_FULL)
18694 +static ssize_t  realtime_show(struct kobject *kobj,
18695 +                             struct kobj_attribute *attr, char *buf)
18697 +       return sprintf(buf, "%d\n", 1);
18699 +KERNEL_ATTR_RO(realtime);
18700 +#endif
18702  /* whether file capabilities are enabled */
18703  static ssize_t fscaps_show(struct kobject *kobj,
18704                                   struct kobj_attribute *attr, char *buf)
18705 @@ -203,6 +212,9 @@ static struct attribute * kernel_attrs[] = {
18706         &vmcoreinfo_attr.attr,
18707  #endif
18708         &rcu_expedited_attr.attr,
18709 +#ifdef CONFIG_PREEMPT_RT_FULL
18710 +       &realtime_attr.attr,
18711 +#endif
18712         NULL
18713  };
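
Editorial note: the ksysfs hunk exports a read-only /sys/kernel/realtime attribute that reads "1" on a PREEMPT_RT_FULL kernel; the file is absent on other configurations, so its mere presence is already a usable probe. A small userspace check, assuming nothing beyond the sysfs file itself:

    #include <stdio.h>

    /* Returns 1 on a PREEMPT_RT_FULL kernel, 0 otherwise. */
    static int kernel_is_rt(void)
    {
            FILE *f = fopen("/sys/kernel/realtime", "r");
            int rt = 0;

            if (f) {
                    if (fscanf(f, "%d", &rt) != 1)
                            rt = 0;
                    fclose(f);
            }
            return rt == 1;
    }
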
18715 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
18716 index 8e96f6cc2a4a..447b03082d88 100644
18717 --- a/kernel/locking/Makefile
18718 +++ b/kernel/locking/Makefile
18719 @@ -1,5 +1,5 @@
18721 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
18722 +obj-y += semaphore.o percpu-rwsem.o
18724  ifdef CONFIG_FUNCTION_TRACER
18725  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
18726 @@ -8,7 +8,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
18727  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
18728  endif
18730 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
18731 +obj-y += mutex.o
18732  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
18733 +obj-y += rwsem.o
18734 +endif
18735  obj-$(CONFIG_LOCKDEP) += lockdep.o
18736  ifeq ($(CONFIG_PROC_FS),y)
18737  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
18738 @@ -22,7 +26,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
18739  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
18740  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
18741  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
18742 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
18743  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
18744  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
18745 +endif
18746 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
18747  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
18748  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
18749 diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
18750 index 951cfcd10b4a..57e0ea72c28a 100644
18751 --- a/kernel/locking/lglock.c
18752 +++ b/kernel/locking/lglock.c
18753 @@ -4,6 +4,15 @@
18754  #include <linux/cpu.h>
18755  #include <linux/string.h>
18757 +#ifndef CONFIG_PREEMPT_RT_FULL
18758 +# define lg_lock_ptr           arch_spinlock_t
18759 +# define lg_do_lock(l)         arch_spin_lock(l)
18760 +# define lg_do_unlock(l)       arch_spin_unlock(l)
18761 +#else
18762 +# define lg_lock_ptr           struct rt_mutex
18763 +# define lg_do_lock(l)         __rt_spin_lock__no_mg(l)
18764 +# define lg_do_unlock(l)       __rt_spin_unlock(l)
18765 +#endif
18766  /*
18767   * Note there is no uninit, so lglocks cannot be defined in
18768   * modules (but it's fine to use them from there)
18769 @@ -12,51 +21,60 @@
18771  void lg_lock_init(struct lglock *lg, char *name)
18773 +#ifdef CONFIG_PREEMPT_RT_FULL
18774 +       int i;
18776 +       for_each_possible_cpu(i) {
18777 +               struct rt_mutex *lock = per_cpu_ptr(lg->lock, i);
18779 +               rt_mutex_init(lock);
18780 +       }
18781 +#endif
18782         LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
18784  EXPORT_SYMBOL(lg_lock_init);
18786  void lg_local_lock(struct lglock *lg)
18788 -       arch_spinlock_t *lock;
18789 +       lg_lock_ptr *lock;
18791 -       preempt_disable();
18792 +       migrate_disable();
18793         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18794         lock = this_cpu_ptr(lg->lock);
18795 -       arch_spin_lock(lock);
18796 +       lg_do_lock(lock);
18798  EXPORT_SYMBOL(lg_local_lock);
18800  void lg_local_unlock(struct lglock *lg)
18802 -       arch_spinlock_t *lock;
18803 +       lg_lock_ptr *lock;
18805         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18806         lock = this_cpu_ptr(lg->lock);
18807 -       arch_spin_unlock(lock);
18808 -       preempt_enable();
18809 +       lg_do_unlock(lock);
18810 +       migrate_enable();
18812  EXPORT_SYMBOL(lg_local_unlock);
18814  void lg_local_lock_cpu(struct lglock *lg, int cpu)
18816 -       arch_spinlock_t *lock;
18817 +       lg_lock_ptr *lock;
18819 -       preempt_disable();
18820 +       preempt_disable_nort();
18821         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18822         lock = per_cpu_ptr(lg->lock, cpu);
18823 -       arch_spin_lock(lock);
18824 +       lg_do_lock(lock);
18826  EXPORT_SYMBOL(lg_local_lock_cpu);
18828  void lg_local_unlock_cpu(struct lglock *lg, int cpu)
18830 -       arch_spinlock_t *lock;
18831 +       lg_lock_ptr *lock;
18833         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18834         lock = per_cpu_ptr(lg->lock, cpu);
18835 -       arch_spin_unlock(lock);
18836 -       preempt_enable();
18837 +       lg_do_unlock(lock);
18838 +       preempt_enable_nort();
18840  EXPORT_SYMBOL(lg_local_unlock_cpu);
18842 @@ -68,30 +86,30 @@ void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
18843         if (cpu2 < cpu1)
18844                 swap(cpu1, cpu2);
18846 -       preempt_disable();
18847 +       preempt_disable_nort();
18848         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18849 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
18850 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
18851 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu1));
18852 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu2));
18855  void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
18857         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18858 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
18859 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
18860 -       preempt_enable();
18861 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu1));
18862 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu2));
18863 +       preempt_enable_nort();
18866  void lg_global_lock(struct lglock *lg)
18868         int i;
18870 -       preempt_disable();
18871 +       preempt_disable_nort();
18872         lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18873         for_each_possible_cpu(i) {
18874 -               arch_spinlock_t *lock;
18875 +               lg_lock_ptr *lock;
18876                 lock = per_cpu_ptr(lg->lock, i);
18877 -               arch_spin_lock(lock);
18878 +               lg_do_lock(lock);
18879         }
18881  EXPORT_SYMBOL(lg_global_lock);
18882 @@ -102,10 +120,35 @@ void lg_global_unlock(struct lglock *lg)
18884         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18885         for_each_possible_cpu(i) {
18886 -               arch_spinlock_t *lock;
18887 +               lg_lock_ptr *lock;
18888                 lock = per_cpu_ptr(lg->lock, i);
18889 -               arch_spin_unlock(lock);
18890 +               lg_do_unlock(lock);
18891         }
18892 -       preempt_enable();
18893 +       preempt_enable_nort();
18895  EXPORT_SYMBOL(lg_global_unlock);
18897 +#ifdef CONFIG_PREEMPT_RT_FULL
18899 + * HACK: If you use this, you get to keep the pieces.
18900 + * Used in queue_stop_cpus_work() when stop machinery
18901 + * is called from an inactive CPU, so we can't schedule.
18902 + */
18903 +# define lg_do_trylock_relax(l)                        \
18904 +       do {                                    \
18905 +               while (!__rt_spin_trylock(l))   \
18906 +                       cpu_relax();            \
18907 +       } while (0)
18909 +void lg_global_trylock_relax(struct lglock *lg)
18911 +       int i;
18913 +       lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18914 +       for_each_possible_cpu(i) {
18915 +               lg_lock_ptr *lock;
18916 +               lock = per_cpu_ptr(lg->lock, i);
18917 +               lg_do_trylock_relax(lock);
18918 +       }
18920 +#endif
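
Editorial note: with this conversion each per-CPU lglock slot is an rt_mutex on PREEMPT_RT_FULL, so the critical sections become preemptible and only task migration is pinned; on !RT builds the arch_spinlock_t fast path is unchanged. The preempt_disable_nort()/preempt_enable_nort() helpers used above are defined elsewhere in this patch; their intent is roughly the following (an approximation, not the literal definitions):

    /* include/linux/preempt.h, RT series -- approximate */
    #ifdef CONFIG_PREEMPT_RT_BASE
    # define preempt_disable_nort()         barrier()       /* no-op on RT */
    # define preempt_enable_nort()          barrier()
    #else
    # define preempt_disable_nort()         preempt_disable()
    # define preempt_enable_nort()          preempt_enable()
    #endif
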
18921 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
18922 index 0e2c4911ba61..aed13ffa9fd3 100644
18923 --- a/kernel/locking/lockdep.c
18924 +++ b/kernel/locking/lockdep.c
18925 @@ -668,6 +668,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
18926         struct lockdep_subclass_key *key;
18927         struct list_head *hash_head;
18928         struct lock_class *class;
18929 +       bool is_static = false;
18931  #ifdef CONFIG_DEBUG_LOCKDEP
18932         /*
18933 @@ -695,10 +696,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
18935         /*
18936          * Static locks do not have their class-keys yet - for them the key
18937 -        * is the lock object itself:
18938 +        * is the lock object itself. If the lock is in the per cpu area,
18939 +        * the canonical address of the lock (per cpu offset removed) is
18940 +        * used.
18941          */
18942 -       if (unlikely(!lock->key))
18943 -               lock->key = (void *)lock;
18944 +       if (unlikely(!lock->key)) {
18945 +               unsigned long can_addr, addr = (unsigned long)lock;
18947 +               if (__is_kernel_percpu_address(addr, &can_addr))
18948 +                       lock->key = (void *)can_addr;
18949 +               else if (__is_module_percpu_address(addr, &can_addr))
18950 +                       lock->key = (void *)can_addr;
18951 +               else if (static_obj(lock))
18952 +                       lock->key = (void *)lock;
18953 +               else
18954 +                       return ERR_PTR(-EINVAL);
18955 +               is_static = true;
18956 +       }
18958         /*
18959          * NOTE: the class-key must be unique. For dynamic locks, a static
18960 @@ -730,7 +744,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
18961                 }
18962         }
18964 -       return NULL;
18965 +       return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
18968  /*
18969 @@ -748,19 +762,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
18970         DEBUG_LOCKS_WARN_ON(!irqs_disabled());
18972         class = look_up_lock_class(lock, subclass);
18973 -       if (likely(class))
18974 +       if (likely(!IS_ERR_OR_NULL(class)))
18975                 goto out_set_class_cache;
18977         /*
18978          * Debug-check: all keys must be persistent!
18979 -        */
18980 -       if (!static_obj(lock->key)) {
18981 +        */
18982 +       if (IS_ERR(class)) {
18983                 debug_locks_off();
18984                 printk("INFO: trying to register non-static key.\n");
18985                 printk("the code is fine but needs lockdep annotation.\n");
18986                 printk("turning off the locking correctness validator.\n");
18987                 dump_stack();
18989                 return NULL;
18990         }
18992 @@ -3285,7 +3298,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
18993                  * Clearly if the lock hasn't been acquired _ever_, we're not
18994                  * holding it either, so report failure.
18995                  */
18996 -               if (!class)
18997 +               if (IS_ERR_OR_NULL(class))
18998                         return 0;
19000                 /*
19001 @@ -3532,6 +3545,7 @@ static void check_flags(unsigned long flags)
19002                 }
19003         }
19005 +#ifndef CONFIG_PREEMPT_RT_FULL
19006         /*
19007          * We dont accurately track softirq state in e.g.
19008          * hardirq contexts (such as on 4KSTACKS), so only
19009 @@ -3546,6 +3560,7 @@ static void check_flags(unsigned long flags)
19010                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
19011                 }
19012         }
19013 +#endif
19015         if (!debug_locks)
19016                 print_irqtrace_events(current);
19017 @@ -3984,7 +3999,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
19018                  * If the class exists we look it up and zap it:
19019                  */
19020                 class = look_up_lock_class(lock, j);
19021 -               if (class)
19022 +               if (!IS_ERR_OR_NULL(class))
19023                         zap_class(class);
19024         }
19025         /*
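
Editorial note: the lockdep change matters for statically allocated per-CPU locks. Without an explicit key, lockdep uses the lock's own address as the class key, and a per-CPU lock has a different address on every CPU, so each instance would either fail the static_obj() check or get its own lock class. Stripping the per-CPU offset gives all instances one canonical key. An illustrative case, with a hypothetical per-CPU lock:

    /* hypothetical statically allocated per-CPU lock */
    static DEFINE_PER_CPU(spinlock_t, foo_lock);

    /*
     * per_cpu_ptr(&foo_lock, 0) and per_cpu_ptr(&foo_lock, 1) are distinct
     * addresses, yet they are the same lock as far as deadlock analysis is
     * concerned.  With the hunk above, look_up_lock_class() keys both off
     * the canonical (offset-removed) address reported by
     * __is_kernel_percpu_address() / __is_module_percpu_address().
     */
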
19026 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
19027 index d580b7d6ee6d..4680c502b65b 100644
19028 --- a/kernel/locking/locktorture.c
19029 +++ b/kernel/locking/locktorture.c
19030 @@ -26,7 +26,6 @@
19031  #include <linux/kthread.h>
19032  #include <linux/sched/rt.h>
19033  #include <linux/spinlock.h>
19034 -#include <linux/rwlock.h>
19035  #include <linux/mutex.h>
19036  #include <linux/rwsem.h>
19037  #include <linux/smp.h>
19038 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
19039 new file mode 100644
19040 index 000000000000..d4ab61c1848b
19041 --- /dev/null
19042 +++ b/kernel/locking/rt.c
19043 @@ -0,0 +1,474 @@
19045 + * kernel/rt.c
19046 + *
19047 + * Real-Time Preemption Support
19048 + *
19049 + * started by Ingo Molnar:
19050 + *
19051 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
19052 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
19053 + *
19054 + * historic credit for proving that Linux spinlocks can be implemented via
19055 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
19056 + * and others) who prototyped it on 2.4 and did lots of comparative
19057 + * research and analysis; TimeSys, for proving that you can implement a
19058 + * fully preemptible kernel via the use of IRQ threading and mutexes;
19059 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
19060 + * right one; and to MontaVista, who ported pmutexes to 2.6.
19061 + *
19062 + * This code is a from-scratch implementation and is not based on pmutexes,
19063 + * but the idea of converting spinlocks to mutexes is used here too.
19064 + *
19065 + * lock debugging, locking tree, deadlock detection:
19066 + *
19067 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
19068 + *  Released under the General Public License (GPL).
19069 + *
19070 + * Includes portions of the generic R/W semaphore implementation from:
19071 + *
19072 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
19073 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
19074 + *  - Derived also from comments by Linus
19075 + *
19076 + * Pending ownership of locks and ownership stealing:
19077 + *
19078 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
19079 + *
19080 + *   (also by Steven Rostedt)
19081 + *    - Converted single pi_lock to individual task locks.
19082 + *
19083 + * By Esben Nielsen:
19084 + *    Doing priority inheritance with help of the scheduler.
19085 + *
19086 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
19087 + *  - major rework based on Esben Nielsen's initial patch
19088 + *  - replaced thread_info references by task_struct refs
19089 + *  - removed task->pending_owner dependency
19090 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
19091 + *    in the scheduler return path as discussed with Steven Rostedt
19092 + *
19093 + *  Copyright (C) 2006, Kihon Technologies Inc.
19094 + *    Steven Rostedt <rostedt@goodmis.org>
19095 + *  - debugged and patched Thomas Gleixner's rework.
19096 + *  - added back the cmpxchg to the rework.
19097 + *  - turned atomic require back on for SMP.
19098 + */
19100 +#include <linux/spinlock.h>
19101 +#include <linux/rtmutex.h>
19102 +#include <linux/sched.h>
19103 +#include <linux/delay.h>
19104 +#include <linux/module.h>
19105 +#include <linux/kallsyms.h>
19106 +#include <linux/syscalls.h>
19107 +#include <linux/interrupt.h>
19108 +#include <linux/plist.h>
19109 +#include <linux/fs.h>
19110 +#include <linux/futex.h>
19111 +#include <linux/hrtimer.h>
19113 +#include "rtmutex_common.h"
19116 + * struct mutex functions
19117 + */
19118 +void __mutex_do_init(struct mutex *mutex, const char *name,
19119 +                    struct lock_class_key *key)
19121 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19122 +       /*
19123 +        * Make sure we are not reinitializing a held lock:
19124 +        */
19125 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
19126 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
19127 +#endif
19128 +       mutex->lock.save_state = 0;
19130 +EXPORT_SYMBOL(__mutex_do_init);
19132 +void __lockfunc _mutex_lock(struct mutex *lock)
19134 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19135 +       rt_mutex_lock(&lock->lock);
19137 +EXPORT_SYMBOL(_mutex_lock);
19139 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
19141 +       int ret;
19143 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19144 +       ret = rt_mutex_lock_interruptible(&lock->lock);
19145 +       if (ret)
19146 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
19147 +       return ret;
19149 +EXPORT_SYMBOL(_mutex_lock_interruptible);
19151 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
19153 +       int ret;
19155 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19156 +       ret = rt_mutex_lock_killable(&lock->lock);
19157 +       if (ret)
19158 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
19159 +       return ret;
19161 +EXPORT_SYMBOL(_mutex_lock_killable);
19163 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19164 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
19166 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
19167 +       rt_mutex_lock(&lock->lock);
19169 +EXPORT_SYMBOL(_mutex_lock_nested);
19171 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
19173 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
19174 +       rt_mutex_lock(&lock->lock);
19176 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
19178 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
19180 +       int ret;
19182 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
19183 +       ret = rt_mutex_lock_interruptible(&lock->lock);
19184 +       if (ret)
19185 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
19186 +       return ret;
19188 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
19190 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
19192 +       int ret;
19194 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
19195 +       ret = rt_mutex_lock_killable(&lock->lock);
19196 +       if (ret)
19197 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
19198 +       return ret;
19200 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
19201 +#endif
19203 +int __lockfunc _mutex_trylock(struct mutex *lock)
19205 +       int ret = rt_mutex_trylock(&lock->lock);
19207 +       if (ret)
19208 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19210 +       return ret;
19212 +EXPORT_SYMBOL(_mutex_trylock);
19214 +void __lockfunc _mutex_unlock(struct mutex *lock)
19216 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
19217 +       rt_mutex_unlock(&lock->lock);
19219 +EXPORT_SYMBOL(_mutex_unlock);
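
Editorial note: the wrappers above implement struct mutex on top of rt_mutex, so on PREEMPT_RT_FULL every kernel mutex gains priority inheritance while the caller-visible API stays the same; existing users need no changes, e.g. (hypothetical lock):

    static DEFINE_MUTEX(foo_mutex);

    static void foo_update(void)
    {
            mutex_lock(&foo_mutex);         /* ends up in rt_mutex_lock() on RT */
            /* ... critical section, may sleep ... */
            mutex_unlock(&foo_mutex);
    }
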
19222 + * rwlock_t functions
19223 + */
19224 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
19226 +       int ret;
19228 +       migrate_disable();
19229 +       ret = rt_mutex_trylock(&rwlock->lock);
19230 +       if (ret)
19231 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
19232 +       else
19233 +               migrate_enable();
19235 +       return ret;
19237 +EXPORT_SYMBOL(rt_write_trylock);
19239 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
19241 +       int ret;
19243 +       *flags = 0;
19244 +       ret = rt_write_trylock(rwlock);
19245 +       return ret;
19247 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
19249 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
19251 +       struct rt_mutex *lock = &rwlock->lock;
19252 +       int ret = 1;
19254 +       /*
19255 +        * recursive read locks succeed when current owns the lock,
19256 +        * but not when read_depth == 0 which means that the lock is
19257 +        * write locked.
19258 +        */
19259 +       if (rt_mutex_owner(lock) != current) {
19260 +               migrate_disable();
19261 +               ret = rt_mutex_trylock(lock);
19262 +               if (ret)
19263 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
19264 +               else
19265 +                       migrate_enable();
19267 +       } else if (!rwlock->read_depth) {
19268 +               ret = 0;
19269 +       }
19271 +       if (ret)
19272 +               rwlock->read_depth++;
19274 +       return ret;
19276 +EXPORT_SYMBOL(rt_read_trylock);
19278 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
19280 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19281 +       __rt_spin_lock(&rwlock->lock);
19283 +EXPORT_SYMBOL(rt_write_lock);
19285 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
19287 +       struct rt_mutex *lock = &rwlock->lock;
19290 +       /*
19291 +        * recursive read locks succeed when current owns the lock
19292 +        */
19293 +       if (rt_mutex_owner(lock) != current) {
19294 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19295 +               __rt_spin_lock(lock);
19296 +       }
19297 +       rwlock->read_depth++;
19300 +EXPORT_SYMBOL(rt_read_lock);
19302 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
19304 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19305 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19306 +       __rt_spin_unlock(&rwlock->lock);
19307 +       migrate_enable();
19309 +EXPORT_SYMBOL(rt_write_unlock);
19311 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
19313 +       /* Release the lock only when read_depth is down to 0 */
19314 +       if (--rwlock->read_depth == 0) {
19315 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19316 +               __rt_spin_unlock(&rwlock->lock);
19317 +               migrate_enable();
19318 +       }
19320 +EXPORT_SYMBOL(rt_read_unlock);
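
Editorial note: rwlock_t on RT is backed by a single rt_mutex plus a per-lock read_depth counter, so readers are serialized rather than concurrent; the counter only exists so the task that already holds the lock can take the read side again recursively. A small sketch with a hypothetical lock:

    static DEFINE_RWLOCK(foo_rwlock);

    static void foo_inner(void)
    {
            read_lock(&foo_rwlock);         /* same owner: just read_depth++ */
            /* ... */
            read_unlock(&foo_rwlock);       /* read_depth--, lock still held */
    }

    static void foo_outer(void)
    {
            read_lock(&foo_rwlock);         /* takes the underlying rt_mutex */
            foo_inner();
            read_unlock(&foo_rwlock);       /* depth reaches 0: rt_mutex released */
    }
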
19322 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
19324 +       rt_write_lock(rwlock);
19326 +       return 0;
19328 +EXPORT_SYMBOL(rt_write_lock_irqsave);
19330 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
19332 +       rt_read_lock(rwlock);
19334 +       return 0;
19336 +EXPORT_SYMBOL(rt_read_lock_irqsave);
19338 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
19340 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19341 +       /*
19342 +        * Make sure we are not reinitializing a held lock:
19343 +        */
19344 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
19345 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
19346 +#endif
19347 +       rwlock->lock.save_state = 1;
19348 +       rwlock->read_depth = 0;
19350 +EXPORT_SYMBOL(__rt_rwlock_init);
19353 + * rw_semaphores
19354 + */
19356 +void  rt_up_write(struct rw_semaphore *rwsem)
19358 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
19359 +       rt_mutex_unlock(&rwsem->lock);
19361 +EXPORT_SYMBOL(rt_up_write);
19363 +void __rt_up_read(struct rw_semaphore *rwsem)
19365 +       if (--rwsem->read_depth == 0)
19366 +               rt_mutex_unlock(&rwsem->lock);
19369 +void  rt_up_read(struct rw_semaphore *rwsem)
19371 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
19372 +       __rt_up_read(rwsem);
19374 +EXPORT_SYMBOL(rt_up_read);
19377 + * downgrade a write lock into a read lock
19378 + * - just wake up any readers at the front of the queue
19379 + */
19380 +void  rt_downgrade_write(struct rw_semaphore *rwsem)
19382 +       BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
19383 +       rwsem->read_depth = 1;
19385 +EXPORT_SYMBOL(rt_downgrade_write);
19387 +int  rt_down_write_trylock(struct rw_semaphore *rwsem)
19389 +       int ret = rt_mutex_trylock(&rwsem->lock);
19391 +       if (ret)
19392 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
19393 +       return ret;
19395 +EXPORT_SYMBOL(rt_down_write_trylock);
19397 +void  rt_down_write(struct rw_semaphore *rwsem)
19399 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
19400 +       rt_mutex_lock(&rwsem->lock);
19402 +EXPORT_SYMBOL(rt_down_write);
19404 +void  rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
19406 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
19407 +       rt_mutex_lock(&rwsem->lock);
19409 +EXPORT_SYMBOL(rt_down_write_nested);
19411 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
19412 +                              struct lockdep_map *nest)
19414 +       rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
19415 +       rt_mutex_lock(&rwsem->lock);
19417 +EXPORT_SYMBOL(rt_down_write_nested_lock);
19419 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
19421 +       struct rt_mutex *lock = &rwsem->lock;
19422 +       int ret = 1;
19424 +       /*
19425 +        * recursive read locks succeed when current owns the rwsem,
19426 +        * but not when read_depth == 0 which means that the rwsem is
19427 +        * write locked.
19428 +        */
19429 +       if (rt_mutex_owner(lock) != current)
19430 +               ret = rt_mutex_trylock(&rwsem->lock);
19431 +       else if (!rwsem->read_depth)
19432 +               ret = 0;
19434 +       if (ret)
19435 +               rwsem->read_depth++;
19436 +       return ret;
19440 +int  rt_down_read_trylock(struct rw_semaphore *rwsem)
19442 +       int ret;
19444 +       ret = rt__down_read_trylock(rwsem);
19445 +       if (ret)
19446 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
19448 +       return ret;
19450 +EXPORT_SYMBOL(rt_down_read_trylock);
19452 +void rt__down_read(struct rw_semaphore *rwsem)
19454 +       struct rt_mutex *lock = &rwsem->lock;
19456 +       if (rt_mutex_owner(lock) != current)
19457 +               rt_mutex_lock(&rwsem->lock);
19458 +       rwsem->read_depth++;
19460 +EXPORT_SYMBOL(rt__down_read);
19462 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
19464 +       rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
19465 +       rt__down_read(rwsem);
19468 +void  rt_down_read(struct rw_semaphore *rwsem)
19470 +       __rt_down_read(rwsem, 0);
19472 +EXPORT_SYMBOL(rt_down_read);
19474 +void  rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
19476 +       __rt_down_read(rwsem, subclass);
19478 +EXPORT_SYMBOL(rt_down_read_nested);
19480 +void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
19481 +                             struct lock_class_key *key)
19483 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19484 +       /*
19485 +        * Make sure we are not reinitializing a held lock:
19486 +        */
19487 +       debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
19488 +       lockdep_init_map(&rwsem->dep_map, name, key, 0);
19489 +#endif
19490 +       rwsem->read_depth = 0;
19491 +       rwsem->lock.save_state = 0;
19493 +EXPORT_SYMBOL(__rt_rwsem_init);
19495 +/**
19496 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
19497 + * @cnt: the atomic which we are to dec
19498 + * @lock: the mutex to return holding if we dec to 0
19499 + *
19500 + * return true and hold lock if we dec to 0, return false otherwise
19501 + */
19502 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
19504 +       /* dec if we can't possibly hit 0 */
19505 +       if (atomic_add_unless(cnt, -1, 1))
19506 +               return 0;
19507 +       /* we might hit 0, so take the lock */
19508 +       mutex_lock(lock);
19509 +       if (!atomic_dec_and_test(cnt)) {
19510 +               /* when we actually did the dec, we didn't hit 0 */
19511 +               mutex_unlock(lock);
19512 +               return 0;
19513 +       }
19514 +       /* we hit 0, and we hold the lock */
19515 +       return 1;
19517 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
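
Editorial note: atomic_dec_and_mutex_lock() is the mutex counterpart of atomic_dec_and_lock(): the mutex is only taken when the decrement may actually reach zero, which is the usual shape of reference-count teardown. A minimal usage sketch with a hypothetical refcounted object kept on a list:

    #include <linux/mutex.h>
    #include <linux/list.h>
    #include <linux/slab.h>

    struct foo {
            atomic_t refcnt;
            struct list_head node;
    };

    static DEFINE_MUTEX(foo_list_lock);

    static void foo_put(struct foo *f)
    {
            /* returns holding foo_list_lock only if refcnt dropped to zero */
            if (atomic_dec_and_mutex_lock(&f->refcnt, &foo_list_lock)) {
                    list_del(&f->node);
                    mutex_unlock(&foo_list_lock);
                    kfree(f);
            }
    }
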
19518 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
19519 index b066724d7a5b..bb42267257ad 100644
19520 --- a/kernel/locking/rtmutex.c
19521 +++ b/kernel/locking/rtmutex.c
19522 @@ -7,6 +7,11 @@
19523   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
19524   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
19525   *  Copyright (C) 2006 Esben Nielsen
19526 + *  Adaptive Spinlocks:
19527 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
19528 + *                                  and Peter Morreale,
19529 + * Adaptive Spinlocks simplification:
19530 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
19531   *
19532   *  See Documentation/locking/rt-mutex-design.txt for details.
19533   */
19534 @@ -16,6 +21,8 @@
19535  #include <linux/sched/rt.h>
19536  #include <linux/sched/deadline.h>
19537  #include <linux/timer.h>
19538 +#include <linux/ww_mutex.h>
19539 +#include <linux/blkdev.h>
19541  #include "rtmutex_common.h"
19543 @@ -133,6 +140,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
19544                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
19547 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
19549 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
19550 +               waiter != PI_REQUEUE_INPROGRESS;
19553  /*
19554   * We can speed up the acquire/release, if there's no debugging state to be
19555   * set up.
19556 @@ -163,13 +176,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
19557   * 2) Drop lock->wait_lock
19558   * 3) Try to unlock the lock with cmpxchg
19559   */
19560 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
19561 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
19562 +                                       unsigned long flags)
19563         __releases(lock->wait_lock)
19565         struct task_struct *owner = rt_mutex_owner(lock);
19567         clear_rt_mutex_waiters(lock);
19568 -       raw_spin_unlock(&lock->wait_lock);
19569 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19570         /*
19571          * If a new waiter comes in between the unlock and the cmpxchg
19572          * we have two situations:
19573 @@ -211,11 +225,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
19574  /*
19575   * Simple slow path only version: lock->owner is protected by lock->wait_lock.
19576   */
19577 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
19578 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
19579 +                                       unsigned long flags)
19580         __releases(lock->wait_lock)
19582         lock->owner = NULL;
19583 -       raw_spin_unlock(&lock->wait_lock);
19584 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19585         return true;
19587  #endif
19588 @@ -412,6 +427,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
19589         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
19592 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
19594 +       if (waiter->savestate)
19595 +               wake_up_lock_sleeper(waiter->task);
19596 +       else
19597 +               wake_up_process(waiter->task);
19600  /*
19601   * Max number of times we'll walk the boosting chain:
19602   */
19603 @@ -419,7 +442,8 @@ int max_lock_depth = 1024;
19605  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
19607 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
19608 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
19609 +               p->pi_blocked_on->lock : NULL;
19612  /*
19613 @@ -497,7 +521,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19614         int ret = 0, depth = 0;
19615         struct rt_mutex *lock;
19616         bool detect_deadlock;
19617 -       unsigned long flags;
19618         bool requeue = true;
19620         detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
19621 @@ -540,7 +563,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19622         /*
19623          * [1] Task cannot go away as we did a get_task() before !
19624          */
19625 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
19626 +       raw_spin_lock_irq(&task->pi_lock);
19628         /*
19629          * [2] Get the waiter on which @task is blocked on.
19630 @@ -556,7 +579,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19631          * reached or the state of the chain has changed while we
19632          * dropped the locks.
19633          */
19634 -       if (!waiter)
19635 +       if (!rt_mutex_real_waiter(waiter))
19636                 goto out_unlock_pi;
19638         /*
19639 @@ -624,7 +647,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19640          * operations.
19641          */
19642         if (!raw_spin_trylock(&lock->wait_lock)) {
19643 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19644 +               raw_spin_unlock_irq(&task->pi_lock);
19645                 cpu_relax();
19646                 goto retry;
19647         }
19648 @@ -655,7 +678,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19649                 /*
19650                  * No requeue[7] here. Just release @task [8]
19651                  */
19652 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19653 +               raw_spin_unlock(&task->pi_lock);
19654                 put_task_struct(task);
19656                 /*
19657 @@ -663,14 +686,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19658                  * If there is no owner of the lock, end of chain.
19659                  */
19660                 if (!rt_mutex_owner(lock)) {
19661 -                       raw_spin_unlock(&lock->wait_lock);
19662 +                       raw_spin_unlock_irq(&lock->wait_lock);
19663                         return 0;
19664                 }
19666                 /* [10] Grab the next task, i.e. owner of @lock */
19667                 task = rt_mutex_owner(lock);
19668                 get_task_struct(task);
19669 -               raw_spin_lock_irqsave(&task->pi_lock, flags);
19670 +               raw_spin_lock(&task->pi_lock);
19672                 /*
19673                  * No requeue [11] here. We just do deadlock detection.
19674 @@ -685,8 +708,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19675                 top_waiter = rt_mutex_top_waiter(lock);
19677                 /* [13] Drop locks */
19678 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19679 -               raw_spin_unlock(&lock->wait_lock);
19680 +               raw_spin_unlock(&task->pi_lock);
19681 +               raw_spin_unlock_irq(&lock->wait_lock);
19683                 /* If owner is not blocked, end of chain. */
19684                 if (!next_lock)
19685 @@ -707,7 +730,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19686         rt_mutex_enqueue(lock, waiter);
19688         /* [8] Release the task */
19689 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19690 +       raw_spin_unlock(&task->pi_lock);
19691         put_task_struct(task);
19693         /*
19694 @@ -718,21 +741,24 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19695          * follow here. This is the end of the chain we are walking.
19696          */
19697         if (!rt_mutex_owner(lock)) {
19698 +               struct rt_mutex_waiter *lock_top_waiter;
19700                 /*
19701                  * If the requeue [7] above changed the top waiter,
19702                  * then we need to wake the new top waiter up to try
19703                  * to get the lock.
19704                  */
19705 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
19706 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
19707 -               raw_spin_unlock(&lock->wait_lock);
19708 +               lock_top_waiter = rt_mutex_top_waiter(lock);
19709 +               if (prerequeue_top_waiter != lock_top_waiter)
19710 +                       rt_mutex_wake_waiter(lock_top_waiter);
19711 +               raw_spin_unlock_irq(&lock->wait_lock);
19712                 return 0;
19713         }
19715         /* [10] Grab the next task, i.e. the owner of @lock */
19716         task = rt_mutex_owner(lock);
19717         get_task_struct(task);
19718 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
19719 +       raw_spin_lock(&task->pi_lock);
19721         /* [11] requeue the pi waiters if necessary */
19722         if (waiter == rt_mutex_top_waiter(lock)) {
19723 @@ -786,8 +812,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19724         top_waiter = rt_mutex_top_waiter(lock);
19726         /* [13] Drop the locks */
19727 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19728 -       raw_spin_unlock(&lock->wait_lock);
19729 +       raw_spin_unlock(&task->pi_lock);
19730 +       raw_spin_unlock_irq(&lock->wait_lock);
19732         /*
19733          * Make the actual exit decisions [12], based on the stored
19734 @@ -810,28 +836,46 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19735         goto again;
19737   out_unlock_pi:
19738 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19739 +       raw_spin_unlock_irq(&task->pi_lock);
19740   out_put_task:
19741         put_task_struct(task);
19743         return ret;
19747 +#define STEAL_NORMAL  0
19748 +#define STEAL_LATERAL 1
19751 + * Note that RT tasks are excluded from lateral-steals to prevent the
19752 + * introduction of an unbounded latency
19753 + */
19754 +static inline int lock_is_stealable(struct task_struct *task,
19755 +                                   struct task_struct *pendowner, int mode)
19757 +    if (mode == STEAL_NORMAL || rt_task(task)) {
19758 +           if (task->prio >= pendowner->prio)
19759 +                   return 0;
19760 +    } else if (task->prio > pendowner->prio)
19761 +           return 0;
19762 +    return 1;
19765  /*
19766   * Try to take an rt-mutex
19767   *
19768 - * Must be called with lock->wait_lock held.
19769 + * Must be called with lock->wait_lock held and interrupts disabled
19770   *
19771   * @lock:   The lock to be acquired.
19772   * @task:   The task which wants to acquire the lock
19773   * @waiter: The waiter that is queued to the lock's wait tree if the
19774   *         callsite called task_blocked_on_lock(), otherwise NULL
19775   */
19776 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19777 -                               struct rt_mutex_waiter *waiter)
19778 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
19779 +                                 struct task_struct *task,
19780 +                                 struct rt_mutex_waiter *waiter, int mode)
19782 -       unsigned long flags;
19784         /*
19785          * Before testing whether we can acquire @lock, we set the
19786          * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
19787 @@ -867,8 +911,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19788                  * If waiter is not the highest priority waiter of
19789                  * @lock, give up.
19790                  */
19791 -               if (waiter != rt_mutex_top_waiter(lock))
19792 +               if (waiter != rt_mutex_top_waiter(lock)) {
19793 +                       /* XXX lock_is_stealable() ? */
19794                         return 0;
19795 +               }
19797                 /*
19798                  * We can acquire the lock. Remove the waiter from the
19799 @@ -886,14 +932,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19800                  * not need to be dequeued.
19801                  */
19802                 if (rt_mutex_has_waiters(lock)) {
19803 -                       /*
19804 -                        * If @task->prio is greater than or equal to
19805 -                        * the top waiter priority (kernel view),
19806 -                        * @task lost.
19807 -                        */
19808 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
19809 -                               return 0;
19810 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
19812 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
19813 +                               return 0;
19814                         /*
19815                          * The current top waiter stays enqueued. We
19816                          * don't have to change anything in the lock
19817 @@ -916,7 +958,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19818          * case, but conditionals are more expensive than a redundant
19819          * store.
19820          */
19821 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
19822 +       raw_spin_lock(&task->pi_lock);
19823         task->pi_blocked_on = NULL;
19824         /*
19825          * Finish the lock acquisition. @task is the new owner. If
19826 @@ -925,7 +967,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19827          */
19828         if (rt_mutex_has_waiters(lock))
19829                 rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
19830 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19831 +       raw_spin_unlock(&task->pi_lock);
19833  takeit:
19834         /* We got the lock. */
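
Editorial note: the STEAL_LATERAL mode introduced above is what the RT spin_lock slow path (added in the next hunk) passes to __try_to_take_rt_mutex(): a spinning, non-realtime task whose priority equals that of the current top waiter may take the lock instead of queueing behind it, saving a wakeup/schedule round trip. Concretely, with two SCHED_OTHER contenders both at prio 120, lock_is_stealable() returns 1 under STEAL_LATERAL and 0 under STEAL_NORMAL; a realtime task is always held to the strict "strictly higher priority" test, so lateral steals cannot add unbounded latency ahead of an RT waiter.
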
19835 @@ -942,12 +984,444 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19836         return 1;
19839 +#ifdef CONFIG_PREEMPT_RT_FULL
19841 + * preemptible spin_lock functions:
19842 + */
19843 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
19844 +                                        void  (*slowfn)(struct rt_mutex *lock,
19845 +                                                        bool mg_off),
19846 +                                        bool do_mig_dis)
19848 +       might_sleep_no_state_check();
19850 +       if (do_mig_dis)
19851 +               migrate_disable();
19853 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
19854 +               rt_mutex_deadlock_account_lock(lock, current);
19855 +       else
19856 +               slowfn(lock, do_mig_dis);
19859 +static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
19860 +                                          int  (*slowfn)(struct rt_mutex *lock))
19862 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
19863 +               rt_mutex_deadlock_account_unlock(current);
19864 +               return 0;
19865 +       }
19866 +       return slowfn(lock);
19868 +#ifdef CONFIG_SMP
19870 + * Note that owner is a speculative pointer and dereferencing relies
19871 + * on rcu_read_lock() and the check against the lock owner.
19872 + */
19873 +static int adaptive_wait(struct rt_mutex *lock,
19874 +                        struct task_struct *owner)
19876 +       int res = 0;
19878 +       rcu_read_lock();
19879 +       for (;;) {
19880 +               if (owner != rt_mutex_owner(lock))
19881 +                       break;
19882 +               /*
19883 +                * Ensure that owner->on_cpu is dereferenced _after_
19884 +                * checking the above to be valid.
19885 +                */
19886 +               barrier();
19887 +               if (!owner->on_cpu) {
19888 +                       res = 1;
19889 +                       break;
19890 +               }
19891 +               cpu_relax();
19892 +       }
19893 +       rcu_read_unlock();
19894 +       return res;
19896 +#else
19897 +static int adaptive_wait(struct rt_mutex *lock,
19898 +                        struct task_struct *orig_owner)
19900 +       return 1;
19902 +#endif
19904 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19905 +                                  struct rt_mutex_waiter *waiter,
19906 +                                  struct task_struct *task,
19907 +                                  enum rtmutex_chainwalk chwalk);
19909 + * Slow path lock function spin_lock style: this variant is very
19910 + * careful not to miss any non-lock wakeups.
19911 + *
19912 + * We store the current state under p->pi_lock in p->saved_state and
19913 + * the try_to_wake_up() code handles this accordingly.
19914 + */
19915 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
19916 +                                                   bool mg_off)
19918 +       struct task_struct *lock_owner, *self = current;
19919 +       struct rt_mutex_waiter waiter, *top_waiter;
19920 +       unsigned long flags;
19921 +       int ret;
19923 +       rt_mutex_init_waiter(&waiter, true);
19925 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19927 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
19928 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19929 +               return;
19930 +       }
19932 +       BUG_ON(rt_mutex_owner(lock) == self);
19934 +       /*
19935 +        * We save whatever state the task is in and we'll restore it
19936 +        * after acquiring the lock taking real wakeups into account
19937 +        * as well. We are serialized via pi_lock against wakeups. See
19938 +        * try_to_wake_up().
19939 +        */
19940 +       raw_spin_lock(&self->pi_lock);
19941 +       self->saved_state = self->state;
19942 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19943 +       raw_spin_unlock(&self->pi_lock);
19945 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
19946 +       BUG_ON(ret);
19948 +       for (;;) {
19949 +               /* Try to acquire the lock again. */
19950 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
19951 +                       break;
19953 +               top_waiter = rt_mutex_top_waiter(lock);
19954 +               lock_owner = rt_mutex_owner(lock);
19956 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19958 +               debug_rt_mutex_print_deadlock(&waiter);
19960 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
19961 +                       if (mg_off)
19962 +                               migrate_enable();
19963 +                       schedule();
19964 +                       if (mg_off)
19965 +                               migrate_disable();
19966 +               }
19968 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
19970 +               raw_spin_lock(&self->pi_lock);
19971 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19972 +               raw_spin_unlock(&self->pi_lock);
19973 +       }
19975 +       /*
19976 +        * Restore the task state to current->saved_state. We set it
19977 +        * to the original state above and the try_to_wake_up() code
19978 +        * has possibly updated it when a real (non-rtmutex) wakeup
19979 +        * happened while we were blocked. Clear saved_state so
19980 +        * try_to_wake_up() does not get confused.
19981 +        */
19982 +       raw_spin_lock(&self->pi_lock);
19983 +       __set_current_state_no_track(self->saved_state);
19984 +       self->saved_state = TASK_RUNNING;
19985 +       raw_spin_unlock(&self->pi_lock);
19987 +       /*
19988 +        * try_to_take_rt_mutex() sets the waiter bit
19989 +        * unconditionally. We might have to fix that up:
19990 +        */
19991 +       fixup_rt_mutex_waiters(lock);
19993 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
19994 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
19996 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19998 +       debug_rt_mutex_free_waiter(&waiter);
20001 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
20002 +                                   struct wake_q_head *wake_sleeper_q,
20003 +                                   struct rt_mutex *lock);
20005 + * Slow path to release a rt_mutex spin_lock style
20006 + */
20007 +static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
20009 +       unsigned long flags;
20010 +       WAKE_Q(wake_q);
20011 +       WAKE_Q(wake_sleeper_q);
20013 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
20015 +       debug_rt_mutex_unlock(lock);
20017 +       rt_mutex_deadlock_account_unlock(current);
20019 +       if (!rt_mutex_has_waiters(lock)) {
20020 +               lock->owner = NULL;
20021 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20022 +               return 0;
20023 +       }
20025 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
20027 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20028 +       wake_up_q(&wake_q);
20029 +       wake_up_q_sleeper(&wake_sleeper_q);
20031 +       /* Undo pi boosting when necessary */
20032 +       rt_mutex_adjust_prio(current);
20033 +       return 0;
20036 +static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
20038 +       unsigned long flags;
20039 +       WAKE_Q(wake_q);
20040 +       WAKE_Q(wake_sleeper_q);
20042 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
20044 +       debug_rt_mutex_unlock(lock);
20046 +       rt_mutex_deadlock_account_unlock(current);
20048 +       if (!rt_mutex_has_waiters(lock)) {
20049 +               lock->owner = NULL;
20050 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20051 +               return 0;
20052 +       }
20054 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
20056 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20057 +       wake_up_q(&wake_q);
20058 +       wake_up_q_sleeper(&wake_sleeper_q);
20059 +       return 1;
20062 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
20064 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
20065 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
20067 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
20069 +void __lockfunc rt_spin_lock(spinlock_t *lock)
20071 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
20072 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
20074 +EXPORT_SYMBOL(rt_spin_lock);
20076 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
20078 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
20080 +EXPORT_SYMBOL(__rt_spin_lock);
20082 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
20084 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
20086 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
20088 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
20089 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
20091 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
20092 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
20094 +EXPORT_SYMBOL(rt_spin_lock_nested);
20095 +#endif
20097 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
20099 +       /* NOTE: we always pass in '1' for nested, for simplicity */
20100 +       spin_release(&lock->dep_map, 1, _RET_IP_);
20101 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
20103 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
20105 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
20107 +       /* NOTE: we always pass in '1' for nested, for simplicity */
20108 +       spin_release(&lock->dep_map, 1, _RET_IP_);
20109 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
20110 +       migrate_enable();
20112 +EXPORT_SYMBOL(rt_spin_unlock);
20114 +int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
20116 +       int ret;
20118 +       /* NOTE: we always pass in '1' for nested, for simplicity */
20119 +       spin_release(&lock->dep_map, 1, _RET_IP_);
20120 +       ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
20121 +       migrate_enable();
20122 +       return ret;
20125 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
20127 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
20129 +EXPORT_SYMBOL(__rt_spin_unlock);
20132 + * Wait for the lock to get unlocked: instead of polling for an unlock
20133 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
20134 + * schedule if there's contention:
20135 + */
20136 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
20138 +       spin_lock(lock);
20139 +       spin_unlock(lock);
20141 +EXPORT_SYMBOL(rt_spin_unlock_wait);
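The lock-then-unlock trick is easiest to see outside the kernel. A minimal userspace analogue (illustrative only, POSIX threads instead of the RT spinlock API):

#include <pthread.h>

/* Block until whoever currently holds *m has dropped it, by briefly
 * becoming a holder ourselves -- the same idea as rt_spin_unlock_wait(). */
static void mutex_unlock_wait(pthread_mutex_t *m)
{
        pthread_mutex_lock(m);
        pthread_mutex_unlock(m);
}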
20143 +int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
20145 +       return rt_mutex_trylock(lock);
20148 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
20150 +       int ret;
20152 +       ret = rt_mutex_trylock(&lock->lock);
20153 +       if (ret)
20154 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
20155 +       return ret;
20157 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
20159 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
20161 +       int ret;
20163 +       migrate_disable();
20164 +       ret = rt_mutex_trylock(&lock->lock);
20165 +       if (ret)
20166 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
20167 +       else
20168 +               migrate_enable();
20169 +       return ret;
20171 +EXPORT_SYMBOL(rt_spin_trylock);
20173 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
20175 +       int ret;
20177 +       local_bh_disable();
20178 +       ret = rt_mutex_trylock(&lock->lock);
20179 +       if (ret) {
20180 +               migrate_disable();
20181 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
20182 +       } else
20183 +               local_bh_enable();
20184 +       return ret;
20186 +EXPORT_SYMBOL(rt_spin_trylock_bh);
20188 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
20190 +       int ret;
20192 +       *flags = 0;
20193 +       ret = rt_mutex_trylock(&lock->lock);
20194 +       if (ret) {
20195 +               migrate_disable();
20196 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
20197 +       }
20198 +       return ret;
20200 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
20202 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
20204 +       /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
20205 +       if (atomic_add_unless(atomic, -1, 1))
20206 +               return 0;
20207 +       rt_spin_lock(lock);
20208 +       if (atomic_dec_and_test(atomic))
20209 +               return 1;
20210 +       rt_spin_unlock(lock);
20211 +       return 0;
20213 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
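As a usage sketch (kernel context assumed; struct foo, foo_list and foo_put are hypothetical and not part of this patch), atomic_dec_and_spin_lock() is used exactly like atomic_dec_and_lock(): the lock is only taken for the final reference drop, so the unlink-and-free path runs under it.

struct foo {
        atomic_t         refcnt;
        struct list_head node;
};

static LIST_HEAD(foo_list);
static DEFINE_SPINLOCK(foo_list_lock);  /* a sleeping "spinlock" on RT */

static void foo_put(struct foo *f)
{
        /* Fast path: just drop the reference.  Only when it would hit
         * zero is foo_list_lock taken, and it is held when 1 is returned. */
        if (atomic_dec_and_spin_lock(&f->refcnt, &foo_list_lock)) {
                list_del(&f->node);
                spin_unlock(&foo_list_lock);
                kfree(f);
        }
}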
20215 +       void
20216 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
20218 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
20219 +       /*
20220 +        * Make sure we are not reinitializing a held lock:
20221 +        */
20222 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
20223 +       lockdep_init_map(&lock->dep_map, name, key, 0);
20224 +#endif
20226 +EXPORT_SYMBOL(__rt_spin_lock_init);
20228 +#endif /* PREEMPT_RT_FULL */
20230 +#ifdef CONFIG_PREEMPT_RT_FULL
20231 +       static inline int __sched
20232 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
20234 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
20235 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
20237 +       if (!hold_ctx)
20238 +               return 0;
20240 +       if (unlikely(ctx == hold_ctx))
20241 +               return -EALREADY;
20243 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
20244 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
20245 +#ifdef CONFIG_DEBUG_MUTEXES
20246 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
20247 +               ctx->contending_lock = ww;
20248 +#endif
20249 +               return -EDEADLK;
20250 +       }
20252 +       return 0;
20254 +#else
20255 +       static inline int __sched
20256 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
20258 +       BUG();
20259 +       return 0;
20262 +#endif
20264 +static inline int
20265 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
20266 +                    struct rt_mutex_waiter *waiter)
20268 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
20271  /*
20272   * Task blocks on lock.
20273   *
20274   * Prepare waiter and propagate pi chain
20275   *
20276 - * This must be called with lock->wait_lock held.
20277 + * This must be called with lock->wait_lock held and interrupts disabled
20278   */
20279  static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20280                                    struct rt_mutex_waiter *waiter,
20281 @@ -958,7 +1432,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20282         struct rt_mutex_waiter *top_waiter = waiter;
20283         struct rt_mutex *next_lock;
20284         int chain_walk = 0, res;
20285 -       unsigned long flags;
20287         /*
20288          * Early deadlock detection. We really don't want the task to
20289 @@ -972,7 +1445,24 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20290         if (owner == task)
20291                 return -EDEADLK;
20293 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
20294 +       raw_spin_lock(&task->pi_lock);
20296 +       /*
20297 +        * In the case of futex requeue PI, this will be a proxy
20298 +        * lock. The task will wake unaware that it is enqueued on
20299 +        * this lock. Avoid blocking on two locks and corrupting
20300 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
20301 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
20302 +        * before requeue (due to a signal or timeout). Do not enqueue
20303 +        * the task if PI_WAKEUP_INPROGRESS is set.
20304 +        */
20305 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
20306 +               raw_spin_unlock(&task->pi_lock);
20307 +               return -EAGAIN;
20308 +       }
20310 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
20312         __rt_mutex_adjust_prio(task);
20313         waiter->task = task;
20314         waiter->lock = lock;
20315 @@ -985,18 +1475,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20317         task->pi_blocked_on = waiter;
20319 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20320 +       raw_spin_unlock(&task->pi_lock);
20322         if (!owner)
20323                 return 0;
20325 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
20326 +       raw_spin_lock(&owner->pi_lock);
20327         if (waiter == rt_mutex_top_waiter(lock)) {
20328                 rt_mutex_dequeue_pi(owner, top_waiter);
20329                 rt_mutex_enqueue_pi(owner, waiter);
20331                 __rt_mutex_adjust_prio(owner);
20332 -               if (owner->pi_blocked_on)
20333 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
20334                         chain_walk = 1;
20335         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
20336                 chain_walk = 1;
20337 @@ -1005,7 +1495,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20338         /* Store the lock on which owner is blocked or NULL */
20339         next_lock = task_blocked_on_lock(owner);
20341 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
20342 +       raw_spin_unlock(&owner->pi_lock);
20343         /*
20344          * Even if full deadlock detection is on, if the owner is not
20345          * blocked itself, we can avoid finding this out in the chain
20346 @@ -1021,12 +1511,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20347          */
20348         get_task_struct(owner);
20350 -       raw_spin_unlock(&lock->wait_lock);
20351 +       raw_spin_unlock_irq(&lock->wait_lock);
20353         res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
20354                                          next_lock, waiter, task);
20356 -       raw_spin_lock(&lock->wait_lock);
20357 +       raw_spin_lock_irq(&lock->wait_lock);
20359         return res;
20361 @@ -1035,15 +1525,15 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20362   * Remove the top waiter from the current tasks pi waiter tree and
20363   * queue it up.
20364   *
20365 - * Called with lock->wait_lock held.
20366 + * Called with lock->wait_lock held and interrupts disabled.
20367   */
20368  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
20369 +                                   struct wake_q_head *wake_sleeper_q,
20370                                     struct rt_mutex *lock)
20372         struct rt_mutex_waiter *waiter;
20373 -       unsigned long flags;
20375 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
20376 +       raw_spin_lock(&current->pi_lock);
20378         waiter = rt_mutex_top_waiter(lock);
20380 @@ -1065,15 +1555,18 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
20381          */
20382         lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
20384 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
20385 +       raw_spin_unlock(&current->pi_lock);
20387 -       wake_q_add(wake_q, waiter->task);
20388 +       if (waiter->savestate)
20389 +               wake_q_add_sleeper(wake_sleeper_q, waiter->task);
20390 +       else
20391 +               wake_q_add(wake_q, waiter->task);
20394  /*
20395   * Remove a waiter from a lock and give up
20396   *
20397 - * Must be called with lock->wait_lock held and
20398 + * Must be called with lock->wait_lock held and interrupts disabled. The
20399 + * caller must have just failed to try_to_take_rt_mutex().
20400   */
20401  static void remove_waiter(struct rt_mutex *lock,
20402 @@ -1081,13 +1574,12 @@ static void remove_waiter(struct rt_mutex *lock,
20404         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
20405         struct task_struct *owner = rt_mutex_owner(lock);
20406 -       struct rt_mutex *next_lock;
20407 -       unsigned long flags;
20408 +       struct rt_mutex *next_lock = NULL;
20410 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
20411 +       raw_spin_lock(&current->pi_lock);
20412         rt_mutex_dequeue(lock, waiter);
20413         current->pi_blocked_on = NULL;
20414 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
20415 +       raw_spin_unlock(&current->pi_lock);
20417         /*
20418          * Only update priority if the waiter was the highest priority
20419 @@ -1096,7 +1588,7 @@ static void remove_waiter(struct rt_mutex *lock,
20420         if (!owner || !is_top_waiter)
20421                 return;
20423 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
20424 +       raw_spin_lock(&owner->pi_lock);
20426         rt_mutex_dequeue_pi(owner, waiter);
20428 @@ -1106,9 +1598,10 @@ static void remove_waiter(struct rt_mutex *lock,
20429         __rt_mutex_adjust_prio(owner);
20431         /* Store the lock on which owner is blocked or NULL */
20432 -       next_lock = task_blocked_on_lock(owner);
20433 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
20434 +               next_lock = task_blocked_on_lock(owner);
20436 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
20437 +       raw_spin_unlock(&owner->pi_lock);
20439         /*
20440          * Don't walk the chain, if the owner task is not blocked
20441 @@ -1120,12 +1613,12 @@ static void remove_waiter(struct rt_mutex *lock,
20442         /* gets dropped in rt_mutex_adjust_prio_chain()! */
20443         get_task_struct(owner);
20445 -       raw_spin_unlock(&lock->wait_lock);
20446 +       raw_spin_unlock_irq(&lock->wait_lock);
20448         rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
20449                                    next_lock, NULL, current);
20451 -       raw_spin_lock(&lock->wait_lock);
20452 +       raw_spin_lock_irq(&lock->wait_lock);
20455  /*
20456 @@ -1142,17 +1635,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
20457         raw_spin_lock_irqsave(&task->pi_lock, flags);
20459         waiter = task->pi_blocked_on;
20460 -       if (!waiter || (waiter->prio == task->prio &&
20461 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
20462                         !dl_prio(task->prio))) {
20463                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20464                 return;
20465         }
20466         next_lock = waiter->lock;
20467 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20469         /* gets dropped in rt_mutex_adjust_prio_chain()! */
20470         get_task_struct(task);
20472 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20473         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
20474                                    next_lock, NULL, task);
20476 @@ -1161,16 +1654,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
20477   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
20478   * @lock:               the rt_mutex to take
20479   * @state:              the state the task should block in (TASK_INTERRUPTIBLE
20480 - *                      or TASK_UNINTERRUPTIBLE)
20481 + *                      or TASK_UNINTERRUPTIBLE)
20482   * @timeout:            the pre-initialized and started timer, or NULL for none
20483   * @waiter:             the pre-initialized rt_mutex_waiter
20484   *
20485 - * lock->wait_lock must be held by the caller.
20486 + * Must be called with lock->wait_lock held and interrupts disabled
20487   */
20488  static int __sched
20489  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
20490                     struct hrtimer_sleeper *timeout,
20491 -                   struct rt_mutex_waiter *waiter)
20492 +                   struct rt_mutex_waiter *waiter,
20493 +                   struct ww_acquire_ctx *ww_ctx)
20495         int ret = 0;
20497 @@ -1179,27 +1673,28 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
20498                 if (try_to_take_rt_mutex(lock, current, waiter))
20499                         break;
20501 -               /*
20502 -                * TASK_INTERRUPTIBLE checks for signals and
20503 -                * timeout. Ignored otherwise.
20504 -                */
20505 -               if (unlikely(state == TASK_INTERRUPTIBLE)) {
20506 -                       /* Signal pending? */
20507 -                       if (signal_pending(current))
20508 -                               ret = -EINTR;
20509 -                       if (timeout && !timeout->task)
20510 -                               ret = -ETIMEDOUT;
20511 +               if (timeout && !timeout->task) {
20512 +                       ret = -ETIMEDOUT;
20513 +                       break;
20514 +               }
20515 +               if (signal_pending_state(state, current)) {
20516 +                       ret = -EINTR;
20517 +                       break;
20518 +               }
20520 +               if (ww_ctx && ww_ctx->acquired > 0) {
20521 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
20522                         if (ret)
20523                                 break;
20524                 }
20526 -               raw_spin_unlock(&lock->wait_lock);
20527 +               raw_spin_unlock_irq(&lock->wait_lock);
20529                 debug_rt_mutex_print_deadlock(waiter);
20531                 schedule();
20533 -               raw_spin_lock(&lock->wait_lock);
20534 +               raw_spin_lock_irq(&lock->wait_lock);
20535                 set_current_state(state);
20536         }
20538 @@ -1227,26 +1722,112 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
20539         }
20542 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
20543 +                                                  struct ww_acquire_ctx *ww_ctx)
20545 +#ifdef CONFIG_DEBUG_MUTEXES
20546 +       /*
20547 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
20548 +        * but released with a normal mutex_unlock in this call.
20549 +        *
20550 +        * This should never happen, always use ww_mutex_unlock.
20551 +        */
20552 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
20554 +       /*
20555 +        * Not quite done after calling ww_acquire_done() ?
20556 +        */
20557 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
20559 +       if (ww_ctx->contending_lock) {
20560 +               /*
20561 +                * After -EDEADLK you tried to
20562 +                * acquire a different ww_mutex? Bad!
20563 +                */
20564 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
20566 +               /*
20567 +                * You called ww_mutex_lock after receiving -EDEADLK,
20568 +                * but 'forgot' to unlock everything else first?
20569 +                */
20570 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
20571 +               ww_ctx->contending_lock = NULL;
20572 +       }
20574 +       /*
20575 +        * Naughty, using a different class will lead to undefined behavior!
20576 +        */
20577 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
20578 +#endif
20579 +       ww_ctx->acquired++;
20582 +#ifdef CONFIG_PREEMPT_RT_FULL
20583 +static void ww_mutex_account_lock(struct rt_mutex *lock,
20584 +                                 struct ww_acquire_ctx *ww_ctx)
20586 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
20587 +       struct rt_mutex_waiter *waiter, *n;
20589 +       /*
20590 +        * This branch gets optimized out for the common case,
20591 +        * and is only important for ww_mutex_lock.
20592 +        */
20593 +       ww_mutex_lock_acquired(ww, ww_ctx);
20594 +       ww->ctx = ww_ctx;
20596 +       /*
20597 +        * Give any possible sleeping processes the chance to wake up,
20598 +        * so they can recheck if they have to back off.
20599 +        */
20600 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
20601 +                                            tree_entry) {
20602 +               /* XXX debug rt mutex waiter wakeup */
20604 +               BUG_ON(waiter->lock != lock);
20605 +               rt_mutex_wake_waiter(waiter);
20606 +       }
20609 +#else
20611 +static void ww_mutex_account_lock(struct rt_mutex *lock,
20612 +                                 struct ww_acquire_ctx *ww_ctx)
20614 +       BUG();
20616 +#endif
20618  /*
20619   * Slow path lock function:
20620   */
20621  static int __sched
20622  rt_mutex_slowlock(struct rt_mutex *lock, int state,
20623                   struct hrtimer_sleeper *timeout,
20624 -                 enum rtmutex_chainwalk chwalk)
20625 +                 enum rtmutex_chainwalk chwalk,
20626 +                 struct ww_acquire_ctx *ww_ctx)
20628         struct rt_mutex_waiter waiter;
20629 +       unsigned long flags;
20630         int ret = 0;
20632 -       debug_rt_mutex_init_waiter(&waiter);
20633 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
20634 -       RB_CLEAR_NODE(&waiter.tree_entry);
20635 +       rt_mutex_init_waiter(&waiter, false);
20637 -       raw_spin_lock(&lock->wait_lock);
20638 +       /*
20639 +        * Technically we could use raw_spin_[un]lock_irq() here, but this can
20640 +        * be called in early boot if the cmpxchg() fast path is disabled
20641 +        * (debug, no architecture support). In this case we will acquire the
20642 +        * rtmutex with lock->wait_lock held. But we cannot unconditionally
20643 +        * enable interrupts in that early boot case. So we need to use the
20644 +        * irqsave/restore variants.
20645 +        */
20646 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
20648         /* Try to acquire the lock again: */
20649         if (try_to_take_rt_mutex(lock, current, NULL)) {
20650 -               raw_spin_unlock(&lock->wait_lock);
20651 +               if (ww_ctx)
20652 +                       ww_mutex_account_lock(lock, ww_ctx);
20653 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20654                 return 0;
20655         }
20657 @@ -1260,13 +1841,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
20659         if (likely(!ret))
20660                 /* sleep on the mutex */
20661 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
20662 +               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
20663 +                                         ww_ctx);
20664 +       else if (ww_ctx) {
20665 +               /* ww_mutex received EDEADLK, let it become EALREADY */
20666 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
20667 +               BUG_ON(!ret);
20668 +       }
20670         if (unlikely(ret)) {
20671                 __set_current_state(TASK_RUNNING);
20672                 if (rt_mutex_has_waiters(lock))
20673                         remove_waiter(lock, &waiter);
20674 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
20675 +               /* ww_mutex wants to report EDEADLK/EALREADY, so let it */
20676 +               if (!ww_ctx)
20677 +                       rt_mutex_handle_deadlock(ret, chwalk, &waiter);
20678 +       } else if (ww_ctx) {
20679 +               ww_mutex_account_lock(lock, ww_ctx);
20680         }
20682         /*
20683 @@ -1275,7 +1866,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
20684          */
20685         fixup_rt_mutex_waiters(lock);
20687 -       raw_spin_unlock(&lock->wait_lock);
20688 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20690         /* Remove pending timer: */
20691         if (unlikely(timeout))
20692 @@ -1291,6 +1882,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
20693   */
20694  static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
20696 +       unsigned long flags;
20697         int ret;
20699         /*
20700 @@ -1302,10 +1894,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
20701                 return 0;
20703         /*
20704 -        * The mutex has currently no owner. Lock the wait lock and
20705 -        * try to acquire the lock.
20706 +        * The mutex currently has no owner. Lock the wait lock and try to
20707 +        * acquire the lock. We use irqsave here to support early boot calls.
20708          */
20709 -       raw_spin_lock(&lock->wait_lock);
20710 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
20712         ret = try_to_take_rt_mutex(lock, current, NULL);
20714 @@ -1315,7 +1907,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
20715          */
20716         fixup_rt_mutex_waiters(lock);
20718 -       raw_spin_unlock(&lock->wait_lock);
20719 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20721         return ret;
20723 @@ -1325,9 +1917,13 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
20724   * Return whether the current task needs to undo a potential priority boosting.
20725   */
20726  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
20727 -                                       struct wake_q_head *wake_q)
20728 +                                       struct wake_q_head *wake_q,
20729 +                                       struct wake_q_head *wake_sleeper_q)
20731 -       raw_spin_lock(&lock->wait_lock);
20732 +       unsigned long flags;
20734 +       /* irqsave required to support early boot calls */
20735 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
20737         debug_rt_mutex_unlock(lock);
20739 @@ -1366,10 +1962,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
20740          */
20741         while (!rt_mutex_has_waiters(lock)) {
20742                 /* Drops lock->wait_lock ! */
20743 -               if (unlock_rt_mutex_safe(lock) == true)
20744 +               if (unlock_rt_mutex_safe(lock, flags) == true)
20745                         return false;
20746                 /* Relock the rtmutex and try again */
20747 -               raw_spin_lock(&lock->wait_lock);
20748 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
20749         }
20751         /*
20752 @@ -1378,9 +1974,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
20753          *
20754          * Queue the next waiter for wakeup once we release the wait_lock.
20755          */
20756 -       mark_wakeup_next_waiter(wake_q, lock);
20757 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
20759 -       raw_spin_unlock(&lock->wait_lock);
20760 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20762         /* check PI boosting */
20763         return true;
20764 @@ -1394,31 +1990,50 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
20765   */
20766  static inline int
20767  rt_mutex_fastlock(struct rt_mutex *lock, int state,
20768 +                 struct ww_acquire_ctx *ww_ctx,
20769                   int (*slowfn)(struct rt_mutex *lock, int state,
20770                                 struct hrtimer_sleeper *timeout,
20771 -                               enum rtmutex_chainwalk chwalk))
20772 +                               enum rtmutex_chainwalk chwalk,
20773 +                               struct ww_acquire_ctx *ww_ctx))
20775         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
20776                 rt_mutex_deadlock_account_lock(lock, current);
20777                 return 0;
20778 -       } else
20779 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
20780 +       }
20782 +       /*
20783 +        * If rt_mutex blocks, the function sched_submit_work will not call
20784 +        * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
20785 +        * We must call blk_schedule_flush_plug() here; otherwise a deadlock
20786 +        * in device mapper may happen.
20787 +        */
20788 +       if (unlikely(blk_needs_flush_plug(current)))
20789 +               blk_schedule_flush_plug(current);
20791 +       return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
20792 +                     ww_ctx);
20795  static inline int
20796  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
20797                         struct hrtimer_sleeper *timeout,
20798                         enum rtmutex_chainwalk chwalk,
20799 +                       struct ww_acquire_ctx *ww_ctx,
20800                         int (*slowfn)(struct rt_mutex *lock, int state,
20801                                       struct hrtimer_sleeper *timeout,
20802 -                                     enum rtmutex_chainwalk chwalk))
20803 +                                     enum rtmutex_chainwalk chwalk,
20804 +                                     struct ww_acquire_ctx *ww_ctx))
20806         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
20807             likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
20808                 rt_mutex_deadlock_account_lock(lock, current);
20809                 return 0;
20810 -       } else
20811 -               return slowfn(lock, state, timeout, chwalk);
20812 +       }
20814 +       if (unlikely(blk_needs_flush_plug(current)))
20815 +               blk_schedule_flush_plug(current);
20817 +       return slowfn(lock, state, timeout, chwalk, ww_ctx);
20820  static inline int
20821 @@ -1435,17 +2050,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
20822  static inline void
20823  rt_mutex_fastunlock(struct rt_mutex *lock,
20824                     bool (*slowfn)(struct rt_mutex *lock,
20825 -                                  struct wake_q_head *wqh))
20826 +                                  struct wake_q_head *wqh,
20827 +                                  struct wake_q_head *wq_sleeper))
20829         WAKE_Q(wake_q);
20830 +       WAKE_Q(wake_sleeper_q);
20832         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
20833                 rt_mutex_deadlock_account_unlock(current);
20835         } else {
20836 -               bool deboost = slowfn(lock, &wake_q);
20837 +               bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
20839                 wake_up_q(&wake_q);
20840 +               wake_up_q_sleeper(&wake_sleeper_q);
20842                 /* Undo pi boosting if necessary: */
20843                 if (deboost)
20844 @@ -1462,7 +2080,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
20846         might_sleep();
20848 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
20849 +       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
20851  EXPORT_SYMBOL_GPL(rt_mutex_lock);
20853 @@ -1479,7 +2097,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
20855         might_sleep();
20857 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
20858 +       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
20860  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
20862 @@ -1492,10 +2110,29 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
20863         might_sleep();
20865         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
20866 -                                      RT_MUTEX_FULL_CHAINWALK,
20867 +                                      RT_MUTEX_FULL_CHAINWALK, NULL,
20868                                        rt_mutex_slowlock);
20871 +/**
20872 + * rt_mutex_lock_killable - lock an rt_mutex, interruptible only by fatal signals
20873 + *
20874 + * @lock:              the rt_mutex to be locked
20876 + *
20877 + * Returns:
20878 + *  0          on success
20879 + * -EINTR      when interrupted by a fatal signal
20880 + * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
20881 + */
20882 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
20884 +       might_sleep();
20886 +       return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
20888 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
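A minimal call-site sketch (my_dev and my_dev_do_ioctl are hypothetical): a fatal signal aborts the lock attempt instead of leaving the task in an unkillable sleep.

static int my_dev_ioctl_locked(struct my_dev *dev)
{
        int ret;

        ret = rt_mutex_lock_killable(&dev->lock);
        if (ret)
                return ret;     /* -EINTR: the task is being killed */

        ret = my_dev_do_ioctl(dev);
        rt_mutex_unlock(&dev->lock);
        return ret;
}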
20890  /**
20891   * rt_mutex_timed_lock - lock a rt_mutex interruptible
20892   *                     the timeout structure is provided
20893 @@ -1516,6 +2153,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
20895         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
20896                                        RT_MUTEX_MIN_CHAINWALK,
20897 +                                      NULL,
20898                                        rt_mutex_slowlock);
20900  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
20901 @@ -1533,7 +2171,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
20902   */
20903  int __sched rt_mutex_trylock(struct rt_mutex *lock)
20905 +#ifdef CONFIG_PREEMPT_RT_FULL
20906 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
20907 +#else
20908         if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
20909 +#endif
20910                 return 0;
20912         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
20913 @@ -1559,13 +2201,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
20914   * required or not.
20915   */
20916  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
20917 -                                  struct wake_q_head *wqh)
20918 +                                  struct wake_q_head *wqh,
20919 +                                  struct wake_q_head *wq_sleeper)
20921         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
20922                 rt_mutex_deadlock_account_unlock(current);
20923                 return false;
20924         }
20925 -       return rt_mutex_slowunlock(lock, wqh);
20926 +       return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
20929  /**
20930 @@ -1598,13 +2241,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
20931  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
20933         lock->owner = NULL;
20934 -       raw_spin_lock_init(&lock->wait_lock);
20935         lock->waiters = RB_ROOT;
20936         lock->waiters_leftmost = NULL;
20938         debug_rt_mutex_init(lock, name);
20940 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
20941 +EXPORT_SYMBOL(__rt_mutex_init);
20943  /**
20944   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
20945 @@ -1619,7 +2261,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
20946  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
20947                                 struct task_struct *proxy_owner)
20949 -       __rt_mutex_init(lock, NULL);
20950 +       rt_mutex_init(lock);
20951         debug_rt_mutex_proxy_lock(lock, proxy_owner);
20952         rt_mutex_set_owner(lock, proxy_owner);
20953         rt_mutex_deadlock_account_lock(lock, proxy_owner);
20954 @@ -1660,13 +2302,42 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
20956         int ret;
20958 -       raw_spin_lock(&lock->wait_lock);
20959 +       raw_spin_lock_irq(&lock->wait_lock);
20961         if (try_to_take_rt_mutex(lock, task, NULL)) {
20962 -               raw_spin_unlock(&lock->wait_lock);
20963 +               raw_spin_unlock_irq(&lock->wait_lock);
20964                 return 1;
20965         }
20967 +#ifdef CONFIG_PREEMPT_RT_FULL
20968 +       /*
20969 +        * In PREEMPT_RT there's an added race.
20970 +        * If the task that we are about to requeue times out,
20971 +        * it can set PI_WAKEUP_INPROGRESS. This tells the requeue code
20972 +        * to skip this task. But right after the task sets
20973 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS, it can then
20974 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
20975 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
20976 +        * lock that it blocks on. We *must not* place this task
20977 +        * on this proxy lock in that case.
20978 +        *
20979 +        * To prevent this race, we first take the task's pi_lock
20980 +        * and check if it has updated its pi_blocked_on. If it has,
20981 +        * we assume that it woke up and we return -EAGAIN.
20982 +        * Otherwise, we set the task's pi_blocked_on to
20983 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
20984 +        * it will know that we are in the process of requeuing it.
20985 +        */
20986 +       raw_spin_lock(&task->pi_lock);
20987 +       if (task->pi_blocked_on) {
20988 +               raw_spin_unlock(&task->pi_lock);
20989 +               raw_spin_unlock_irq(&lock->wait_lock);
20990 +               return -EAGAIN;
20991 +       }
20992 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
20993 +       raw_spin_unlock(&task->pi_lock);
20994 +#endif
20996         /* We enforce deadlock detection for futexes */
20997         ret = task_blocks_on_rt_mutex(lock, waiter, task,
20998                                       RT_MUTEX_FULL_CHAINWALK);
20999 @@ -1681,10 +2352,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
21000                 ret = 0;
21001         }
21003 -       if (unlikely(ret))
21004 +       if (ret && rt_mutex_has_waiters(lock))
21005                 remove_waiter(lock, waiter);
21007 -       raw_spin_unlock(&lock->wait_lock);
21008 +       raw_spin_unlock_irq(&lock->wait_lock);
21010         debug_rt_mutex_print_deadlock(waiter);
21012 @@ -1732,12 +2403,12 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
21014         int ret;
21016 -       raw_spin_lock(&lock->wait_lock);
21017 +       raw_spin_lock_irq(&lock->wait_lock);
21019         set_current_state(TASK_INTERRUPTIBLE);
21021         /* sleep on the mutex */
21022 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
21023 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
21025         if (unlikely(ret))
21026                 remove_waiter(lock, waiter);
21027 @@ -1748,7 +2419,93 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
21028          */
21029         fixup_rt_mutex_waiters(lock);
21031 -       raw_spin_unlock(&lock->wait_lock);
21032 +       raw_spin_unlock_irq(&lock->wait_lock);
21034 +       return ret;
21037 +static inline int
21038 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
21040 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
21041 +       unsigned tmp;
21043 +       if (ctx->deadlock_inject_countdown-- == 0) {
21044 +               tmp = ctx->deadlock_inject_interval;
21045 +               if (tmp > UINT_MAX/4)
21046 +                       tmp = UINT_MAX;
21047 +               else
21048 +                       tmp = tmp*2 + tmp + tmp/2;
21050 +               ctx->deadlock_inject_interval = tmp;
21051 +               ctx->deadlock_inject_countdown = tmp;
21052 +               ctx->contending_lock = lock;
21054 +               ww_mutex_unlock(lock);
21056 +               return -EDEADLK;
21057 +       }
21058 +#endif
21060 +       return 0;
21063 +#ifdef CONFIG_PREEMPT_RT_FULL
21064 +int __sched
21065 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
21067 +       int ret;
21069 +       might_sleep();
21071 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
21072 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
21073 +       if (ret)
21074 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
21075 +       else if (!ret && ww_ctx->acquired > 1)
21076 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
21078 +       return ret;
21080 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
21082 +int __sched
21083 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
21085 +       int ret;
21087 +       might_sleep();
21089 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
21090 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
21091 +       if (ret)
21092 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
21093 +       else if (!ret && ww_ctx->acquired > 1)
21094 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
21096         return ret;
21098 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
21100 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
21102 +       int nest = !!lock->ctx;
21104 +       /*
21105 +        * The unlocking fastpath is the 0->1 transition from 'locked'
21106 +        * into 'unlocked' state:
21107 +        */
21108 +       if (nest) {
21109 +#ifdef CONFIG_DEBUG_MUTEXES
21110 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
21111 +#endif
21112 +               if (lock->ctx->acquired > 0)
21113 +                       lock->ctx->acquired--;
21114 +               lock->ctx = NULL;
21115 +       }
21117 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
21118 +       rt_mutex_unlock(&lock->base.lock);
21120 +EXPORT_SYMBOL(ww_mutex_unlock);
21121 +#endif
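For context, the -EDEADLK/-EALREADY plumbing above exists so callers can run the usual wait/wound retry loop on top of the rtmutex-based ww_mutex. A sketch under assumed names (demo_ww_class and lock_both are not part of this patch):

static DEFINE_WW_CLASS(demo_ww_class);

static void lock_both(struct ww_mutex *a, struct ww_mutex *b)
{
        struct ww_acquire_ctx ctx;
        struct ww_mutex *first = a, *second = b;

        ww_acquire_init(&ctx, &demo_ww_class);

        ww_mutex_lock(first, &ctx);
        while (ww_mutex_lock(second, &ctx) == -EDEADLK) {
                /* We are the younger context: drop what we hold, sleep
                 * on the contended lock, then retry in that order. */
                ww_mutex_unlock(first);
                ww_mutex_lock_slow(second, &ctx);
                swap(first, second);
        }
        ww_acquire_done(&ctx);

        /* ... both objects locked, do the work ... */

        ww_mutex_unlock(a);
        ww_mutex_unlock(b);
        ww_acquire_fini(&ctx);
}

With CONFIG_DEBUG_WW_MUTEX_SLOWPATH, ww_mutex_deadlock_injection() above forces this backoff path to run even without real contention, which is why the loop must tolerate spurious -EDEADLK.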
21122 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
21123 index e317e1cbb3eb..f457c7574920 100644
21124 --- a/kernel/locking/rtmutex_common.h
21125 +++ b/kernel/locking/rtmutex_common.h
21126 @@ -27,6 +27,7 @@ struct rt_mutex_waiter {
21127         struct rb_node          pi_tree_entry;
21128         struct task_struct      *task;
21129         struct rt_mutex         *lock;
21130 +       bool                    savestate;
21131  #ifdef CONFIG_DEBUG_RT_MUTEXES
21132         unsigned long           ip;
21133         struct pid              *deadlock_task_pid;
21134 @@ -98,6 +99,9 @@ enum rtmutex_chainwalk {
21135  /*
21136   * PI-futex support (proxy locking functions, etc.):
21137   */
21138 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
21139 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
21141  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
21142  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
21143                                        struct task_struct *proxy_owner);
21144 @@ -111,7 +115,8 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
21145                                       struct rt_mutex_waiter *waiter);
21146  extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
21147  extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
21148 -                                 struct wake_q_head *wqh);
21149 +                                 struct wake_q_head *wqh,
21150 +                                 struct wake_q_head *wq_sleeper);
21151  extern void rt_mutex_adjust_prio(struct task_struct *task);
21153  #ifdef CONFIG_DEBUG_RT_MUTEXES
21154 @@ -120,4 +125,14 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
21155  # include "rtmutex.h"
21156  #endif
21158 +static inline void
21159 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
21161 +       debug_rt_mutex_init_waiter(waiter);
21162 +       waiter->task = NULL;
21163 +       waiter->savestate = savestate;
21164 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
21165 +       RB_CLEAR_NODE(&waiter->tree_entry);
21168  #endif
21169 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
21170 index db3ccb1dd614..909779647bd1 100644
21171 --- a/kernel/locking/spinlock.c
21172 +++ b/kernel/locking/spinlock.c
21173 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
21174   *         __[spin|read|write]_lock_bh()
21175   */
21176  BUILD_LOCK_OPS(spin, raw_spinlock);
21178 +#ifndef CONFIG_PREEMPT_RT_FULL
21179  BUILD_LOCK_OPS(read, rwlock);
21180  BUILD_LOCK_OPS(write, rwlock);
21181 +#endif
21183  #endif
21185 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
21186  EXPORT_SYMBOL(_raw_spin_unlock_bh);
21187  #endif
21189 +#ifndef CONFIG_PREEMPT_RT_FULL
21191  #ifndef CONFIG_INLINE_READ_TRYLOCK
21192  int __lockfunc _raw_read_trylock(rwlock_t *lock)
21194 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
21195  EXPORT_SYMBOL(_raw_write_unlock_bh);
21196  #endif
21198 +#endif /* !PREEMPT_RT_FULL */
21200  #ifdef CONFIG_DEBUG_LOCK_ALLOC
21202  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
21203 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
21204 index 0374a596cffa..94970338d518 100644
21205 --- a/kernel/locking/spinlock_debug.c
21206 +++ b/kernel/locking/spinlock_debug.c
21207 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
21209  EXPORT_SYMBOL(__raw_spin_lock_init);
21211 +#ifndef CONFIG_PREEMPT_RT_FULL
21212  void __rwlock_init(rwlock_t *lock, const char *name,
21213                    struct lock_class_key *key)
21215 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
21218  EXPORT_SYMBOL(__rwlock_init);
21219 +#endif
21221  static void spin_dump(raw_spinlock_t *lock, const char *msg)
21223 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
21224         arch_spin_unlock(&lock->raw_lock);
21227 +#ifndef CONFIG_PREEMPT_RT_FULL
21228  static void rwlock_bug(rwlock_t *lock, const char *msg)
21230         if (!debug_locks_off())
21231 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
21232         debug_write_unlock(lock);
21233         arch_write_unlock(&lock->raw_lock);
21236 +#endif
21237 diff --git a/kernel/module.c b/kernel/module.c
21238 index 0a56098d3738..33c1954b07ed 100644
21239 --- a/kernel/module.c
21240 +++ b/kernel/module.c
21241 @@ -682,16 +682,7 @@ static void percpu_modcopy(struct module *mod,
21242                 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
21245 -/**
21246 - * is_module_percpu_address - test whether address is from module static percpu
21247 - * @addr: address to test
21248 - *
21249 - * Test whether @addr belongs to module static percpu area.
21250 - *
21251 - * RETURNS:
21252 - * %true if @addr is from module static percpu area
21253 - */
21254 -bool is_module_percpu_address(unsigned long addr)
21255 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
21257         struct module *mod;
21258         unsigned int cpu;
21259 @@ -705,9 +696,11 @@ bool is_module_percpu_address(unsigned long addr)
21260                         continue;
21261                 for_each_possible_cpu(cpu) {
21262                         void *start = per_cpu_ptr(mod->percpu, cpu);
21263 +                       void *va = (void *)addr;
21265 -                       if ((void *)addr >= start &&
21266 -                           (void *)addr < start + mod->percpu_size) {
21267 +                       if (va >= start && va < start + mod->percpu_size) {
21268 +                               if (can_addr)
21269 +                                       *can_addr = (unsigned long) (va - start);
21270                                 preempt_enable();
21271                                 return true;
21272                         }
21273 @@ -718,6 +711,20 @@ bool is_module_percpu_address(unsigned long addr)
21274         return false;
21277 +/**
21278 + * is_module_percpu_address - test whether address is from module static percpu
21279 + * @addr: address to test
21280 + *
21281 + * Test whether @addr belongs to module static percpu area.
21282 + *
21283 + * RETURNS:
21284 + * %true if @addr is from module static percpu area
21285 + */
21286 +bool is_module_percpu_address(unsigned long addr)
21288 +       return __is_module_percpu_address(addr, NULL);
21291  #else /* ... !CONFIG_SMP */
21293  static inline void __percpu *mod_percpu(struct module *mod)
21294 @@ -749,6 +756,11 @@ bool is_module_percpu_address(unsigned long addr)
21295         return false;
21298 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
21300 +       return false;
21303  #endif /* CONFIG_SMP */
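A short sketch of the new variant (report_percpu is hypothetical): unlike is_module_percpu_address(), it also hands back the offset of the address within the module's per-CPU area, which a caller such as lockdep can use to canonicalize per-CPU addresses across CPUs.

static void report_percpu(unsigned long addr)
{
        unsigned long off;

        if (__is_module_percpu_address(addr, &off))
                pr_debug("%lx is module per-cpu data at offset %lx\n",
                         addr, off);
}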
21305  #define MODINFO_ATTR(field)    \
21306 diff --git a/kernel/panic.c b/kernel/panic.c
21307 index 1d07cf9af849..e182564e6533 100644
21308 --- a/kernel/panic.c
21309 +++ b/kernel/panic.c
21310 @@ -61,6 +61,37 @@ void __weak panic_smp_self_stop(void)
21311                 cpu_relax();
21315 + * Stop ourselves in NMI context if another CPU has already panicked. Arch code
21316 + * may override this to prepare for crash dumping, e.g. save regs info.
21317 + */
21318 +void __weak nmi_panic_self_stop(struct pt_regs *regs)
21320 +       panic_smp_self_stop();
21323 +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
21326 + * A variant of panic() called from NMI context. We return if we've already
21327 + * panicked on this CPU. If another CPU already panicked, loop in
21328 + * nmi_panic_self_stop() which can provide architecture dependent code such
21329 + * as saving register state for crash dump.
21330 + */
21331 +void nmi_panic(struct pt_regs *regs, const char *msg)
21333 +       int old_cpu, cpu;
21335 +       cpu = raw_smp_processor_id();
21336 +       old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
21338 +       if (old_cpu == PANIC_CPU_INVALID)
21339 +               panic("%s", msg);
21340 +       else if (old_cpu != cpu)
21341 +               nmi_panic_self_stop(regs);
21343 +EXPORT_SYMBOL(nmi_panic);
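The single-winner election is easiest to see in isolation. A userspace C11 analogue (illustrative only; thread ids stand in for CPU ids):

#include <stdatomic.h>
#include <stdio.h>

#define ID_INVALID (-1)

static atomic_int panic_owner = ID_INVALID;

/* Whoever swaps panic_owner from ID_INVALID to its own id "panics";
 * re-entry by the same id just returns, everyone else parks itself. */
static void nmi_panic_demo(int my_id)
{
        int old = ID_INVALID;

        if (atomic_compare_exchange_strong(&panic_owner, &old, my_id))
                printf("%d: handling the panic\n", my_id);
        else if (old != my_id)
                for (;;)
                        ;       /* like nmi_panic_self_stop()/panic_smp_self_stop() */
}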
21345  /**
21346   *     panic - halt the system
21347   *     @fmt: The text string to print
21348 @@ -71,17 +102,17 @@ void __weak panic_smp_self_stop(void)
21349   */
21350  void panic(const char *fmt, ...)
21352 -       static DEFINE_SPINLOCK(panic_lock);
21353         static char buf[1024];
21354         va_list args;
21355         long i, i_next = 0;
21356         int state = 0;
21357 +       int old_cpu, this_cpu;
21359         /*
21360          * Disable local interrupts. This will prevent panic_smp_self_stop
21361          * from deadlocking the first cpu that invokes the panic, since
21362          * there is nothing to prevent an interrupt handler (that runs
21363 -        * after the panic_lock is acquired) from invoking panic again.
21364 +        * after setting panic_cpu) from invoking panic() again.
21365          */
21366         local_irq_disable();
21368 @@ -94,8 +125,16 @@ void panic(const char *fmt, ...)
21369          * multiple parallel invocations of panic, all other CPUs either
21370          * stop themself or will wait until they are stopped by the 1st CPU
21371          * with smp_send_stop().
21372 +        *
21373 +        * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
21374 +        * comes here, so go ahead.
21375 +        * `old_cpu == this_cpu' means we came from nmi_panic() which sets
21376 +        * panic_cpu to this CPU.  In this case, this is also the 1st CPU.
21377          */
21378 -       if (!spin_trylock(&panic_lock))
21379 +       this_cpu = raw_smp_processor_id();
21380 +       old_cpu  = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
21382 +       if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
21383                 panic_smp_self_stop();
21385         console_verbose();
21386 @@ -400,9 +439,11 @@ static u64 oops_id;
21388  static int init_oops_id(void)
21390 +#ifndef CONFIG_PREEMPT_RT_FULL
21391         if (!oops_id)
21392                 get_random_bytes(&oops_id, sizeof(oops_id));
21393         else
21394 +#endif
21395                 oops_id++;
21397         return 0;
21398 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
21399 index 3124cebaec31..c1b981521dd0 100644
21400 --- a/kernel/power/hibernate.c
21401 +++ b/kernel/power/hibernate.c
21402 @@ -285,6 +285,8 @@ static int create_image(int platform_mode)
21404         local_irq_disable();
21406 +       system_state = SYSTEM_SUSPEND;
21408         error = syscore_suspend();
21409         if (error) {
21410                 printk(KERN_ERR "PM: Some system devices failed to power down, "
21411 @@ -314,6 +316,7 @@ static int create_image(int platform_mode)
21412         syscore_resume();
21414   Enable_irqs:
21415 +       system_state = SYSTEM_RUNNING;
21416         local_irq_enable();
21418   Enable_cpus:
21419 @@ -438,6 +441,7 @@ static int resume_target_kernel(bool platform_mode)
21420                 goto Enable_cpus;
21422         local_irq_disable();
21423 +       system_state = SYSTEM_SUSPEND;
21425         error = syscore_suspend();
21426         if (error)
21427 @@ -471,6 +475,7 @@ static int resume_target_kernel(bool platform_mode)
21428         syscore_resume();
21430   Enable_irqs:
21431 +       system_state = SYSTEM_RUNNING;
21432         local_irq_enable();
21434   Enable_cpus:
21435 @@ -556,6 +561,7 @@ int hibernation_platform_enter(void)
21436                 goto Enable_cpus;
21438         local_irq_disable();
21439 +       system_state = SYSTEM_SUSPEND;
21440         syscore_suspend();
21441         if (pm_wakeup_pending()) {
21442                 error = -EAGAIN;
21443 @@ -568,6 +574,7 @@ int hibernation_platform_enter(void)
21445   Power_up:
21446         syscore_resume();
21447 +       system_state = SYSTEM_RUNNING;
21448         local_irq_enable();
21450   Enable_cpus:
21451 @@ -642,6 +649,10 @@ static void power_down(void)
21452                 cpu_relax();
21455 +#ifndef CONFIG_SUSPEND
21456 +bool pm_in_action;
21457 +#endif
21459  /**
21460   * hibernate - Carry out system hibernation, including saving the image.
21461   */
21462 @@ -654,6 +665,8 @@ int hibernate(void)
21463                 return -EPERM;
21464         }
21466 +       pm_in_action = true;
21468         lock_system_sleep();
21469         /* The snapshot device should not be opened while we're running */
21470         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
21471 @@ -719,6 +732,7 @@ int hibernate(void)
21472         atomic_inc(&snapshot_device_available);
21473   Unlock:
21474         unlock_system_sleep();
21475 +       pm_in_action = false;
21476         return error;
21479 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
21480 index f9fe133c13e2..393bc342c586 100644
21481 --- a/kernel/power/suspend.c
21482 +++ b/kernel/power/suspend.c
21483 @@ -359,6 +359,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
21484         arch_suspend_disable_irqs();
21485         BUG_ON(!irqs_disabled());
21487 +       system_state = SYSTEM_SUSPEND;
21489         error = syscore_suspend();
21490         if (!error) {
21491                 *wakeup = pm_wakeup_pending();
21492 @@ -375,6 +377,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
21493                 syscore_resume();
21494         }
21496 +       system_state = SYSTEM_RUNNING;
21498         arch_suspend_enable_irqs();
21499         BUG_ON(irqs_disabled());
21501 @@ -518,6 +522,8 @@ static int enter_state(suspend_state_t state)
21502         return error;
21505 +bool pm_in_action;
21507  /**
21508   * pm_suspend - Externally visible function for suspending the system.
21509   * @state: System sleep state to enter.
21510 @@ -532,6 +538,8 @@ int pm_suspend(suspend_state_t state)
21511         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
21512                 return -EINVAL;
21514 +       pm_in_action = true;
21516         error = enter_state(state);
21517         if (error) {
21518                 suspend_stats.fail++;
21519 @@ -539,6 +547,7 @@ int pm_suspend(suspend_state_t state)
21520         } else {
21521                 suspend_stats.success++;
21522         }
21523 +       pm_in_action = false;
21524         return error;
21526  EXPORT_SYMBOL(pm_suspend);
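The hunks above only flip system_state around the window in which the low-level suspend path runs with interrupts disabled. A minimal sketch of the kind of check other code can then make (my_dev and its work item are hypothetical):

static void my_dev_poke(struct my_dev *dev)
{
        /* The syscore_suspend()/syscore_resume() window runs with IRQs
         * off and timekeeping suspended, so skip the kick there. */
        if (system_state == SYSTEM_SUSPEND)
                return;

        queue_work(system_wq, &dev->work);
}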
21527 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
21528 index 0b5613554769..99deb2617308 100644
21529 --- a/kernel/printk/printk.c
21530 +++ b/kernel/printk/printk.c
21531 @@ -241,6 +241,65 @@ struct printk_log {
21532   */
21533  static DEFINE_RAW_SPINLOCK(logbuf_lock);
21535 +#ifdef CONFIG_EARLY_PRINTK
21536 +struct console *early_console;
21538 +static void early_vprintk(const char *fmt, va_list ap)
21540 +       if (early_console) {
21541 +               char buf[512];
21542 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
21544 +               early_console->write(early_console, buf, n);
21545 +       }
21548 +asmlinkage void early_printk(const char *fmt, ...)
21550 +       va_list ap;
21552 +       va_start(ap, fmt);
21553 +       early_vprintk(fmt, ap);
21554 +       va_end(ap);
21558 + * This is independent of any log levels - a global
21559 + * kill switch that turns off all of printk.
21560 + *
21561 + * Used by the NMI watchdog if early-printk is enabled.
21562 + */
21563 +static bool __read_mostly printk_killswitch;
21565 +static int __init force_early_printk_setup(char *str)
21567 +       printk_killswitch = true;
21568 +       return 0;
21570 +early_param("force_early_printk", force_early_printk_setup);
21572 +void printk_kill(void)
21574 +       printk_killswitch = true;
21577 +#ifdef CONFIG_PRINTK
21578 +static int forced_early_printk(const char *fmt, va_list ap)
21580 +       if (!printk_killswitch)
21581 +               return 0;
21582 +       early_vprintk(fmt, ap);
21583 +       return 1;
21585 +#endif
21587 +#else
21588 +static inline int forced_early_printk(const char *fmt, va_list ap)
21590 +       return 0;
21592 +#endif
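A usage sketch (my_watchdog_report is hypothetical): once a debugging facility decides the regular printk path can no longer be trusted, it flips the kill switch and reports through the early console instead. The same effect is available at boot via the force_early_printk parameter added above.

void my_watchdog_report(int cpu)
{
        printk_kill();          /* from here on printk() output is diverted */
        early_printk("watchdog: CPU%d appears stuck\n", cpu);
}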
21594  #ifdef CONFIG_PRINTK
21595  DECLARE_WAIT_QUEUE_HEAD(log_wait);
21596  /* the next printk record to read by syslog(READ) or /proc/kmsg */
21597 @@ -1203,6 +1262,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21599         char *text;
21600         int len = 0;
21601 +       int attempts = 0;
21603         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
21604         if (!text)
21605 @@ -1214,7 +1274,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21606                 u64 seq;
21607                 u32 idx;
21608                 enum log_flags prev;
21610 +               int num_msg;
21611 +try_again:
21612 +               attempts++;
21613 +               if (attempts > 10) {
21614 +                       len = -EBUSY;
21615 +                       goto out;
21616 +               }
21617 +               num_msg = 0;
21618                 if (clear_seq < log_first_seq) {
21619                         /* messages are gone, move to first available one */
21620                         clear_seq = log_first_seq;
21621 @@ -1235,6 +1302,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21622                         prev = msg->flags;
21623                         idx = log_next(idx);
21624                         seq++;
21625 +                       num_msg++;
21626 +                       if (num_msg > 5) {
21627 +                               num_msg = 0;
21628 +                               raw_spin_unlock_irq(&logbuf_lock);
21629 +                               raw_spin_lock_irq(&logbuf_lock);
21630 +                               if (clear_seq < log_first_seq)
21631 +                                       goto try_again;
21632 +                       }
21633                 }
21635                 /* move first record forward until length fits into the buffer */
21636 @@ -1248,6 +1323,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21637                         prev = msg->flags;
21638                         idx = log_next(idx);
21639                         seq++;
21640 +                       num_msg++;
21641 +                       if (num_msg > 5) {
21642 +                               num_msg = 0;
21643 +                               raw_spin_unlock_irq(&logbuf_lock);
21644 +                               raw_spin_lock_irq(&logbuf_lock);
21645 +                               if (clear_seq < log_first_seq)
21646 +                                       goto try_again;
21647 +                       }
21648                 }
21650                 /* last message fitting into this dump */
21651 @@ -1288,6 +1371,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21652                 clear_seq = log_next_seq;
21653                 clear_idx = log_next_idx;
21654         }
21655 +out:
21656         raw_spin_unlock_irq(&logbuf_lock);
21658         kfree(text);
21659 @@ -1443,6 +1527,12 @@ static void call_console_drivers(int level,
21660         if (!console_drivers)
21661                 return;
21663 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
21664 +               if (in_irq() || in_nmi())
21665 +                       return;
21666 +       }
21668 +       migrate_disable();
21669         for_each_console(con) {
21670                 if (exclusive_console && con != exclusive_console)
21671                         continue;
21672 @@ -1458,6 +1548,7 @@ static void call_console_drivers(int level,
21673                 else
21674                         con->write(con, text, len);
21675         }
21676 +       migrate_enable();
21677  }
21679  /*
21680 @@ -1518,6 +1609,15 @@ static inline int can_use_console(unsigned int cpu)
21681  static int console_trylock_for_printk(void)
21682  {
21683         unsigned int cpu = smp_processor_id();
21684 +#ifdef CONFIG_PREEMPT_RT_FULL
21685 +       int lock = !early_boot_irqs_disabled && (preempt_count() == 0) &&
21686 +               !irqs_disabled();
21687 +#else
21688 +       int lock = 1;
21689 +#endif
21690 +
21691 +       if (!lock)
21692 +               return 0;
21694         if (!console_trylock())
21695                 return 0;
21696 @@ -1672,6 +1772,13 @@ asmlinkage int vprintk_emit(int facility, int level,
21697         /* cpu currently holding logbuf_lock in this function */
21698         static unsigned int logbuf_cpu = UINT_MAX;
21700 +       /*
21701 +        * Fall back to early_printk if a debugging subsystem has
21702 +        * killed printk output
21703 +        */
21704 +       if (unlikely(forced_early_printk(fmt, args)))
21705 +               return 1;
21707         if (level == LOGLEVEL_SCHED) {
21708                 level = LOGLEVEL_DEFAULT;
21709                 in_sched = true;
21710 @@ -1813,8 +1920,7 @@ asmlinkage int vprintk_emit(int facility, int level,
21711                  * console_sem which would prevent anyone from printing to
21712                  * console
21713                  */
21714 -               preempt_disable();
21716 +               migrate_disable();
21717                 /*
21718                  * Try to acquire and then immediately release the console
21719                  * semaphore.  The release will print out buffers and wake up
21720 @@ -1822,7 +1928,7 @@ asmlinkage int vprintk_emit(int facility, int level,
21721                  */
21722                 if (console_trylock_for_printk())
21723                         console_unlock();
21724 -               preempt_enable();
21725 +               migrate_enable();
21726                 lockdep_on();
21727         }
21729 @@ -1961,26 +2067,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
21731  #endif /* CONFIG_PRINTK */
21733 -#ifdef CONFIG_EARLY_PRINTK
21734 -struct console *early_console;
21735 -
21736 -asmlinkage __visible void early_printk(const char *fmt, ...)
21737 -{
21738 -       va_list ap;
21739 -       char buf[512];
21740 -       int n;
21741 -
21742 -       if (!early_console)
21743 -               return;
21744 -
21745 -       va_start(ap, fmt);
21746 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
21747 -       va_end(ap);
21748 -
21749 -       early_console->write(early_console, buf, n);
21750 -}
21751 -#endif
21752 -
21753  static int __add_preferred_console(char *name, int idx, char *options,
21754                                    char *brl_options)
21756 @@ -2202,11 +2288,16 @@ static void console_cont_flush(char *text, size_t size)
21757                 goto out;
21759         len = cont_print_text(text, size);
21760 +#ifdef CONFIG_PREEMPT_RT_FULL
21761 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
21762 +       call_console_drivers(cont.level, NULL, 0, text, len);
21763 +#else
21764         raw_spin_unlock(&logbuf_lock);
21765         stop_critical_timings();
21766         call_console_drivers(cont.level, NULL, 0, text, len);
21767         start_critical_timings();
21768         local_irq_restore(flags);
21769 +#endif
21770         return;
21771  out:
21772         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
21773 @@ -2316,13 +2407,17 @@ void console_unlock(void)
21774                 console_idx = log_next(console_idx);
21775                 console_seq++;
21776                 console_prev = msg->flags;
21777 +#ifdef CONFIG_PREEMPT_RT_FULL
21778 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
21779 +               call_console_drivers(level, ext_text, ext_len, text, len);
21780 +#else
21781                 raw_spin_unlock(&logbuf_lock);
21783                 stop_critical_timings();        /* don't trace print latency */
21784                 call_console_drivers(level, ext_text, ext_len, text, len);
21785                 start_critical_timings();
21786                 local_irq_restore(flags);
21788 +#endif
21789                 if (do_cond_resched)
21790                         cond_resched();
21791         }
21792 @@ -2374,6 +2469,11 @@ void console_unblank(void)
21794         struct console *c;
21796 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
21797 +               if (in_irq() || in_nmi())
21798 +                       return;
21799 +       }
21801         /*
21802          * console_unblank can no longer be called in interrupt context unless
21803          * oops_in_progress is set to 1..
21804 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
21805 index 5e2cd1030702..d4467d4eb488 100644
21806 --- a/kernel/ptrace.c
21807 +++ b/kernel/ptrace.c
21808 @@ -142,7 +142,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
21810         spin_lock_irq(&task->sighand->siglock);
21811         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
21812 -               task->state = __TASK_TRACED;
21813 +               unsigned long flags;
21815 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
21816 +               if (task->state & __TASK_TRACED)
21817 +                       task->state = __TASK_TRACED;
21818 +               else
21819 +                       task->saved_state = __TASK_TRACED;
21820 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
21821                 ret = true;
21822         }
21823         spin_unlock_irq(&task->sighand->siglock);
21824 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
21825 index d89328e260df..5bb3364a6284 100644
21826 --- a/kernel/rcu/rcutorture.c
21827 +++ b/kernel/rcu/rcutorture.c
21828 @@ -390,6 +390,7 @@ static struct rcu_torture_ops rcu_ops = {
21829         .name           = "rcu"
21830  };
21832 +#ifndef CONFIG_PREEMPT_RT_FULL
21833  /*
21834   * Definitions for rcu_bh torture testing.
21835   */
21836 @@ -429,6 +430,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
21837         .name           = "rcu_bh"
21838  };
21840 +#else
21841 +static struct rcu_torture_ops rcu_bh_ops = {
21842 +       .ttype          = INVALID_RCU_FLAVOR,
21843 +};
21844 +#endif
21846  /*
21847   * Don't even think about trying any of these in real life!!!
21848   * The names includes "busted", and they really means it!
21849 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
21850 index 8a62cbfe1f2f..f60894c6b35c 100644
21851 --- a/kernel/rcu/tree.c
21852 +++ b/kernel/rcu/tree.c
21853 @@ -56,6 +56,11 @@
21854  #include <linux/random.h>
21855  #include <linux/trace_events.h>
21856  #include <linux/suspend.h>
21857 +#include <linux/delay.h>
21858 +#include <linux/gfp.h>
21859 +#include <linux/oom.h>
21860 +#include <linux/smpboot.h>
21861 +#include "../time/tick-internal.h"
21863  #include "tree.h"
21864  #include "rcu.h"
21865 @@ -266,6 +271,19 @@ void rcu_sched_qs(void)
21866         }
21869 +#ifdef CONFIG_PREEMPT_RT_FULL
21870 +static void rcu_preempt_qs(void);
21871 +
21872 +void rcu_bh_qs(void)
21873 +{
21874 +       unsigned long flags;
21875 +
21876 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
21877 +       local_irq_save(flags);
21878 +       rcu_preempt_qs();
21879 +       local_irq_restore(flags);
21880 +}
21881 +#else
21882  void rcu_bh_qs(void)
21883  {
21884         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
21885 @@ -275,6 +293,7 @@ void rcu_bh_qs(void)
21886                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
21887         }
21889 +#endif
21891  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
21893 @@ -435,11 +454,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
21894  /*
21895   * Return the number of RCU BH batches started thus far for debug & stats.
21896   */
21897 +#ifndef CONFIG_PREEMPT_RT_FULL
21898  unsigned long rcu_batches_started_bh(void)
21899  {
21900         return rcu_bh_state.gpnum;
21901  }
21902  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
21903 +#endif
21905  /*
21906   * Return the number of RCU batches completed thus far for debug & stats.
21907 @@ -459,6 +480,7 @@ unsigned long rcu_batches_completed_sched(void)
21909  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
21911 +#ifndef CONFIG_PREEMPT_RT_FULL
21912  /*
21913   * Return the number of RCU BH batches completed thus far for debug & stats.
21914   */
21915 @@ -486,6 +508,13 @@ void rcu_bh_force_quiescent_state(void)
21917  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
21919 +#else
21920 +void rcu_force_quiescent_state(void)
21921 +{
21922 +}
21923 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
21924 +#endif
21926  /*
21927   * Force a quiescent state for RCU-sched.
21928   */
21929 @@ -536,9 +565,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
21930         case RCU_FLAVOR:
21931                 rsp = rcu_state_p;
21932                 break;
21933 +#ifndef CONFIG_PREEMPT_RT_FULL
21934         case RCU_BH_FLAVOR:
21935                 rsp = &rcu_bh_state;
21936                 break;
21937 +#endif
21938         case RCU_SCHED_FLAVOR:
21939                 rsp = &rcu_sched_state;
21940                 break;
21941 @@ -1602,7 +1633,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
21942         int needmore;
21943         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
21945 -       rcu_nocb_gp_cleanup(rsp, rnp);
21946         rnp->need_future_gp[c & 0x1] = 0;
21947         needmore = rnp->need_future_gp[(c + 1) & 0x1];
21948         trace_rcu_future_gp(rnp, rdp, c,
21949 @@ -1623,7 +1653,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
21950             !READ_ONCE(rsp->gp_flags) ||
21951             !rsp->gp_kthread)
21952                 return;
21953 -       wake_up(&rsp->gp_wq);
21954 +       swake_up(&rsp->gp_wq);
21957  /*
21958 @@ -2003,6 +2033,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
21959         int nocb = 0;
21960         struct rcu_data *rdp;
21961         struct rcu_node *rnp = rcu_get_root(rsp);
21962 +       struct swait_queue_head *sq;
21964         WRITE_ONCE(rsp->gp_activity, jiffies);
21965         raw_spin_lock_irq(&rnp->lock);
21966 @@ -2041,7 +2072,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
21967                         needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
21968                 /* smp_mb() provided by prior unlock-lock pair. */
21969                 nocb += rcu_future_gp_cleanup(rsp, rnp);
21970 +               sq = rcu_nocb_gp_get(rnp);
21971                 raw_spin_unlock_irq(&rnp->lock);
21972 +               rcu_nocb_gp_cleanup(sq);
21973                 cond_resched_rcu_qs();
21974                 WRITE_ONCE(rsp->gp_activity, jiffies);
21975                 rcu_gp_slow(rsp, gp_cleanup_delay);
21976 @@ -2088,7 +2121,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
21977                                                READ_ONCE(rsp->gpnum),
21978                                                TPS("reqwait"));
21979                         rsp->gp_state = RCU_GP_WAIT_GPS;
21980 -                       wait_event_interruptible(rsp->gp_wq,
21981 +                       swait_event_interruptible(rsp->gp_wq,
21982                                                  READ_ONCE(rsp->gp_flags) &
21983                                                  RCU_GP_FLAG_INIT);
21984                         rsp->gp_state = RCU_GP_DONE_GPS;
21985 @@ -2118,7 +2151,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
21986                                                READ_ONCE(rsp->gpnum),
21987                                                TPS("fqswait"));
21988                         rsp->gp_state = RCU_GP_WAIT_FQS;
21989 -                       ret = wait_event_interruptible_timeout(rsp->gp_wq,
21990 +                       ret = swait_event_interruptible_timeout(rsp->gp_wq,
21991                                         rcu_gp_fqs_check_wake(rsp, &gf), j);
21992                         rsp->gp_state = RCU_GP_DOING_FQS;
21993                         /* Locking provides needed memory barriers. */
21994 @@ -2242,7 +2275,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
21995         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
21996         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
21997         raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
21998 -       rcu_gp_kthread_wake(rsp);
21999 +       swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
22002  /*
22003 @@ -2903,7 +2936,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
22004         }
22005         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
22006         raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
22007 -       rcu_gp_kthread_wake(rsp);
22008 +       swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
22011  /*
22012 @@ -2946,18 +2979,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
22013  /*
22014   * Do RCU core processing for the current CPU.
22015   */
22016 -static void rcu_process_callbacks(struct softirq_action *unused)
22017 +static void rcu_process_callbacks(void)
22019         struct rcu_state *rsp;
22021         if (cpu_is_offline(smp_processor_id()))
22022                 return;
22023 -       trace_rcu_utilization(TPS("Start RCU core"));
22024         for_each_rcu_flavor(rsp)
22025                 __rcu_process_callbacks(rsp);
22026 -       trace_rcu_utilization(TPS("End RCU core"));
22029 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
22030  /*
22031   * Schedule RCU callback invocation.  If the specified type of RCU
22032   * does not support RCU priority boosting, just do a direct call,
22033 @@ -2969,18 +3001,105 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
22035         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
22036                 return;
22037 -       if (likely(!rsp->boost)) {
22038 -               rcu_do_batch(rsp, rdp);
22039 -               return;
22040 -       }
22041 -       invoke_rcu_callbacks_kthread();
22042 +       rcu_do_batch(rsp, rdp);
22045 +static void rcu_wake_cond(struct task_struct *t, int status)
22047 +       /*
22048 +        * If the thread is yielding, only wake it when this
22049 +        * is invoked from idle
22050 +        */
22051 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
22052 +               wake_up_process(t);
22056 + * Wake up this CPU's rcuc kthread to do RCU core processing.
22057 + */
22058  static void invoke_rcu_core(void)
22060 -       if (cpu_online(smp_processor_id()))
22061 -               raise_softirq(RCU_SOFTIRQ);
22062 +       unsigned long flags;
22063 +       struct task_struct *t;
22065 +       if (!cpu_online(smp_processor_id()))
22066 +               return;
22067 +       local_irq_save(flags);
22068 +       __this_cpu_write(rcu_cpu_has_work, 1);
22069 +       t = __this_cpu_read(rcu_cpu_kthread_task);
22070 +       if (t != NULL && current != t)
22071 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
22072 +       local_irq_restore(flags);
22075 +static void rcu_cpu_kthread_park(unsigned int cpu)
22077 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
22080 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
22082 +       return __this_cpu_read(rcu_cpu_has_work);
22086 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
22087 + * RCU softirq used in flavors and configurations of RCU that do not
22088 + * support RCU priority boosting.
22089 + */
22090 +static void rcu_cpu_kthread(unsigned int cpu)
22092 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
22093 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
22094 +       int spincnt;
22096 +       for (spincnt = 0; spincnt < 10; spincnt++) {
22097 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
22098 +               local_bh_disable();
22099 +               *statusp = RCU_KTHREAD_RUNNING;
22100 +               this_cpu_inc(rcu_cpu_kthread_loops);
22101 +               local_irq_disable();
22102 +               work = *workp;
22103 +               *workp = 0;
22104 +               local_irq_enable();
22105 +               if (work)
22106 +                       rcu_process_callbacks();
22107 +               local_bh_enable();
22108 +               if (*workp == 0) {
22109 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
22110 +                       *statusp = RCU_KTHREAD_WAITING;
22111 +                       return;
22112 +               }
22113 +       }
22114 +       *statusp = RCU_KTHREAD_YIELDING;
22115 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
22116 +       schedule_timeout_interruptible(2);
22117 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
22118 +       *statusp = RCU_KTHREAD_WAITING;
22121 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
22122 +       .store                  = &rcu_cpu_kthread_task,
22123 +       .thread_should_run      = rcu_cpu_kthread_should_run,
22124 +       .thread_fn              = rcu_cpu_kthread,
22125 +       .thread_comm            = "rcuc/%u",
22126 +       .setup                  = rcu_cpu_kthread_setup,
22127 +       .park                   = rcu_cpu_kthread_park,
22131 + * Spawn per-CPU RCU core processing kthreads.
22132 + */
22133 +static int __init rcu_spawn_core_kthreads(void)
22135 +       int cpu;
22137 +       for_each_possible_cpu(cpu)
22138 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
22139 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
22140 +       return 0;
22142 +early_initcall(rcu_spawn_core_kthreads);
22144  /*
22145   * Handle any core-RCU processing required by a call_rcu() invocation.
22146 @@ -3126,6 +3245,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
22148  EXPORT_SYMBOL_GPL(call_rcu_sched);
22150 +#ifndef CONFIG_PREEMPT_RT_FULL
22151  /*
22152   * Queue an RCU callback for invocation after a quicker grace period.
22153   */
22154 @@ -3134,6 +3254,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
22155         __call_rcu(head, func, &rcu_bh_state, -1, 0);
22157  EXPORT_SYMBOL_GPL(call_rcu_bh);
22158 +#endif
22160  /*
22161   * Queue an RCU callback for lazy invocation after a grace period.
22162 @@ -3225,6 +3346,7 @@ void synchronize_sched(void)
22164  EXPORT_SYMBOL_GPL(synchronize_sched);
22166 +#ifndef CONFIG_PREEMPT_RT_FULL
22167  /**
22168   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
22169   *
22170 @@ -3251,6 +3373,7 @@ void synchronize_rcu_bh(void)
22171                 wait_rcu_gp(call_rcu_bh);
22173  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
22174 +#endif
22176  /**
22177   * get_state_synchronize_rcu - Snapshot current RCU state
22178 @@ -3536,7 +3659,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
22179                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
22180                         if (wake) {
22181                                 smp_mb(); /* EGP done before wake_up(). */
22182 -                               wake_up(&rsp->expedited_wq);
22183 +                               swake_up(&rsp->expedited_wq);
22184                         }
22185                         break;
22186                 }
22187 @@ -3793,7 +3916,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
22188         jiffies_start = jiffies;
22190         for (;;) {
22191 -               ret = wait_event_interruptible_timeout(
22192 +               ret = swait_event_timeout(
22193                                 rsp->expedited_wq,
22194                                 sync_rcu_preempt_exp_done(rnp_root),
22195                                 jiffies_stall);
22196 @@ -3801,7 +3924,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
22197                         return;
22198                 if (ret < 0) {
22199                         /* Hit a signal, disable CPU stall warnings. */
22200 -                       wait_event(rsp->expedited_wq,
22201 +                       swait_event(rsp->expedited_wq,
22202                                    sync_rcu_preempt_exp_done(rnp_root));
22203                         return;
22204                 }
22205 @@ -4113,6 +4236,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
22206         mutex_unlock(&rsp->barrier_mutex);
22209 +#ifndef CONFIG_PREEMPT_RT_FULL
22210  /**
22211   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
22212   */
22213 @@ -4121,6 +4245,7 @@ void rcu_barrier_bh(void)
22214         _rcu_barrier(&rcu_bh_state);
22216  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
22217 +#endif
22219  /**
22220   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
22221 @@ -4467,8 +4592,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
22222                 }
22223         }
22225 -       init_waitqueue_head(&rsp->gp_wq);
22226 -       init_waitqueue_head(&rsp->expedited_wq);
22227 +       init_swait_queue_head(&rsp->gp_wq);
22228 +       init_swait_queue_head(&rsp->expedited_wq);
22229         rnp = rsp->level[rcu_num_lvls - 1];
22230         for_each_possible_cpu(i) {
22231                 while (i > rnp->grphi)
22232 @@ -4588,12 +4713,13 @@ void __init rcu_init(void)
22234         rcu_bootup_announce();
22235         rcu_init_geometry();
22236 +#ifndef CONFIG_PREEMPT_RT_FULL
22237         rcu_init_one(&rcu_bh_state, &rcu_bh_data);
22238 +#endif
22239         rcu_init_one(&rcu_sched_state, &rcu_sched_data);
22240         if (dump_tree)
22241                 rcu_dump_rcu_node_tree(&rcu_sched_state);
22242         __rcu_init_preempt();
22243 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
22245         /*
22246          * We don't need protection against CPU-hotplug here because
22247 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
22248 index 9fb4e238d4dc..c75834d8de24 100644
22249 --- a/kernel/rcu/tree.h
22250 +++ b/kernel/rcu/tree.h
22251 @@ -27,6 +27,7 @@
22252  #include <linux/threads.h>
22253  #include <linux/cpumask.h>
22254  #include <linux/seqlock.h>
22255 +#include <linux/swait.h>
22256  #include <linux/stop_machine.h>
22258  /*
22259 @@ -241,7 +242,7 @@ struct rcu_node {
22260                                 /* Refused to boost: not sure why, though. */
22261                                 /*  This can happen due to race conditions. */
22262  #ifdef CONFIG_RCU_NOCB_CPU
22263 -       wait_queue_head_t nocb_gp_wq[2];
22264 +       struct swait_queue_head nocb_gp_wq[2];
22265                                 /* Place for rcu_nocb_kthread() to wait GP. */
22266  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
22267         int need_future_gp[2];
22268 @@ -393,7 +394,7 @@ struct rcu_data {
22269         atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
22270         struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
22271         struct rcu_head **nocb_follower_tail;
22272 -       wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
22273 +       struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
22274         struct task_struct *nocb_kthread;
22275         int nocb_defer_wakeup;          /* Defer wakeup of nocb_kthread. */
22277 @@ -472,7 +473,7 @@ struct rcu_state {
22278         unsigned long gpnum;                    /* Current gp number. */
22279         unsigned long completed;                /* # of last completed gp. */
22280         struct task_struct *gp_kthread;         /* Task for grace periods. */
22281 -       wait_queue_head_t gp_wq;                /* Where GP task waits. */
22282 +       struct swait_queue_head gp_wq;          /* Where GP task waits. */
22283         short gp_flags;                         /* Commands for GP task. */
22284         short gp_state;                         /* GP kthread sleep state. */
22286 @@ -504,7 +505,7 @@ struct rcu_state {
22287         atomic_long_t expedited_workdone3;      /* # done by others #3. */
22288         atomic_long_t expedited_normal;         /* # fallbacks to normal. */
22289         atomic_t expedited_need_qs;             /* # CPUs left to check in. */
22290 -       wait_queue_head_t expedited_wq;         /* Wait for check-ins. */
22291 +       struct swait_queue_head expedited_wq;   /* Wait for check-ins. */
22292         int ncpus_snap;                         /* # CPUs seen last time. */
22294         unsigned long jiffies_force_qs;         /* Time at which to invoke */
22295 @@ -556,18 +557,18 @@ extern struct list_head rcu_struct_flavors;
22296   */
22297  extern struct rcu_state rcu_sched_state;
22299 +#ifndef CONFIG_PREEMPT_RT_FULL
22300  extern struct rcu_state rcu_bh_state;
22301 +#endif
22303  #ifdef CONFIG_PREEMPT_RCU
22304  extern struct rcu_state rcu_preempt_state;
22305  #endif /* #ifdef CONFIG_PREEMPT_RCU */
22307 -#ifdef CONFIG_RCU_BOOST
22308  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
22309  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
22310  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
22311  DECLARE_PER_CPU(char, rcu_cpu_has_work);
22312 -#endif /* #ifdef CONFIG_RCU_BOOST */
22314  #ifndef RCU_TREE_NONCORE
22316 @@ -587,10 +588,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
22317  static void __init __rcu_init_preempt(void);
22318  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
22319  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
22320 -static void invoke_rcu_callbacks_kthread(void);
22321  static bool rcu_is_callbacks_kthread(void);
22322 +static void rcu_cpu_kthread_setup(unsigned int cpu);
22323  #ifdef CONFIG_RCU_BOOST
22324 -static void rcu_preempt_do_callbacks(void);
22325  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
22326                                                  struct rcu_node *rnp);
22327  #endif /* #ifdef CONFIG_RCU_BOOST */
22328 @@ -607,7 +607,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
22329  static void increment_cpu_stall_ticks(void);
22330  static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
22331  static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
22332 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
22333 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
22334 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
22335  static void rcu_init_one_nocb(struct rcu_node *rnp);
22336  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
22337                             bool lazy, unsigned long flags);
22338 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
22339 index 32cbe72bf545..45e3e3e02a5c 100644
22340 --- a/kernel/rcu/tree_plugin.h
22341 +++ b/kernel/rcu/tree_plugin.h
22342 @@ -24,25 +24,10 @@
22343   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
22344   */
22346 -#include <linux/delay.h>
22347 -#include <linux/gfp.h>
22348 -#include <linux/oom.h>
22349 -#include <linux/smpboot.h>
22350 -#include "../time/tick-internal.h"
22352  #ifdef CONFIG_RCU_BOOST
22354  #include "../locking/rtmutex_common.h"
22357 - * Control variables for per-CPU and per-rcu_node kthreads.  These
22358 - * handle all flavors of RCU.
22359 - */
22360 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
22361 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
22362 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
22363 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
22365  #else /* #ifdef CONFIG_RCU_BOOST */
22367  /*
22368 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
22370  #endif /* #else #ifdef CONFIG_RCU_BOOST */
22373 + * Control variables for per-CPU and per-rcu_node kthreads.  These
22374 + * handle all flavors of RCU.
22375 + */
22376 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
22377 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
22378 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
22380  #ifdef CONFIG_RCU_NOCB_CPU
22381  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
22382  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
22383 @@ -432,7 +425,7 @@ void rcu_read_unlock_special(struct task_struct *t)
22384         }
22386         /* Hardware IRQ handlers cannot block, complain if they get here. */
22387 -       if (in_irq() || in_serving_softirq()) {
22388 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
22389                 lockdep_rcu_suspicious(__FILE__, __LINE__,
22390                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
22391                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
22392 @@ -645,15 +638,6 @@ static void rcu_preempt_check_callbacks(void)
22393                 t->rcu_read_unlock_special.b.need_qs = true;
22396 -#ifdef CONFIG_RCU_BOOST
22398 -static void rcu_preempt_do_callbacks(void)
22400 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
22403 -#endif /* #ifdef CONFIG_RCU_BOOST */
22405  /*
22406   * Queue a preemptible-RCU callback for invocation after a grace period.
22407   */
22408 @@ -930,6 +914,19 @@ void exit_rcu(void)
22410  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
22412 +/*
22413 + * If boosting, set rcuc kthreads to realtime priority.
22414 + */
22415 +static void rcu_cpu_kthread_setup(unsigned int cpu)
22416 +{
22417 +#ifdef CONFIG_RCU_BOOST
22418 +       struct sched_param sp;
22419 +
22420 +       sp.sched_priority = kthread_prio;
22421 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
22422 +#endif /* #ifdef CONFIG_RCU_BOOST */
22423 +}
22424 +
22425  #ifdef CONFIG_RCU_BOOST
22427  #include "../locking/rtmutex_common.h"
22428 @@ -961,16 +958,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
22430  #endif /* #else #ifdef CONFIG_RCU_TRACE */
22432 -static void rcu_wake_cond(struct task_struct *t, int status)
22434 -       /*
22435 -        * If the thread is yielding, only wake it when this
22436 -        * is invoked from idle
22437 -        */
22438 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
22439 -               wake_up_process(t);
22442  /*
22443   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
22444   * or ->boost_tasks, advancing the pointer to the next task in the
22445 @@ -1114,23 +1101,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
22446         }
22450 - * Wake up the per-CPU kthread to invoke RCU callbacks.
22451 - */
22452 -static void invoke_rcu_callbacks_kthread(void)
22454 -       unsigned long flags;
22456 -       local_irq_save(flags);
22457 -       __this_cpu_write(rcu_cpu_has_work, 1);
22458 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
22459 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
22460 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
22461 -                             __this_cpu_read(rcu_cpu_kthread_status));
22462 -       }
22463 -       local_irq_restore(flags);
22466  /*
22467   * Is the current CPU running the RCU-callbacks kthread?
22468   * Caller must have preemption disabled.
22469 @@ -1186,67 +1156,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
22470         return 0;
22473 -static void rcu_kthread_do_work(void)
22475 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
22476 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
22477 -       rcu_preempt_do_callbacks();
22480 -static void rcu_cpu_kthread_setup(unsigned int cpu)
22482 -       struct sched_param sp;
22484 -       sp.sched_priority = kthread_prio;
22485 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
22488 -static void rcu_cpu_kthread_park(unsigned int cpu)
22490 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
22493 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
22495 -       return __this_cpu_read(rcu_cpu_has_work);
22499 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
22500 - * RCU softirq used in flavors and configurations of RCU that do not
22501 - * support RCU priority boosting.
22502 - */
22503 -static void rcu_cpu_kthread(unsigned int cpu)
22505 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
22506 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
22507 -       int spincnt;
22509 -       for (spincnt = 0; spincnt < 10; spincnt++) {
22510 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
22511 -               local_bh_disable();
22512 -               *statusp = RCU_KTHREAD_RUNNING;
22513 -               this_cpu_inc(rcu_cpu_kthread_loops);
22514 -               local_irq_disable();
22515 -               work = *workp;
22516 -               *workp = 0;
22517 -               local_irq_enable();
22518 -               if (work)
22519 -                       rcu_kthread_do_work();
22520 -               local_bh_enable();
22521 -               if (*workp == 0) {
22522 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
22523 -                       *statusp = RCU_KTHREAD_WAITING;
22524 -                       return;
22525 -               }
22526 -       }
22527 -       *statusp = RCU_KTHREAD_YIELDING;
22528 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
22529 -       schedule_timeout_interruptible(2);
22530 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
22531 -       *statusp = RCU_KTHREAD_WAITING;
22534  /*
22535   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
22536   * served by the rcu_node in question.  The CPU hotplug lock is still
22537 @@ -1276,26 +1185,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
22538         free_cpumask_var(cm);
22541 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
22542 -       .store                  = &rcu_cpu_kthread_task,
22543 -       .thread_should_run      = rcu_cpu_kthread_should_run,
22544 -       .thread_fn              = rcu_cpu_kthread,
22545 -       .thread_comm            = "rcuc/%u",
22546 -       .setup                  = rcu_cpu_kthread_setup,
22547 -       .park                   = rcu_cpu_kthread_park,
22550  /*
22551   * Spawn boost kthreads -- called as soon as the scheduler is running.
22552   */
22553  static void __init rcu_spawn_boost_kthreads(void)
22555         struct rcu_node *rnp;
22556 -       int cpu;
22558 -       for_each_possible_cpu(cpu)
22559 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
22560 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
22561         rcu_for_each_leaf_node(rcu_state_p, rnp)
22562                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
22564 @@ -1318,11 +1213,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
22565         raw_spin_unlock_irqrestore(&rnp->lock, flags);
22568 -static void invoke_rcu_callbacks_kthread(void)
22570 -       WARN_ON_ONCE(1);
22573  static bool rcu_is_callbacks_kthread(void)
22575         return false;
22576 @@ -1346,7 +1236,7 @@ static void rcu_prepare_kthreads(int cpu)
22578  #endif /* #else #ifdef CONFIG_RCU_BOOST */
22580 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
22581 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
22583  /*
22584   * Check to see if any future RCU-related work will need to be done
22585 @@ -1363,7 +1253,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
22586         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
22587                ? 0 : rcu_cpu_has_callbacks(NULL);
22589 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
22591 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
22592  /*
22593   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
22594   * after it.
22595 @@ -1459,6 +1351,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
22596         return cbs_ready;
22599 +#ifndef CONFIG_PREEMPT_RT_FULL
22601  /*
22602   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
22603   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
22604 @@ -1504,6 +1398,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
22605         *nextevt = basemono + dj * TICK_NSEC;
22606         return 0;
22608 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
22610  /*
22611   * Prepare a CPU for idle from an RCU perspective.  The first major task
22612 @@ -1822,9 +1717,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
22613   * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
22614   * grace period.
22615   */
22616 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
22617 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
22618  {
22619 -       wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
22620 +       swake_up_all(sq);
22621  }
22623  /*
22624 @@ -1840,10 +1735,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
22625         rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
22628 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
22630 +       return &rnp->nocb_gp_wq[rnp->completed & 0x1];
22633  static void rcu_init_one_nocb(struct rcu_node *rnp)
22635 -       init_waitqueue_head(&rnp->nocb_gp_wq[0]);
22636 -       init_waitqueue_head(&rnp->nocb_gp_wq[1]);
22637 +       init_swait_queue_head(&rnp->nocb_gp_wq[0]);
22638 +       init_swait_queue_head(&rnp->nocb_gp_wq[1]);
22641  #ifndef CONFIG_RCU_NOCB_CPU_ALL
22642 @@ -1868,7 +1768,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
22643         if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
22644                 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
22645                 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
22646 -               wake_up(&rdp_leader->nocb_wq);
22647 +               swake_up(&rdp_leader->nocb_wq);
22648         }
22651 @@ -2081,7 +1981,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
22652          */
22653         trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
22654         for (;;) {
22655 -               wait_event_interruptible(
22656 +               swait_event_interruptible(
22657                         rnp->nocb_gp_wq[c & 0x1],
22658                         (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
22659                 if (likely(d))
22660 @@ -2109,7 +2009,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
22661         /* Wait for callbacks to appear. */
22662         if (!rcu_nocb_poll) {
22663                 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
22664 -               wait_event_interruptible(my_rdp->nocb_wq,
22665 +               swait_event_interruptible(my_rdp->nocb_wq,
22666                                 !READ_ONCE(my_rdp->nocb_leader_sleep));
22667                 /* Memory barrier handled by smp_mb() calls below and repoll. */
22668         } else if (firsttime) {
22669 @@ -2184,7 +2084,7 @@ static void nocb_leader_wait(struct rcu_data *my_rdp)
22670                          * List was empty, wake up the follower.
22671                          * Memory barriers supplied by atomic_long_add().
22672                          */
22673 -                       wake_up(&rdp->nocb_wq);
22674 +                       swake_up(&rdp->nocb_wq);
22675                 }
22676         }
22678 @@ -2205,7 +2105,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
22679                 if (!rcu_nocb_poll) {
22680                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
22681                                             "FollowerSleep");
22682 -                       wait_event_interruptible(rdp->nocb_wq,
22683 +                       swait_event_interruptible(rdp->nocb_wq,
22684                                                  READ_ONCE(rdp->nocb_follower_head));
22685                 } else if (firsttime) {
22686                         /* Don't drown trace log with "Poll"! */
22687 @@ -2365,7 +2265,7 @@ void __init rcu_init_nohz(void)
22688  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
22690         rdp->nocb_tail = &rdp->nocb_head;
22691 -       init_waitqueue_head(&rdp->nocb_wq);
22692 +       init_swait_queue_head(&rdp->nocb_wq);
22693         rdp->nocb_follower_tail = &rdp->nocb_follower_head;
22696 @@ -2515,7 +2415,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
22697         return false;
22700 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
22701 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
22702  {
22703  }
22705 @@ -2523,6 +2423,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
22709 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
22710 +{
22711 +       return NULL;
22712 +}
22714  static void rcu_init_one_nocb(struct rcu_node *rnp)
22717 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
22718 index 5f748c5a40f0..9a3904603ff6 100644
22719 --- a/kernel/rcu/update.c
22720 +++ b/kernel/rcu/update.c
22721 @@ -276,6 +276,7 @@ int rcu_read_lock_held(void)
22723  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
22725 +#ifndef CONFIG_PREEMPT_RT_FULL
22726  /**
22727   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
22728   *
22729 @@ -302,6 +303,7 @@ int rcu_read_lock_bh_held(void)
22730         return in_softirq() || irqs_disabled();
22732  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
22733 +#endif
22735  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
22737 diff --git a/kernel/relay.c b/kernel/relay.c
22738 index 0b4570cfacae..60684be39f22 100644
22739 --- a/kernel/relay.c
22740 +++ b/kernel/relay.c
22741 @@ -336,6 +336,10 @@ static void wakeup_readers(unsigned long data)
22743         struct rchan_buf *buf = (struct rchan_buf *)data;
22744         wake_up_interruptible(&buf->read_wait);
22745 +       /*
22746 +        * Stupid polling for now:
22747 +        */
22748 +       mod_timer(&buf->timer, jiffies + 1);
22751  /**
22752 @@ -353,6 +357,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
22753                 init_waitqueue_head(&buf->read_wait);
22754                 kref_init(&buf->kref);
22755                 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
22756 +               mod_timer(&buf->timer, jiffies + 1);
22757         } else
22758                 del_timer_sync(&buf->timer);
22760 @@ -736,15 +741,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
22761                 else
22762                         buf->early_bytes += buf->chan->subbuf_size -
22763                                             buf->padding[old_subbuf];
22764 -               smp_mb();
22765 -               if (waitqueue_active(&buf->read_wait))
22766 -                       /*
22767 -                        * Calling wake_up_interruptible() from here
22768 -                        * will deadlock if we happen to be logging
22769 -                        * from the scheduler (trying to re-grab
22770 -                        * rq->lock), so defer it.
22771 -                        */
22772 -                       mod_timer(&buf->timer, jiffies + 1);
22773         }
22775         old = buf->data;
22776 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
22777 index 67687973ce80..01b9994b367a 100644
22778 --- a/kernel/sched/Makefile
22779 +++ b/kernel/sched/Makefile
22780 @@ -13,7 +13,7 @@ endif
22782  obj-y += core.o loadavg.o clock.o cputime.o
22783  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
22784 -obj-y += wait.o completion.o idle.o
22785 +obj-y += wait.o swait.o swork.o completion.o idle.o
22786  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
22787  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
22788  obj-$(CONFIG_SCHEDSTATS) += stats.o
22789 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
22790 index 8d0f35debf35..b62cf6400fe0 100644
22791 --- a/kernel/sched/completion.c
22792 +++ b/kernel/sched/completion.c
22793 @@ -30,10 +30,10 @@ void complete(struct completion *x)
22795         unsigned long flags;
22797 -       spin_lock_irqsave(&x->wait.lock, flags);
22798 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
22799         x->done++;
22800 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
22801 -       spin_unlock_irqrestore(&x->wait.lock, flags);
22802 +       swake_up_locked(&x->wait);
22803 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22805  EXPORT_SYMBOL(complete);
22807 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
22809         unsigned long flags;
22811 -       spin_lock_irqsave(&x->wait.lock, flags);
22812 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
22813         x->done += UINT_MAX/2;
22814 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
22815 -       spin_unlock_irqrestore(&x->wait.lock, flags);
22816 +       swake_up_all_locked(&x->wait);
22817 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22819  EXPORT_SYMBOL(complete_all);
22821 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
22822                    long (*action)(long), long timeout, int state)
22824         if (!x->done) {
22825 -               DECLARE_WAITQUEUE(wait, current);
22826 +               DECLARE_SWAITQUEUE(wait);
22828 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
22829 +               __prepare_to_swait(&x->wait, &wait);
22830                 do {
22831                         if (signal_pending_state(state, current)) {
22832                                 timeout = -ERESTARTSYS;
22833                                 break;
22834                         }
22835                         __set_current_state(state);
22836 -                       spin_unlock_irq(&x->wait.lock);
22837 +                       raw_spin_unlock_irq(&x->wait.lock);
22838                         timeout = action(timeout);
22839 -                       spin_lock_irq(&x->wait.lock);
22840 +                       raw_spin_lock_irq(&x->wait.lock);
22841                 } while (!x->done && timeout);
22842 -               __remove_wait_queue(&x->wait, &wait);
22843 +               __finish_swait(&x->wait, &wait);
22844                 if (!x->done)
22845                         return timeout;
22846         }
22847 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
22849         might_sleep();
22851 -       spin_lock_irq(&x->wait.lock);
22852 +       raw_spin_lock_irq(&x->wait.lock);
22853         timeout = do_wait_for_common(x, action, timeout, state);
22854 -       spin_unlock_irq(&x->wait.lock);
22855 +       raw_spin_unlock_irq(&x->wait.lock);
22856         return timeout;
22859 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
22860         if (!READ_ONCE(x->done))
22861                 return 0;
22863 -       spin_lock_irqsave(&x->wait.lock, flags);
22864 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
22865         if (!x->done)
22866                 ret = 0;
22867         else
22868                 x->done--;
22869 -       spin_unlock_irqrestore(&x->wait.lock, flags);
22870 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22871         return ret;
22873  EXPORT_SYMBOL(try_wait_for_completion);
22874 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
22875          * after it's acquired the lock.
22876          */
22877         smp_rmb();
22878 -       spin_unlock_wait(&x->wait.lock);
22879 +       raw_spin_unlock_wait(&x->wait.lock);
22880         return true;
22882  EXPORT_SYMBOL(completion_done);
22883 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
22884 index 9d6b3d869592..3e2e4c3de066 100644
22885 --- a/kernel/sched/core.c
22886 +++ b/kernel/sched/core.c
22887 @@ -260,7 +260,11 @@ late_initcall(sched_init_debug);
22888   * Number of tasks to iterate in a single balance run.
22889   * Limited because this is done with IRQs disabled.
22890   */
22891 +#ifndef CONFIG_PREEMPT_RT_FULL
22892  const_debug unsigned int sysctl_sched_nr_migrate = 32;
22893 +#else
22894 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
22895 +#endif
22897  /*
22898   * period over which we average the RT time consumption, measured
22899 @@ -438,6 +442,7 @@ static void init_rq_hrtick(struct rq *rq)
22901         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22902         rq->hrtick_timer.function = hrtick;
22903 +       rq->hrtick_timer.irqsafe = 1;
22905  #else  /* CONFIG_SCHED_HRTICK */
22906  static inline void hrtick_clear(struct rq *rq)
22907 @@ -518,9 +523,15 @@ static bool set_nr_if_polling(struct task_struct *p)
22908  #endif
22909  #endif
22911 -void wake_q_add(struct wake_q_head *head, struct task_struct *task)
22912 +void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
22913 +                 bool sleeper)
22915 -       struct wake_q_node *node = &task->wake_q;
22916 +       struct wake_q_node *node;
22918 +       if (sleeper)
22919 +               node = &task->wake_q_sleeper;
22920 +       else
22921 +               node = &task->wake_q;
22923         /*
22924          * Atomically grab the task, if ->wake_q is !nil already it means
22925 @@ -542,24 +553,33 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
22926         head->lastp = &node->next;
22929 -void wake_up_q(struct wake_q_head *head)
22930 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
22932         struct wake_q_node *node = head->first;
22934         while (node != WAKE_Q_TAIL) {
22935                 struct task_struct *task;
22937 -               task = container_of(node, struct task_struct, wake_q);
22938 +               if (sleeper)
22939 +                       task = container_of(node, struct task_struct, wake_q_sleeper);
22940 +               else
22941 +                       task = container_of(node, struct task_struct, wake_q);
22942                 BUG_ON(!task);
22943                 /* task can safely be re-inserted now */
22944                 node = node->next;
22945 -               task->wake_q.next = NULL;
22946 +               if (sleeper)
22947 +                       task->wake_q_sleeper.next = NULL;
22948 +               else
22949 +                       task->wake_q.next = NULL;
22951                 /*
22952                  * wake_up_process() implies a wmb() to pair with the queueing
22953                  * in wake_q_add() so as not to miss wakeups.
22954                  */
22955 -               wake_up_process(task);
22956 +               if (sleeper)
22957 +                       wake_up_lock_sleeper(task);
22958 +               else
22959 +                       wake_up_process(task);
22960                 put_task_struct(task);
22961         }
22963 @@ -595,6 +615,38 @@ void resched_curr(struct rq *rq)
22964                 trace_sched_wake_idle_without_ipi(cpu);
22967 +#ifdef CONFIG_PREEMPT_LAZY
22968 +void resched_curr_lazy(struct rq *rq)
22970 +       struct task_struct *curr = rq->curr;
22971 +       int cpu;
22973 +       if (!sched_feat(PREEMPT_LAZY)) {
22974 +               resched_curr(rq);
22975 +               return;
22976 +       }
22978 +       lockdep_assert_held(&rq->lock);
22980 +       if (test_tsk_need_resched(curr))
22981 +               return;
22983 +       if (test_tsk_need_resched_lazy(curr))
22984 +               return;
22986 +       set_tsk_need_resched_lazy(curr);
22988 +       cpu = cpu_of(rq);
22989 +       if (cpu == smp_processor_id())
22990 +               return;
22992 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
22993 +       smp_mb();
22994 +       if (!tsk_is_polling(curr))
22995 +               smp_send_reschedule(cpu);
22997 +#endif
22999  void resched_cpu(int cpu)
23001         struct rq *rq = cpu_rq(cpu);
23002 @@ -617,11 +669,14 @@ void resched_cpu(int cpu)
23003   */
23004  int get_nohz_timer_target(void)
23006 -       int i, cpu = smp_processor_id();
23007 +       int i, cpu;
23008         struct sched_domain *sd;
23010 +       preempt_disable_rt();
23011 +       cpu = smp_processor_id();
23013         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
23014 -               return cpu;
23015 +               goto preempt_en_rt;
23017         rcu_read_lock();
23018         for_each_domain(cpu, sd) {
23019 @@ -640,6 +695,8 @@ int get_nohz_timer_target(void)
23020                 cpu = housekeeping_any_cpu();
23021  unlock:
23022         rcu_read_unlock();
23023 +preempt_en_rt:
23024 +       preempt_enable_rt();
23025         return cpu;
23027  /*
23028 @@ -1166,7 +1223,8 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
23029         p->nr_cpus_allowed = cpumask_weight(new_mask);
23032 -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
23033 +static void __do_set_cpus_allowed_tail(struct task_struct *p,
23034 +                                      const struct cpumask *new_mask)
23036         struct rq *rq = task_rq(p);
23037         bool queued, running;
23038 @@ -1195,6 +1253,98 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
23039                 enqueue_task(rq, p, ENQUEUE_RESTORE);
23042 +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
23044 +       if (__migrate_disabled(p)) {
23045 +               lockdep_assert_held(&p->pi_lock);
23047 +               cpumask_copy(&p->cpus_allowed, new_mask);
23048 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
23049 +               p->migrate_disable_update = 1;
23050 +#endif
23051 +               return;
23052 +       }
23053 +       __do_set_cpus_allowed_tail(p, new_mask);
23056 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
23057 +static DEFINE_MUTEX(sched_down_mutex);
23058 +static cpumask_t sched_down_cpumask;
23060 +void tell_sched_cpu_down_begin(int cpu)
23062 +       mutex_lock(&sched_down_mutex);
23063 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
23064 +       mutex_unlock(&sched_down_mutex);
23067 +void tell_sched_cpu_down_done(int cpu)
23069 +       mutex_lock(&sched_down_mutex);
23070 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
23071 +       mutex_unlock(&sched_down_mutex);
23074 +/**
23075 + * migrate_me - try to move the current task off this cpu
23076 + *
23077 + * Used by the pin_current_cpu() code to try to get tasks
23078 + * to move off the current CPU as it is going down.
23079 + * It will only move the task if the task isn't pinned to
23080 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
23081 + * and the task has to be in a RUNNING state. Otherwise the
23082 + * movement of the task will wake it up (change its state
23083 + * to running) when the task did not expect it.
23084 + *
23085 + * Returns 1 if it succeeded in moving the current task
23086 + *         0 otherwise.
23087 + */
23088 +int migrate_me(void)
23090 +       struct task_struct *p = current;
23091 +       struct migration_arg arg;
23092 +       struct cpumask *cpumask;
23093 +       struct cpumask *mask;
23094 +       unsigned long flags;
23095 +       unsigned int dest_cpu;
23096 +       struct rq *rq;
23098 +       /*
23099 +        * We cannot migrate tasks that are bound to a CPU or that are not
23100 +        * running: moving such a task would wake it up unexpectedly.
23101 +        */
23102 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
23103 +               return 0;
23105 +       mutex_lock(&sched_down_mutex);
23106 +       rq = task_rq_lock(p, &flags);
23108 +       cpumask = this_cpu_ptr(&sched_cpumasks);
23109 +       mask = &p->cpus_allowed;
23111 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
23113 +       if (!cpumask_weight(cpumask)) {
23114 +               /* It's only on this CPU? */
23115 +               task_rq_unlock(rq, p, &flags);
23116 +               mutex_unlock(&sched_down_mutex);
23117 +               return 0;
23118 +       }
23120 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
23122 +       arg.task = p;
23123 +       arg.dest_cpu = dest_cpu;
23125 +       task_rq_unlock(rq, p, &flags);
23127 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
23128 +       tlb_migrate_finish(p->mm);
23129 +       mutex_unlock(&sched_down_mutex);
23131 +       return 1;
23134  /*
23135   * Change a given task's CPU affinity. Migrate the thread to a
23136   * proper CPU and schedule it away if the CPU it's executing on
23137 @@ -1234,7 +1384,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
23138         do_set_cpus_allowed(p, new_mask);
23140         /* Can the task run on the task's current CPU? If so, we're done */
23141 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
23142 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
23143                 goto out;
23145         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
23146 @@ -1410,6 +1560,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
23147         return ret;
23150 +static bool check_task_state(struct task_struct *p, long match_state)
23152 +       bool match = false;
23154 +       raw_spin_lock_irq(&p->pi_lock);
23155 +       if (p->state == match_state || p->saved_state == match_state)
23156 +               match = true;
23157 +       raw_spin_unlock_irq(&p->pi_lock);
23159 +       return match;
23162  /*
23163   * wait_task_inactive - wait for a thread to unschedule.
23164   *
23165 @@ -1454,7 +1616,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
23166                  * is actually now running somewhere else!
23167                  */
23168                 while (task_running(rq, p)) {
23169 -                       if (match_state && unlikely(p->state != match_state))
23170 +                       if (match_state && !check_task_state(p, match_state))
23171                                 return 0;
23172                         cpu_relax();
23173                 }
23174 @@ -1469,7 +1631,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
23175                 running = task_running(rq, p);
23176                 queued = task_on_rq_queued(p);
23177                 ncsw = 0;
23178 -               if (!match_state || p->state == match_state)
23179 +               if (!match_state || p->state == match_state ||
23180 +                   p->saved_state == match_state)
23181                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
23182                 task_rq_unlock(rq, p, &flags);
23184 @@ -1626,7 +1789,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
23186         lockdep_assert_held(&p->pi_lock);
23188 -       if (p->nr_cpus_allowed > 1)
23189 +       if (tsk_nr_cpus_allowed(p) > 1)
23190                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
23192         /*
23193 @@ -1706,10 +1869,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
23195         activate_task(rq, p, en_flags);
23196         p->on_rq = TASK_ON_RQ_QUEUED;
23198 -       /* if a worker is waking up, notify workqueue */
23199 -       if (p->flags & PF_WQ_WORKER)
23200 -               wq_worker_waking_up(p, cpu_of(rq));
23203  /*
23204 @@ -1936,8 +2095,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
23205          */
23206         smp_mb__before_spinlock();
23207         raw_spin_lock_irqsave(&p->pi_lock, flags);
23208 -       if (!(p->state & state))
23209 +       if (!(p->state & state)) {
23210 +               /*
23211 +                * The task might be running due to a spinlock sleeper
23212 +                * wakeup. Check the saved state and set it to running
23213 +                * if the wakeup condition is true.
23214 +                */
23215 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
23216 +                       if (p->saved_state & state) {
23217 +                               p->saved_state = TASK_RUNNING;
23218 +                               success = 1;
23219 +                       }
23220 +               }
23221                 goto out;
23222 +       }
23224 +       /*
23225 +        * If this is a regular wakeup, then we can unconditionally
23226 +        * clear the saved state of a "lock sleeper".
23227 +        */
23228 +       if (!(wake_flags & WF_LOCK_SLEEPER))
23229 +               p->saved_state = TASK_RUNNING;
23231         trace_sched_waking(p);
23233 @@ -2028,52 +2206,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
23234         return success;
23237 -/**
23238 - * try_to_wake_up_local - try to wake up a local task with rq lock held
23239 - * @p: the thread to be awakened
23240 - *
23241 - * Put @p on the run-queue if it's not already there. The caller must
23242 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
23243 - * the current task.
23244 - */
23245 -static void try_to_wake_up_local(struct task_struct *p)
23247 -       struct rq *rq = task_rq(p);
23249 -       if (WARN_ON_ONCE(rq != this_rq()) ||
23250 -           WARN_ON_ONCE(p == current))
23251 -               return;
23253 -       lockdep_assert_held(&rq->lock);
23255 -       if (!raw_spin_trylock(&p->pi_lock)) {
23256 -               /*
23257 -                * This is OK, because current is on_cpu, which avoids it being
23258 -                * picked for load-balance and preemption/IRQs are still
23259 -                * disabled avoiding further scheduler activity on it and we've
23260 -                * not yet picked a replacement task.
23261 -                */
23262 -               lockdep_unpin_lock(&rq->lock);
23263 -               raw_spin_unlock(&rq->lock);
23264 -               raw_spin_lock(&p->pi_lock);
23265 -               raw_spin_lock(&rq->lock);
23266 -               lockdep_pin_lock(&rq->lock);
23267 -       }
23269 -       if (!(p->state & TASK_NORMAL))
23270 -               goto out;
23272 -       trace_sched_waking(p);
23274 -       if (!task_on_rq_queued(p))
23275 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
23277 -       ttwu_do_wakeup(rq, p, 0);
23278 -       ttwu_stat(p, smp_processor_id(), 0);
23279 -out:
23280 -       raw_spin_unlock(&p->pi_lock);
23283  /**
23284   * wake_up_process - Wake up a specific process
23285   * @p: The process to be woken up.
23286 @@ -2092,6 +2224,18 @@ int wake_up_process(struct task_struct *p)
23288  EXPORT_SYMBOL(wake_up_process);
23290 +/**
23291 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
23292 + * @p: The process to be woken up.
23293 + *
23294 + * Same as wake_up_process() above, but with wake_flags=WF_LOCK_SLEEPER to indicate
23295 + * the nature of the wakeup.
23296 + */
23297 +int wake_up_lock_sleeper(struct task_struct *p)
23299 +       return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
23302  int wake_up_state(struct task_struct *p, unsigned int state)
23304         return try_to_wake_up(p, state, 0);
23305 @@ -2278,6 +2422,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
23306         p->on_cpu = 0;
23307  #endif
23308         init_task_preempt_count(p);
23309 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
23310 +       task_thread_info(p)->preempt_lazy_count = 0;
23311 +#endif
23312  #ifdef CONFIG_SMP
23313         plist_node_init(&p->pushable_tasks, MAX_PRIO);
23314         RB_CLEAR_NODE(&p->pushable_dl_tasks);
23315 @@ -2602,8 +2749,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
23316         finish_arch_post_lock_switch();
23318         fire_sched_in_preempt_notifiers(current);
23319 +       /*
23320 +        * We use mmdrop_delayed() here so we don't have to do the
23321 +        * full __mmdrop() when we are the last user.
23322 +        */
23323         if (mm)
23324 -               mmdrop(mm);
23325 +               mmdrop_delayed(mm);
23326         if (unlikely(prev_state == TASK_DEAD)) {
23327                 if (prev->sched_class->task_dead)
23328                         prev->sched_class->task_dead(prev);
23329 @@ -2934,16 +3085,6 @@ u64 scheduler_tick_max_deferment(void)
23331  #endif
23333 -notrace unsigned long get_parent_ip(unsigned long addr)
23335 -       if (in_lock_functions(addr)) {
23336 -               addr = CALLER_ADDR2;
23337 -               if (in_lock_functions(addr))
23338 -                       addr = CALLER_ADDR3;
23339 -       }
23340 -       return addr;
23343  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
23344                                 defined(CONFIG_PREEMPT_TRACER))
23346 @@ -2965,7 +3106,7 @@ void preempt_count_add(int val)
23347                                 PREEMPT_MASK - 10);
23348  #endif
23349         if (preempt_count() == val) {
23350 -               unsigned long ip = get_parent_ip(CALLER_ADDR1);
23351 +               unsigned long ip = get_lock_parent_ip();
23352  #ifdef CONFIG_DEBUG_PREEMPT
23353                 current->preempt_disable_ip = ip;
23354  #endif
23355 @@ -2992,7 +3133,7 @@ void preempt_count_sub(int val)
23356  #endif
23358         if (preempt_count() == val)
23359 -               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
23360 +               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
23361         __preempt_count_sub(val);
23363  EXPORT_SYMBOL(preempt_count_sub);
23364 @@ -3047,6 +3188,114 @@ static inline void schedule_debug(struct task_struct *prev)
23365         schedstat_inc(this_rq(), sched_count);
23368 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
23370 +void migrate_disable(void)
23372 +       struct task_struct *p = current;
23374 +       if (in_atomic() || irqs_disabled()) {
23375 +#ifdef CONFIG_SCHED_DEBUG
23376 +               p->migrate_disable_atomic++;
23377 +#endif
23378 +               return;
23379 +       }
23381 +#ifdef CONFIG_SCHED_DEBUG
23382 +       if (unlikely(p->migrate_disable_atomic)) {
23383 +               tracing_off();
23384 +               WARN_ON_ONCE(1);
23385 +       }
23386 +#endif
23388 +       if (p->migrate_disable) {
23389 +               p->migrate_disable++;
23390 +               return;
23391 +       }
23393 +       preempt_disable();
23394 +       preempt_lazy_disable();
23395 +       pin_current_cpu();
23396 +       p->migrate_disable = 1;
23397 +       preempt_enable();
23399 +EXPORT_SYMBOL(migrate_disable);
23401 +void migrate_enable(void)
23403 +       struct task_struct *p = current;
23405 +       if (in_atomic() || irqs_disabled()) {
23406 +#ifdef CONFIG_SCHED_DEBUG
23407 +               p->migrate_disable_atomic--;
23408 +#endif
23409 +               return;
23410 +       }
23412 +#ifdef CONFIG_SCHED_DEBUG
23413 +       if (unlikely(p->migrate_disable_atomic)) {
23414 +               tracing_off();
23415 +               WARN_ON_ONCE(1);
23416 +       }
23417 +#endif
23418 +       WARN_ON_ONCE(p->migrate_disable <= 0);
23420 +       if (p->migrate_disable > 1) {
23421 +               p->migrate_disable--;
23422 +               return;
23423 +       }
23425 +       preempt_disable();
23426 +       /*
23427 +        * Clearing migrate_disable causes tsk_cpus_allowed to
23428 +        * show the task's original cpu affinity.
23429 +        */
23430 +       p->migrate_disable = 0;
23432 +       if (p->migrate_disable_update) {
23433 +               unsigned long flags;
23434 +               struct rq *rq;
23436 +               rq = task_rq_lock(p, &flags);
23437 +               update_rq_clock(rq);
23439 +               __do_set_cpus_allowed_tail(p, &p->cpus_allowed);
23440 +               task_rq_unlock(rq, p, &flags);
23442 +               p->migrate_disable_update = 0;
23444 +               WARN_ON(smp_processor_id() != task_cpu(p));
23445 +               if (!cpumask_test_cpu(task_cpu(p), &p->cpus_allowed)) {
23446 +                       const struct cpumask *cpu_valid_mask = cpu_active_mask;
23447 +                       struct migration_arg arg;
23448 +                       unsigned int dest_cpu;
23450 +                       if (p->flags & PF_KTHREAD) {
23451 +                               /*
23452 +                                * Kernel threads are allowed on online && !active CPUs
23453 +                                */
23454 +                               cpu_valid_mask = cpu_online_mask;
23455 +                       }
23456 +                       dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_allowed);
23457 +                       arg.task = p;
23458 +                       arg.dest_cpu = dest_cpu;
23460 +                       unpin_current_cpu();
23461 +                       preempt_lazy_enable();
23462 +                       preempt_enable();
23463 +                       stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
23464 +                       tlb_migrate_finish(p->mm);
23465 +                       return;
23466 +               }
23467 +       }
23469 +       unpin_current_cpu();
23470 +       preempt_enable();
23471 +       preempt_lazy_enable();
23473 +EXPORT_SYMBOL(migrate_enable);
23474 +#endif
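
[Editorial note, not part of the patch: a minimal usage sketch of the migrate_disable()/migrate_enable() pair added above. On PREEMPT_RT_FULL these pin the current task to its CPU while leaving preemption enabled, which is the RT-friendly way to work on per-CPU data where plain preempt_disable() would be too heavy. The per-CPU structure and function names below are invented for illustration; the declarations are assumed to come from <linux/preempt.h> and <linux/percpu.h> in the RT tree.]

    #include <linux/preempt.h>      /* migrate_disable()/migrate_enable() on RT */
    #include <linux/percpu.h>

    struct my_rt_stats {            /* hypothetical per-CPU record */
            unsigned long events;
            unsigned long bytes;
    };
    static DEFINE_PER_CPU(struct my_rt_stats, my_rt_stats);

    static void my_rt_account(unsigned long len)
    {
            struct my_rt_stats *s;

            /*
             * Stay on this CPU for the duration of the update, but remain
             * preemptible: a higher-priority RT task can still run here.
             * This only prevents migration; if other tasks on the same CPU
             * also touch the record, a separate (local) lock is still needed.
             */
            migrate_disable();
            s = this_cpu_ptr(&my_rt_stats);
            s->events++;
            s->bytes += len;
            migrate_enable();
    }
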
23476  /*
23477   * Pick up the highest-prio task:
23478   */
23479 @@ -3171,19 +3420,6 @@ static void __sched notrace __schedule(bool preempt)
23480                 } else {
23481                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
23482                         prev->on_rq = 0;
23484 -                       /*
23485 -                        * If a worker went to sleep, notify and ask workqueue
23486 -                        * whether it wants to wake up a task to maintain
23487 -                        * concurrency.
23488 -                        */
23489 -                       if (prev->flags & PF_WQ_WORKER) {
23490 -                               struct task_struct *to_wakeup;
23492 -                               to_wakeup = wq_worker_sleeping(prev, cpu);
23493 -                               if (to_wakeup)
23494 -                                       try_to_wake_up_local(to_wakeup);
23495 -                       }
23496                 }
23497                 switch_count = &prev->nvcsw;
23498         }
23499 @@ -3193,6 +3429,7 @@ static void __sched notrace __schedule(bool preempt)
23501         next = pick_next_task(rq, prev);
23502         clear_tsk_need_resched(prev);
23503 +       clear_tsk_need_resched_lazy(prev);
23504         clear_preempt_need_resched();
23505         rq->clock_skip_update = 0;
23507 @@ -3214,8 +3451,19 @@ static void __sched notrace __schedule(bool preempt)
23509  static inline void sched_submit_work(struct task_struct *tsk)
23511 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
23512 +       if (!tsk->state)
23513                 return;
23514 +       /*
23515 +        * If a worker went to sleep, notify and ask workqueue whether
23516 +        * it wants to wake up a task to maintain concurrency.
23517 +        */
23518 +       if (tsk->flags & PF_WQ_WORKER)
23519 +               wq_worker_sleeping(tsk);
23522 +       if (tsk_is_pi_blocked(tsk))
23523 +               return;
23525         /*
23526          * If we are going to sleep and we have plugged IO queued,
23527          * make sure to submit it to avoid deadlocks.
23528 @@ -3224,6 +3472,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
23529                 blk_schedule_flush_plug(tsk);
23532 +static void sched_update_worker(struct task_struct *tsk)
23534 +       if (tsk->flags & PF_WQ_WORKER)
23535 +               wq_worker_running(tsk);
23538  asmlinkage __visible void __sched schedule(void)
23540         struct task_struct *tsk = current;
23541 @@ -3234,6 +3488,7 @@ asmlinkage __visible void __sched schedule(void)
23542                 __schedule(false);
23543                 sched_preempt_enable_no_resched();
23544         } while (need_resched());
23545 +       sched_update_worker(tsk);
23547  EXPORT_SYMBOL(schedule);
23549 @@ -3282,6 +3537,30 @@ static void __sched notrace preempt_schedule_common(void)
23550         } while (need_resched());
23553 +#ifdef CONFIG_PREEMPT_LAZY
23555 + * If TIF_NEED_RESCHED is set, then we allow being scheduled away since that
23556 + * is set by an RT task. Otherwise we try to avoid being scheduled out as long
23557 + * as the preempt_lazy_count counter is > 0.
23558 + */
23559 +static __always_inline int preemptible_lazy(void)
23561 +       if (test_thread_flag(TIF_NEED_RESCHED))
23562 +               return 1;
23563 +       if (current_thread_info()->preempt_lazy_count)
23564 +               return 0;
23565 +       return 1;
23568 +#else
23570 +static inline int preemptible_lazy(void)
23572 +       return 1;
23575 +#endif
23577  #ifdef CONFIG_PREEMPT
23578  /*
23579   * this is the entry point to schedule() from in-kernel preemption
23580 @@ -3296,6 +3575,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
23581          */
23582         if (likely(!preemptible()))
23583                 return;
23584 +       if (!preemptible_lazy())
23585 +               return;
23587         preempt_schedule_common();
23589 @@ -3322,6 +3603,8 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
23591         if (likely(!preemptible()))
23592                 return;
23593 +       if (!preemptible_lazy())
23594 +               return;
23596         do {
23597                 preempt_disable_notrace();
23598 @@ -3331,7 +3614,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
23599                  * an infinite recursion.
23600                  */
23601                 prev_ctx = exception_enter();
23602 +               /*
23603 +                * The add/subtract must not be traced by the function
23604 +                * tracer. But we still want to account for the
23605 +                * preempt off latency tracer. Since the _notrace versions
23606 +                * of add/subtract skip the accounting for latency tracer
23607 +                * we must force it manually.
23608 +                */
23609 +               start_critical_timings();
23610                 __schedule(true);
23611 +               stop_critical_timings();
23612                 exception_exit(prev_ctx);
23614                 preempt_enable_no_resched_notrace();
23615 @@ -4675,6 +4967,7 @@ int __cond_resched_lock(spinlock_t *lock)
23617  EXPORT_SYMBOL(__cond_resched_lock);
23619 +#ifndef CONFIG_PREEMPT_RT_FULL
23620  int __sched __cond_resched_softirq(void)
23622         BUG_ON(!in_softirq());
23623 @@ -4688,6 +4981,7 @@ int __sched __cond_resched_softirq(void)
23624         return 0;
23626  EXPORT_SYMBOL(__cond_resched_softirq);
23627 +#endif
23629  /**
23630   * yield - yield the current processor to other threads.
23631 @@ -5054,7 +5348,9 @@ void init_idle(struct task_struct *idle, int cpu)
23633         /* Set the preempt count _outside_ the spinlocks! */
23634         init_idle_preempt_count(idle, cpu);
23636 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
23637 +       task_thread_info(idle)->preempt_lazy_count = 0;
23638 +#endif
23639         /*
23640          * The idle tasks have their own, simple scheduling class:
23641          */
23642 @@ -5195,6 +5491,8 @@ void sched_setnuma(struct task_struct *p, int nid)
23643  #endif /* CONFIG_NUMA_BALANCING */
23645  #ifdef CONFIG_HOTPLUG_CPU
23646 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
23648  /*
23649   * Ensures that the idle task is using init_mm right before its cpu goes
23650   * offline.
23651 @@ -5209,7 +5507,11 @@ void idle_task_exit(void)
23652                 switch_mm(mm, &init_mm, current);
23653                 finish_arch_post_lock_switch();
23654         }
23655 -       mmdrop(mm);
23656 +       /*
23657 +        * Defer the cleanup to an alive cpu. On RT we can neither
23658 +        * call mmdrop() nor mmdrop_delayed() from here.
23659 +        */
23660 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
23663  /*
23664 @@ -5581,6 +5883,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
23666         case CPU_DEAD:
23667                 calc_load_migrate(rq);
23668 +               if (per_cpu(idle_last_mm, cpu)) {
23669 +                       mmdrop(per_cpu(idle_last_mm, cpu));
23670 +                       per_cpu(idle_last_mm, cpu) = NULL;
23671 +               }
23672                 break;
23673  #endif
23674         }
23675 @@ -5911,6 +6217,7 @@ static int init_rootdomain(struct root_domain *rd)
23676         rd->rto_cpu = -1;
23677         raw_spin_lock_init(&rd->rto_lock);
23678         init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
23679 +       rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
23680  #endif
23682         init_dl_bw(&rd->dl_bw);
23683 @@ -7585,7 +7892,7 @@ void __init sched_init(void)
23684  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
23685  static inline int preempt_count_equals(int preempt_offset)
23687 -       int nested = preempt_count() + rcu_preempt_depth();
23688 +       int nested = preempt_count() + sched_rcu_preempt_depth();
23690         return (nested == preempt_offset);
23692 diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
23693 index 5a75b08cfd85..5be58820465c 100644
23694 --- a/kernel/sched/cpudeadline.c
23695 +++ b/kernel/sched/cpudeadline.c
23696 @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
23697         const struct sched_dl_entity *dl_se = &p->dl;
23699         if (later_mask &&
23700 -           cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
23701 +           cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
23702                 best_cpu = cpumask_any(later_mask);
23703                 goto out;
23704 -       } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
23705 +       } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
23706                         dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
23707                 best_cpu = cpudl_maximum(cp);
23708                 if (later_mask)
23709 diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
23710 index 981fcd7dc394..11e9705bf937 100644
23711 --- a/kernel/sched/cpupri.c
23712 +++ b/kernel/sched/cpupri.c
23713 @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
23714                 if (skip)
23715                         continue;
23717 -               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
23718 +               if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
23719                         continue;
23721                 if (lowest_mask) {
23722 -                       cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
23723 +                       cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
23725                         /*
23726                          * We have to ensure that we have at least one bit
23727 diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
23728 index a1aecbedf5b1..558b98af241d 100644
23729 --- a/kernel/sched/cputime.c
23730 +++ b/kernel/sched/cputime.c
23731 @@ -685,7 +685,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
23733         unsigned long long delta = vtime_delta(tsk);
23735 -       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
23736 +       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
23737         tsk->vtime_snap += delta;
23739         /* CHECKME: always safe to convert nsecs to cputime? */
23740 @@ -701,37 +701,37 @@ static void __vtime_account_system(struct task_struct *tsk)
23742  void vtime_account_system(struct task_struct *tsk)
23744 -       write_seqlock(&tsk->vtime_seqlock);
23745 +       write_seqcount_begin(&tsk->vtime_seqcount);
23746         __vtime_account_system(tsk);
23747 -       write_sequnlock(&tsk->vtime_seqlock);
23748 +       write_seqcount_end(&tsk->vtime_seqcount);
23751  void vtime_gen_account_irq_exit(struct task_struct *tsk)
23753 -       write_seqlock(&tsk->vtime_seqlock);
23754 +       write_seqcount_begin(&tsk->vtime_seqcount);
23755         __vtime_account_system(tsk);
23756         if (context_tracking_in_user())
23757                 tsk->vtime_snap_whence = VTIME_USER;
23758 -       write_sequnlock(&tsk->vtime_seqlock);
23759 +       write_seqcount_end(&tsk->vtime_seqcount);
23762  void vtime_account_user(struct task_struct *tsk)
23764         cputime_t delta_cpu;
23766 -       write_seqlock(&tsk->vtime_seqlock);
23767 +       write_seqcount_begin(&tsk->vtime_seqcount);
23768         delta_cpu = get_vtime_delta(tsk);
23769         tsk->vtime_snap_whence = VTIME_SYS;
23770         account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
23771 -       write_sequnlock(&tsk->vtime_seqlock);
23772 +       write_seqcount_end(&tsk->vtime_seqcount);
23775  void vtime_user_enter(struct task_struct *tsk)
23777 -       write_seqlock(&tsk->vtime_seqlock);
23778 +       write_seqcount_begin(&tsk->vtime_seqcount);
23779         __vtime_account_system(tsk);
23780         tsk->vtime_snap_whence = VTIME_USER;
23781 -       write_sequnlock(&tsk->vtime_seqlock);
23782 +       write_seqcount_end(&tsk->vtime_seqcount);
23785  void vtime_guest_enter(struct task_struct *tsk)
23786 @@ -743,19 +743,19 @@ void vtime_guest_enter(struct task_struct *tsk)
23787          * synchronization against the reader (task_gtime())
23788          * that can thus safely catch up with a tickless delta.
23789          */
23790 -       write_seqlock(&tsk->vtime_seqlock);
23791 +       write_seqcount_begin(&tsk->vtime_seqcount);
23792         __vtime_account_system(tsk);
23793         current->flags |= PF_VCPU;
23794 -       write_sequnlock(&tsk->vtime_seqlock);
23795 +       write_seqcount_end(&tsk->vtime_seqcount);
23797  EXPORT_SYMBOL_GPL(vtime_guest_enter);
23799  void vtime_guest_exit(struct task_struct *tsk)
23801 -       write_seqlock(&tsk->vtime_seqlock);
23802 +       write_seqcount_begin(&tsk->vtime_seqcount);
23803         __vtime_account_system(tsk);
23804         current->flags &= ~PF_VCPU;
23805 -       write_sequnlock(&tsk->vtime_seqlock);
23806 +       write_seqcount_end(&tsk->vtime_seqcount);
23808  EXPORT_SYMBOL_GPL(vtime_guest_exit);
23810 @@ -768,24 +768,26 @@ void vtime_account_idle(struct task_struct *tsk)
23812  void arch_vtime_task_switch(struct task_struct *prev)
23814 -       write_seqlock(&prev->vtime_seqlock);
23815 -       prev->vtime_snap_whence = VTIME_SLEEPING;
23816 -       write_sequnlock(&prev->vtime_seqlock);
23817 +       write_seqcount_begin(&prev->vtime_seqcount);
23818 +       prev->vtime_snap_whence = VTIME_INACTIVE;
23819 +       write_seqcount_end(&prev->vtime_seqcount);
23821 -       write_seqlock(&current->vtime_seqlock);
23822 +       write_seqcount_begin(&current->vtime_seqcount);
23823         current->vtime_snap_whence = VTIME_SYS;
23824         current->vtime_snap = sched_clock_cpu(smp_processor_id());
23825 -       write_sequnlock(&current->vtime_seqlock);
23826 +       write_seqcount_end(&current->vtime_seqcount);
23829  void vtime_init_idle(struct task_struct *t, int cpu)
23831         unsigned long flags;
23833 -       write_seqlock_irqsave(&t->vtime_seqlock, flags);
23834 +       local_irq_save(flags);
23835 +       write_seqcount_begin(&t->vtime_seqcount);
23836         t->vtime_snap_whence = VTIME_SYS;
23837         t->vtime_snap = sched_clock_cpu(cpu);
23838 -       write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
23839 +       write_seqcount_end(&t->vtime_seqcount);
23840 +       local_irq_restore(flags);
23843  cputime_t task_gtime(struct task_struct *t)
23844 @@ -797,13 +799,13 @@ cputime_t task_gtime(struct task_struct *t)
23845                 return t->gtime;
23847         do {
23848 -               seq = read_seqbegin(&t->vtime_seqlock);
23849 +               seq = read_seqcount_begin(&t->vtime_seqcount);
23851                 gtime = t->gtime;
23852                 if (t->flags & PF_VCPU)
23853                         gtime += vtime_delta(t);
23855 -       } while (read_seqretry(&t->vtime_seqlock, seq));
23856 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
23858         return gtime;
23860 @@ -826,7 +828,7 @@ fetch_task_cputime(struct task_struct *t,
23861                 *udelta = 0;
23862                 *sdelta = 0;
23864 -               seq = read_seqbegin(&t->vtime_seqlock);
23865 +               seq = read_seqcount_begin(&t->vtime_seqcount);
23867                 if (u_dst)
23868                         *u_dst = *u_src;
23869 @@ -834,7 +836,7 @@ fetch_task_cputime(struct task_struct *t,
23870                         *s_dst = *s_src;
23872                 /* Task is sleeping, nothing to add */
23873 -               if (t->vtime_snap_whence == VTIME_SLEEPING ||
23874 +               if (t->vtime_snap_whence == VTIME_INACTIVE ||
23875                     is_idle_task(t))
23876                         continue;
23878 @@ -850,7 +852,7 @@ fetch_task_cputime(struct task_struct *t,
23879                         if (t->vtime_snap_whence == VTIME_SYS)
23880                                 *sdelta = delta;
23881                 }
23882 -       } while (read_seqretry(&t->vtime_seqlock, seq));
23883 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
23887 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
23888 index a996f7356216..645c18fe2075 100644
23889 --- a/kernel/sched/deadline.c
23890 +++ b/kernel/sched/deadline.c
23891 @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
23893         struct task_struct *p = dl_task_of(dl_se);
23895 -       if (p->nr_cpus_allowed > 1)
23896 +       if (tsk_nr_cpus_allowed(p) > 1)
23897                 dl_rq->dl_nr_migratory++;
23899         update_dl_migration(dl_rq);
23900 @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
23902         struct task_struct *p = dl_task_of(dl_se);
23904 -       if (p->nr_cpus_allowed > 1)
23905 +       if (tsk_nr_cpus_allowed(p) > 1)
23906                 dl_rq->dl_nr_migratory--;
23908         update_dl_migration(dl_rq);
23909 @@ -702,6 +702,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
23911         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23912         timer->function = dl_task_timer;
23913 +       timer->irqsafe = 1;
23916  /*
23917 @@ -1039,7 +1040,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
23919         enqueue_dl_entity(&p->dl, pi_se, flags);
23921 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
23922 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
23923                 enqueue_pushable_dl_task(rq, p);
23926 @@ -1117,9 +1118,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
23927          * try to make it stay here, it might be important.
23928          */
23929         if (unlikely(dl_task(curr)) &&
23930 -           (curr->nr_cpus_allowed < 2 ||
23931 +           (tsk_nr_cpus_allowed(curr) < 2 ||
23932              !dl_entity_preempt(&p->dl, &curr->dl)) &&
23933 -           (p->nr_cpus_allowed > 1)) {
23934 +           (tsk_nr_cpus_allowed(p) > 1)) {
23935                 int target = find_later_rq(p);
23937                 if (target != -1 &&
23938 @@ -1140,7 +1141,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
23939          * Current can't be migrated, useless to reschedule,
23940          * let's hope p can move out.
23941          */
23942 -       if (rq->curr->nr_cpus_allowed == 1 ||
23943 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
23944             cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
23945                 return;
23947 @@ -1148,7 +1149,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
23948          * p is migratable, so let's not schedule it and
23949          * see if it is pushed or pulled somewhere else.
23950          */
23951 -       if (p->nr_cpus_allowed != 1 &&
23952 +       if (tsk_nr_cpus_allowed(p) != 1 &&
23953             cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
23954                 return;
23956 @@ -1262,7 +1263,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
23958         update_curr_dl(rq);
23960 -       if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
23961 +       if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
23962                 enqueue_pushable_dl_task(rq, p);
23965 @@ -1385,7 +1386,7 @@ static int find_later_rq(struct task_struct *task)
23966         if (unlikely(!later_mask))
23967                 return -1;
23969 -       if (task->nr_cpus_allowed == 1)
23970 +       if (tsk_nr_cpus_allowed(task) == 1)
23971                 return -1;
23973         /*
23974 @@ -1491,7 +1492,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
23975                 if (double_lock_balance(rq, later_rq)) {
23976                         if (unlikely(task_rq(task) != rq ||
23977                                      !cpumask_test_cpu(later_rq->cpu,
23978 -                                                      &task->cpus_allowed) ||
23979 +                                                      tsk_cpus_allowed(task)) ||
23980                                      task_running(rq, task) ||
23981                                      !task_on_rq_queued(task))) {
23982                                 double_unlock_balance(rq, later_rq);
23983 @@ -1530,7 +1531,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
23985         BUG_ON(rq->cpu != task_cpu(p));
23986         BUG_ON(task_current(rq, p));
23987 -       BUG_ON(p->nr_cpus_allowed <= 1);
23988 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
23990         BUG_ON(!task_on_rq_queued(p));
23991         BUG_ON(!dl_task(p));
23992 @@ -1569,7 +1570,7 @@ static int push_dl_task(struct rq *rq)
23993          */
23994         if (dl_task(rq->curr) &&
23995             dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
23996 -           rq->curr->nr_cpus_allowed > 1) {
23997 +           tsk_nr_cpus_allowed(rq->curr) > 1) {
23998                 resched_curr(rq);
23999                 return 0;
24000         }
24001 @@ -1716,9 +1717,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
24003         if (!task_running(rq, p) &&
24004             !test_tsk_need_resched(rq->curr) &&
24005 -           p->nr_cpus_allowed > 1 &&
24006 +           tsk_nr_cpus_allowed(p) > 1 &&
24007             dl_task(rq->curr) &&
24008 -           (rq->curr->nr_cpus_allowed < 2 ||
24009 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
24010              !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
24011                 push_dl_tasks(rq);
24012         }
24013 @@ -1819,7 +1820,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
24015         if (task_on_rq_queued(p) && rq->curr != p) {
24016  #ifdef CONFIG_SMP
24017 -               if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
24018 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
24019                         queue_push_tasks(rq);
24020  #endif
24021                 if (dl_task(rq->curr))
24022 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
24023 index 641511771ae6..a2d69b883623 100644
24024 --- a/kernel/sched/debug.c
24025 +++ b/kernel/sched/debug.c
24026 @@ -251,6 +251,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
24027         P(rt_throttled);
24028         PN(rt_time);
24029         PN(rt_runtime);
24030 +#ifdef CONFIG_SMP
24031 +       P(rt_nr_migratory);
24032 +#endif
24034  #undef PN
24035  #undef P
24036 @@ -635,6 +638,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
24037  #endif
24038         P(policy);
24039         P(prio);
24040 +#ifdef CONFIG_PREEMPT_RT_FULL
24041 +       P(migrate_disable);
24042 +#endif
24043 +       P(nr_cpus_allowed);
24044  #undef PN
24045  #undef __PN
24046  #undef P
24047 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
24048 index 812069b66f47..ddf1424bcc78 100644
24049 --- a/kernel/sched/fair.c
24050 +++ b/kernel/sched/fair.c
24051 @@ -3166,7 +3166,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
24052         ideal_runtime = sched_slice(cfs_rq, curr);
24053         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
24054         if (delta_exec > ideal_runtime) {
24055 -               resched_curr(rq_of(cfs_rq));
24056 +               resched_curr_lazy(rq_of(cfs_rq));
24057                 /*
24058                  * The current task ran long enough, ensure it doesn't get
24059                  * re-elected due to buddy favours.
24060 @@ -3190,7 +3190,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
24061                 return;
24063         if (delta > ideal_runtime)
24064 -               resched_curr(rq_of(cfs_rq));
24065 +               resched_curr_lazy(rq_of(cfs_rq));
24068  static void
24069 @@ -3330,7 +3330,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
24070          * validating it and just reschedule.
24071          */
24072         if (queued) {
24073 -               resched_curr(rq_of(cfs_rq));
24074 +               resched_curr_lazy(rq_of(cfs_rq));
24075                 return;
24076         }
24077         /*
24078 @@ -3512,7 +3512,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
24079          * hierarchy can be throttled
24080          */
24081         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
24082 -               resched_curr(rq_of(cfs_rq));
24083 +               resched_curr_lazy(rq_of(cfs_rq));
24086  static __always_inline
24087 @@ -4144,7 +4144,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
24089                 if (delta < 0) {
24090                         if (rq->curr == p)
24091 -                               resched_curr(rq);
24092 +                               resched_curr_lazy(rq);
24093                         return;
24094                 }
24095                 hrtick_start(rq, delta);
24096 @@ -5232,7 +5232,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
24097         return;
24099  preempt:
24100 -       resched_curr(rq);
24101 +       resched_curr_lazy(rq);
24102         /*
24103          * Only set the backward buddy when the current task is still
24104          * on the rq. This can happen when a wakeup gets interleaved
24105 @@ -7983,7 +7983,7 @@ static void task_fork_fair(struct task_struct *p)
24106                  * 'current' within the tree based on its new key value.
24107                  */
24108                 swap(curr->vruntime, se->vruntime);
24109 -               resched_curr(rq);
24110 +               resched_curr_lazy(rq);
24111         }
24113         se->vruntime -= cfs_rq->min_vruntime;
24114 @@ -8008,7 +8008,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
24115          */
24116         if (rq->curr == p) {
24117                 if (p->prio > oldprio)
24118 -                       resched_curr(rq);
24119 +                       resched_curr_lazy(rq);
24120         } else
24121                 check_preempt_curr(rq, p, 0);
24123 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
24124 index 69631fa46c2f..6d28fcd08872 100644
24125 --- a/kernel/sched/features.h
24126 +++ b/kernel/sched/features.h
24127 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
24128   */
24129  SCHED_FEAT(NONTASK_CAPACITY, true)
24131 +#ifdef CONFIG_PREEMPT_RT_FULL
24132 +SCHED_FEAT(TTWU_QUEUE, false)
24133 +# ifdef CONFIG_PREEMPT_LAZY
24134 +SCHED_FEAT(PREEMPT_LAZY, true)
24135 +# endif
24136 +#else
24138  /*
24139   * Queue remote wakeups on the target CPU and process them
24140   * using the scheduler IPI. Reduces rq->lock contention/bounces.
24141   */
24142  SCHED_FEAT(TTWU_QUEUE, true)
24143 +#endif
24145  #ifdef HAVE_RT_PUSH_IPI
24146  /*
24147 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
24148 index 95fefb364dab..14f2d740edab 100644
24149 --- a/kernel/sched/rt.c
24150 +++ b/kernel/sched/rt.c
24151 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
24153         hrtimer_init(&rt_b->rt_period_timer,
24154                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
24155 +       rt_b->rt_period_timer.irqsafe = 1;
24156         rt_b->rt_period_timer.function = sched_rt_period_timer;
24159 @@ -315,7 +316,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
24160         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
24162         rt_rq->rt_nr_total++;
24163 -       if (p->nr_cpus_allowed > 1)
24164 +       if (tsk_nr_cpus_allowed(p) > 1)
24165                 rt_rq->rt_nr_migratory++;
24167         update_rt_migration(rt_rq);
24168 @@ -332,7 +333,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
24169         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
24171         rt_rq->rt_nr_total--;
24172 -       if (p->nr_cpus_allowed > 1)
24173 +       if (tsk_nr_cpus_allowed(p) > 1)
24174                 rt_rq->rt_nr_migratory--;
24176         update_rt_migration(rt_rq);
24177 @@ -1251,7 +1252,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
24179         enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
24181 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
24182 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
24183                 enqueue_pushable_task(rq, p);
24186 @@ -1340,7 +1341,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
24187          * will have to sort it out.
24188          */
24189         if (curr && unlikely(rt_task(curr)) &&
24190 -           (curr->nr_cpus_allowed < 2 ||
24191 +           (tsk_nr_cpus_allowed(curr) < 2 ||
24192              curr->prio <= p->prio)) {
24193                 int target = find_lowest_rq(p);
24195 @@ -1364,7 +1365,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
24196          * Current can't be migrated, useless to reschedule,
24197          * let's hope p can move out.
24198          */
24199 -       if (rq->curr->nr_cpus_allowed == 1 ||
24200 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
24201             !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
24202                 return;
24204 @@ -1372,7 +1373,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
24205          * p is migratable, so let's not schedule it and
24206          * see if it is pushed or pulled somewhere else.
24207          */
24208 -       if (p->nr_cpus_allowed != 1
24209 +       if (tsk_nr_cpus_allowed(p) != 1
24210             && cpupri_find(&rq->rd->cpupri, p, NULL))
24211                 return;
24213 @@ -1506,7 +1507,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
24214          * The previous task needs to be made eligible for pushing
24215          * if it is still active
24216          */
24217 -       if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
24218 +       if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
24219                 enqueue_pushable_task(rq, p);
24222 @@ -1556,7 +1557,7 @@ static int find_lowest_rq(struct task_struct *task)
24223         if (unlikely(!lowest_mask))
24224                 return -1;
24226 -       if (task->nr_cpus_allowed == 1)
24227 +       if (tsk_nr_cpus_allowed(task) == 1)
24228                 return -1; /* No other targets possible */
24230         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
24231 @@ -1688,7 +1689,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
24233         BUG_ON(rq->cpu != task_cpu(p));
24234         BUG_ON(task_current(rq, p));
24235 -       BUG_ON(p->nr_cpus_allowed <= 1);
24236 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
24238         BUG_ON(!task_on_rq_queued(p));
24239         BUG_ON(!rt_task(p));
24240 @@ -2060,9 +2061,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
24242         if (!task_running(rq, p) &&
24243             !test_tsk_need_resched(rq->curr) &&
24244 -           p->nr_cpus_allowed > 1 &&
24245 +           tsk_nr_cpus_allowed(p) > 1 &&
24246             (dl_task(rq->curr) || rt_task(rq->curr)) &&
24247 -           (rq->curr->nr_cpus_allowed < 2 ||
24248 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
24249              rq->curr->prio <= p->prio))
24250                 push_rt_tasks(rq);
24252 @@ -2135,7 +2136,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
24253          */
24254         if (task_on_rq_queued(p) && rq->curr != p) {
24255  #ifdef CONFIG_SMP
24256 -               if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
24257 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
24258                         queue_push_tasks(rq);
24259  #endif /* CONFIG_SMP */
24260                 if (p->prio < rq->curr->prio)
24261 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
24262 index 448a8266ceea..e45a90ce57f7 100644
24263 --- a/kernel/sched/sched.h
24264 +++ b/kernel/sched/sched.h
24265 @@ -1110,6 +1110,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
24266  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
24267  #define WF_FORK                0x02            /* child wakeup after fork */
24268  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
24269 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
24271  /*
24272   * To aid in avoiding the subversion of "niceness" due to uneven distribution
24273 @@ -1309,6 +1310,15 @@ extern void init_sched_fair_class(void);
24274  extern void resched_curr(struct rq *rq);
24275  extern void resched_cpu(int cpu);
24277 +#ifdef CONFIG_PREEMPT_LAZY
24278 +extern void resched_curr_lazy(struct rq *rq);
24279 +#else
24280 +static inline void resched_curr_lazy(struct rq *rq)
24282 +       resched_curr(rq);
24284 +#endif
24286  extern struct rt_bandwidth def_rt_bandwidth;
24287  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
24289 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
24290 new file mode 100644
24291 index 000000000000..205fe36868f9
24292 --- /dev/null
24293 +++ b/kernel/sched/swait.c
24294 @@ -0,0 +1,143 @@
24295 +#include <linux/sched.h>
24296 +#include <linux/swait.h>
24297 +#include <linux/suspend.h>
24299 +void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
24300 +                            struct lock_class_key *key)
24302 +       raw_spin_lock_init(&q->lock);
24303 +       lockdep_set_class_and_name(&q->lock, key, name);
24304 +       INIT_LIST_HEAD(&q->task_list);
24306 +EXPORT_SYMBOL(__init_swait_queue_head);
24309 + * The thing about the wake_up_state() return value; I think we can ignore it.
24310 + *
24311 + * If for some reason it would return 0, that means the previously waiting
24312 + * task is already running, so it will observe condition true (or has already).
24313 + */
24314 +void swake_up_locked(struct swait_queue_head *q)
24316 +       struct swait_queue *curr;
24318 +       if (list_empty(&q->task_list))
24319 +               return;
24321 +       curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
24322 +       wake_up_process(curr->task);
24323 +       list_del_init(&curr->task_list);
24325 +EXPORT_SYMBOL(swake_up_locked);
24327 +void swake_up_all_locked(struct swait_queue_head *q)
24329 +       struct swait_queue *curr;
24330 +       int wakes = 0;
24332 +       while (!list_empty(&q->task_list)) {
24334 +               curr = list_first_entry(&q->task_list, typeof(*curr),
24335 +                                       task_list);
24336 +               wake_up_process(curr->task);
24337 +               list_del_init(&curr->task_list);
24338 +               wakes++;
24339 +       }
24340 +       if (pm_in_action)
24341 +               return;
24342 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
24344 +EXPORT_SYMBOL(swake_up_all_locked);
24346 +void swake_up(struct swait_queue_head *q)
24348 +       unsigned long flags;
24350 +       if (!swait_active(q))
24351 +               return;
24353 +       raw_spin_lock_irqsave(&q->lock, flags);
24354 +       swake_up_locked(q);
24355 +       raw_spin_unlock_irqrestore(&q->lock, flags);
24357 +EXPORT_SYMBOL(swake_up);
24360 + * Must not be used with IRQs disabled, since we need to be able to
24361 + * release IRQs to guarantee a bounded hold time.
24362 + */
24363 +void swake_up_all(struct swait_queue_head *q)
24365 +       struct swait_queue *curr;
24366 +       LIST_HEAD(tmp);
24368 +       if (!swait_active(q))
24369 +               return;
24371 +       raw_spin_lock_irq(&q->lock);
24372 +       list_splice_init(&q->task_list, &tmp);
24373 +       while (!list_empty(&tmp)) {
24374 +               curr = list_first_entry(&tmp, typeof(*curr), task_list);
24376 +               wake_up_state(curr->task, TASK_NORMAL);
24377 +               list_del_init(&curr->task_list);
24379 +               if (list_empty(&tmp))
24380 +                       break;
24382 +               raw_spin_unlock_irq(&q->lock);
24383 +               raw_spin_lock_irq(&q->lock);
24384 +       }
24385 +       raw_spin_unlock_irq(&q->lock);
24387 +EXPORT_SYMBOL(swake_up_all);
24389 +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
24391 +       wait->task = current;
24392 +       if (list_empty(&wait->task_list))
24393 +               list_add(&wait->task_list, &q->task_list);
24396 +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
24398 +       unsigned long flags;
24400 +       raw_spin_lock_irqsave(&q->lock, flags);
24401 +       __prepare_to_swait(q, wait);
24402 +       set_current_state(state);
24403 +       raw_spin_unlock_irqrestore(&q->lock, flags);
24405 +EXPORT_SYMBOL(prepare_to_swait);
24407 +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
24409 +       if (signal_pending_state(state, current))
24410 +               return -ERESTARTSYS;
24412 +       prepare_to_swait(q, wait, state);
24414 +       return 0;
24416 +EXPORT_SYMBOL(prepare_to_swait_event);
24418 +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
24420 +       __set_current_state(TASK_RUNNING);
24421 +       if (!list_empty(&wait->task_list))
24422 +               list_del_init(&wait->task_list);
24425 +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
24427 +       unsigned long flags;
24429 +       __set_current_state(TASK_RUNNING);
24431 +       if (!list_empty_careful(&wait->task_list)) {
24432 +               raw_spin_lock_irqsave(&q->lock, flags);
24433 +               list_del_init(&wait->task_list);
24434 +               raw_spin_unlock_irqrestore(&q->lock, flags);
24435 +       }
24437 +EXPORT_SYMBOL(finish_swait);
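
[Editorial note, not part of the patch: a brief usage sketch of the simple waitqueue API introduced above, using only calls that appear in this file (init_swait_queue_head(), swait_event_interruptible(), swake_up()). The queue head, condition flag and function names are invented for illustration.]

    #include <linux/swait.h>

    static struct swait_queue_head my_wq;
    static bool my_cond;

    static void my_setup(void)
    {
            /* Initialize the queue head once, before waiters or wakers run. */
            init_swait_queue_head(&my_wq);
    }

    /* Waiter side: sleeps until my_cond becomes true (or a signal arrives). */
    static void my_wait_for_event(void)
    {
            swait_event_interruptible(my_wq, my_cond);
    }

    /* Waker side: sets the condition, then wakes at most one waiter. */
    static void my_signal_event(void)
    {
            my_cond = true;
            swake_up(&my_wq);
    }
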
24438 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
24439 new file mode 100644
24440 index 000000000000..1950f40ca725
24441 --- /dev/null
24442 +++ b/kernel/sched/swork.c
24443 @@ -0,0 +1,173 @@
24445 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
24446 + *
24447 + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks from
24448 + * IRQ context. The callbacks are executed in kthread context.
24449 + */
24451 +#include <linux/swait.h>
24452 +#include <linux/swork.h>
24453 +#include <linux/kthread.h>
24454 +#include <linux/slab.h>
24455 +#include <linux/spinlock.h>
24456 +#include <linux/export.h>
24458 +#define SWORK_EVENT_PENDING     (1 << 0)
24460 +static DEFINE_MUTEX(worker_mutex);
24461 +static struct sworker *glob_worker;
24463 +struct sworker {
24464 +       struct list_head events;
24465 +       struct swait_queue_head wq;
24467 +       raw_spinlock_t lock;
24469 +       struct task_struct *task;
24470 +       int refs;
24473 +static bool swork_readable(struct sworker *worker)
24475 +       bool r;
24477 +       if (kthread_should_stop())
24478 +               return true;
24480 +       raw_spin_lock_irq(&worker->lock);
24481 +       r = !list_empty(&worker->events);
24482 +       raw_spin_unlock_irq(&worker->lock);
24484 +       return r;
24487 +static int swork_kthread(void *arg)
24489 +       struct sworker *worker = arg;
24491 +       for (;;) {
24492 +               swait_event_interruptible(worker->wq,
24493 +                                       swork_readable(worker));
24494 +               if (kthread_should_stop())
24495 +                       break;
24497 +               raw_spin_lock_irq(&worker->lock);
24498 +               while (!list_empty(&worker->events)) {
24499 +                       struct swork_event *sev;
24501 +                       sev = list_first_entry(&worker->events,
24502 +                                       struct swork_event, item);
24503 +                       list_del(&sev->item);
24504 +                       raw_spin_unlock_irq(&worker->lock);
24506 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
24507 +                                                        &sev->flags));
24508 +                       sev->func(sev);
24509 +                       raw_spin_lock_irq(&worker->lock);
24510 +               }
24511 +               raw_spin_unlock_irq(&worker->lock);
24512 +       }
24513 +       return 0;
24516 +static struct sworker *swork_create(void)
24518 +       struct sworker *worker;
24520 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
24521 +       if (!worker)
24522 +               return ERR_PTR(-ENOMEM);
24524 +       INIT_LIST_HEAD(&worker->events);
24525 +       raw_spin_lock_init(&worker->lock);
24526 +       init_swait_queue_head(&worker->wq);
24528 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
24529 +       if (IS_ERR(worker->task)) {
24530 +               kfree(worker);
24531 +               return ERR_PTR(-ENOMEM);
24532 +       }
24534 +       return worker;
24537 +static void swork_destroy(struct sworker *worker)
24539 +       kthread_stop(worker->task);
24541 +       WARN_ON(!list_empty(&worker->events));
24542 +       kfree(worker);
24545 +/**
24546 + * swork_queue - queue swork
24547 + *
24548 + * Returns %false if @work was already on a queue, %true otherwise.
24549 + *
24550 + * The work is queued and processed on a random CPU
24551 + */
24552 +bool swork_queue(struct swork_event *sev)
24554 +       unsigned long flags;
24556 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
24557 +               return false;
24559 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
24560 +       list_add_tail(&sev->item, &glob_worker->events);
24561 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
24563 +       swake_up(&glob_worker->wq);
24564 +       return true;
24566 +EXPORT_SYMBOL_GPL(swork_queue);
24568 +/**
24569 + * swork_get - get an instance of the sworker
24570 + *
24571 + * Returns a negative error code if the initialization of the worker
24572 + * failed, %0 otherwise.
24573 + *
24574 + */
24575 +int swork_get(void)
24577 +       struct sworker *worker;
24579 +       mutex_lock(&worker_mutex);
24580 +       if (!glob_worker) {
24581 +               worker = swork_create();
24582 +               if (IS_ERR(worker)) {
24583 +                       mutex_unlock(&worker_mutex);
24584 +                       return -ENOMEM;
24585 +               }
24587 +               glob_worker = worker;
24588 +       }
24590 +       glob_worker->refs++;
24591 +       mutex_unlock(&worker_mutex);
24593 +       return 0;
24595 +EXPORT_SYMBOL_GPL(swork_get);
24597 +/**
24598 + * swork_put - puts an instance of the sworker
24599 + *
24600 + * Will destroy the sworker thread. This function must not be called until all
24601 + * queued events have been completed.
24602 + */
24603 +void swork_put(void)
24605 +       mutex_lock(&worker_mutex);
24607 +       glob_worker->refs--;
24608 +       if (glob_worker->refs > 0)
24609 +               goto out;
24611 +       swork_destroy(glob_worker);
24612 +       glob_worker = NULL;
24613 +out:
24614 +       mutex_unlock(&worker_mutex);
24616 +EXPORT_SYMBOL_GPL(swork_put);
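For orientation, here is a minimal usage sketch of the simple-work API defined above. It is not part of the patch: the swork_event field names (func, flags) follow the usage visible in swork_kthread() and swork_queue(), the header path assumes the <linux/swork.h> added elsewhere in this patch series, and the my_* names are hypothetical.

	#include <linux/swork.h>	/* header introduced elsewhere in this patch */
	#include <linux/printk.h>

	/* hypothetical callback: runs in the "kswork" kernel thread and may sleep */
	static void my_deferred_fn(struct swork_event *sev)
	{
		pr_info("deferred work running in kswork context\n");
	}

	static struct swork_event my_event;

	static int my_setup(void)
	{
		int ret;

		ret = swork_get();		/* create or take a reference on the worker */
		if (ret)
			return ret;

		my_event.flags = 0;
		my_event.func = my_deferred_fn;
		swork_queue(&my_event);		/* returns false if the event was already pending */
		return 0;
	}

	static void my_teardown(void)
	{
		/* only after all queued events have completed, as swork_put() requires */
		swork_put();
	}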
24617 diff --git a/kernel/signal.c b/kernel/signal.c
24618 index 4a548c6a4118..e870a133857b 100644
24619 --- a/kernel/signal.c
24620 +++ b/kernel/signal.c
24621 @@ -14,6 +14,7 @@
24622  #include <linux/export.h>
24623  #include <linux/init.h>
24624  #include <linux/sched.h>
24625 +#include <linux/sched/rt.h>
24626  #include <linux/fs.h>
24627  #include <linux/tty.h>
24628  #include <linux/binfmts.h>
24629 @@ -354,13 +355,30 @@ static bool task_participate_group_stop(struct task_struct *task)
24630         return false;
24633 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
24635 +       struct sigqueue *q = t->sigqueue_cache;
24637 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
24638 +               return NULL;
24639 +       return q;
24642 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
24644 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
24645 +               return 0;
24646 +       return 1;
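The two helpers above implement a one-slot, lock-free cache: get_task_cache() claims the cached sigqueue only if it can atomically swap it out, and put_task_cache() stores one only into an empty slot. A user-space analogue of the same pattern, using C11 atomics in place of the kernel's cmpxchg() (a sketch for illustration, not part of the patch; struct item and the function names are made up):

	#include <stdatomic.h>
	#include <stddef.h>

	struct item { int payload; };

	static _Atomic(struct item *) cache_slot;

	static struct item *cache_get(void)
	{
		struct item *q = atomic_load(&cache_slot);

		/* mirrors: if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) return NULL; */
		if (q && atomic_compare_exchange_strong(&cache_slot, &q, NULL))
			return q;
		return NULL;
	}

	static int cache_put(struct item *q)
	{
		struct item *expected = NULL;

		/* mirrors: cmpxchg(&t->sigqueue_cache, NULL, q) == NULL */
		return atomic_compare_exchange_strong(&cache_slot, &expected, q) ? 0 : 1;
	}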
24649  /*
24650   * allocate a new signal queue record
24651   * - this may be called without locks if and only if t == current, otherwise an
24652   *   appropriate lock must be held to stop the target task from exiting
24653   */
24654  static struct sigqueue *
24655 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
24656 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
24657 +                   int override_rlimit, int fromslab)
24659         struct sigqueue *q = NULL;
24660         struct user_struct *user;
24661 @@ -377,7 +395,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
24662         if (override_rlimit ||
24663             atomic_read(&user->sigpending) <=
24664                         task_rlimit(t, RLIMIT_SIGPENDING)) {
24665 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
24666 +               if (!fromslab)
24667 +                       q = get_task_cache(t);
24668 +               if (!q)
24669 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
24670         } else {
24671                 print_dropped_signal(sig);
24672         }
24673 @@ -394,6 +415,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
24674         return q;
24677 +static struct sigqueue *
24678 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
24679 +                int override_rlimit)
24681 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
24684  static void __sigqueue_free(struct sigqueue *q)
24686         if (q->flags & SIGQUEUE_PREALLOC)
24687 @@ -403,6 +431,21 @@ static void __sigqueue_free(struct sigqueue *q)
24688         kmem_cache_free(sigqueue_cachep, q);
24691 +static void sigqueue_free_current(struct sigqueue *q)
24693 +       struct user_struct *up;
24695 +       if (q->flags & SIGQUEUE_PREALLOC)
24696 +               return;
24698 +       up = q->user;
24699 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
24700 +               atomic_dec(&up->sigpending);
24701 +               free_uid(up);
24702 +       } else
24703 +                 __sigqueue_free(q);
24706  void flush_sigqueue(struct sigpending *queue)
24708         struct sigqueue *q;
24709 @@ -415,6 +458,21 @@ void flush_sigqueue(struct sigpending *queue)
24710         }
24714 + * Called from __exit_signal. Flush tsk->pending and
24715 + * tsk->sigqueue_cache
24716 + */
24717 +void flush_task_sigqueue(struct task_struct *tsk)
24719 +       struct sigqueue *q;
24721 +       flush_sigqueue(&tsk->pending);
24723 +       q = get_task_cache(tsk);
24724 +       if (q)
24725 +               kmem_cache_free(sigqueue_cachep, q);
24728  /*
24729   * Flush all pending signals for this kthread.
24730   */
24731 @@ -534,7 +592,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
24732                         (info->si_code == SI_TIMER) &&
24733                         (info->si_sys_private);
24735 -               __sigqueue_free(first);
24736 +               sigqueue_free_current(first);
24737         } else {
24738                 /*
24739                  * Ok, it wasn't in the queue.  This must be
24740 @@ -570,6 +628,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
24741         bool resched_timer = false;
24742         int signr;
24744 +       WARN_ON_ONCE(tsk != current);
24746         /* We only dequeue private signals from ourselves, we don't let
24747          * signalfd steal them
24748          */
24749 @@ -1166,8 +1226,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
24750   * We don't want to have recursive SIGSEGV's etc, for example,
24751   * that is why we also clear SIGNAL_UNKILLABLE.
24752   */
24753 -int
24754 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
24755 +static int
24756 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
24758         unsigned long int flags;
24759         int ret, blocked, ignored;
24760 @@ -1192,6 +1252,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
24761         return ret;
24764 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
24767 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
24768 + * since it cannot enable preemption, and the signal code's spinlocks
24769 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
24770 + * send the signal on exit of the trap.
24771 + */
24772 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
24773 +       if (in_atomic()) {
24774 +               if (WARN_ON_ONCE(t != current))
24775 +                       return 0;
24776 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
24777 +                       return 0;
24779 +               if (is_si_special(info)) {
24780 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
24781 +                       t->forced_info.si_signo = sig;
24782 +                       t->forced_info.si_errno = 0;
24783 +                       t->forced_info.si_code = SI_KERNEL;
24784 +                       t->forced_info.si_pid = 0;
24785 +                       t->forced_info.si_uid = 0;
24786 +               } else {
24787 +                       t->forced_info = *info;
24788 +               }
24790 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
24791 +               return 0;
24792 +       }
24793 +#endif
24794 +       return do_force_sig_info(sig, info, t);
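The block above only stashes the siginfo and sets TIF_NOTIFY_RESUME. For completeness, the consuming side on an architecture that defines ARCH_RT_DELAYS_SIGNAL_SEND looks roughly like the sketch below, run from the exit-to-user/notify-resume path. This mirrors the arch-side hook added elsewhere in this patch series; treat it as an illustration rather than the exact hunk, and note the function name is invented.

	/* sketch: called from the arch's do_notify_resume()/do_work_pending() path */
	static void deliver_delayed_forced_signal(void)
	{
	#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
		if (unlikely(current->forced_info.si_signo)) {
			struct task_struct *t = current;

			force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
			t->forced_info.si_signo = 0;	/* mark as consumed */
		}
	#endif
	}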
24797  /*
24798   * Nuke all other threads in the group.
24799   */
24800 @@ -1226,12 +1319,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
24801                  * Disable interrupts early to avoid deadlocks.
24802                  * See rcu_read_unlock() comment header for details.
24803                  */
24804 -               local_irq_save(*flags);
24805 +               local_irq_save_nort(*flags);
24806                 rcu_read_lock();
24807                 sighand = rcu_dereference(tsk->sighand);
24808                 if (unlikely(sighand == NULL)) {
24809                         rcu_read_unlock();
24810 -                       local_irq_restore(*flags);
24811 +                       local_irq_restore_nort(*flags);
24812                         break;
24813                 }
24814                 /*
24815 @@ -1252,7 +1345,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
24816                 }
24817                 spin_unlock(&sighand->siglock);
24818                 rcu_read_unlock();
24819 -               local_irq_restore(*flags);
24820 +               local_irq_restore_nort(*flags);
24821         }
24823         return sighand;
24824 @@ -1495,7 +1588,8 @@ EXPORT_SYMBOL(kill_pid);
24825   */
24826  struct sigqueue *sigqueue_alloc(void)
24828 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
24829 +       /* Preallocated sigqueue objects always come from the slab cache. */
24830 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
24832         if (q)
24833                 q->flags |= SIGQUEUE_PREALLOC;
24834 @@ -1856,15 +1950,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
24835                 if (gstop_done && ptrace_reparented(current))
24836                         do_notify_parent_cldstop(current, false, why);
24838 -               /*
24839 -                * Don't want to allow preemption here, because
24840 -                * sys_ptrace() needs this task to be inactive.
24841 -                *
24842 -                * XXX: implement read_unlock_no_resched().
24843 -                */
24844 -               preempt_disable();
24845                 read_unlock(&tasklist_lock);
24846 -               preempt_enable_no_resched();
24847                 freezable_schedule();
24848         } else {
24849                 /*
24850 diff --git a/kernel/softirq.c b/kernel/softirq.c
24851 index 479e4436f787..cb9c1d5dee10 100644
24852 --- a/kernel/softirq.c
24853 +++ b/kernel/softirq.c
24854 @@ -21,10 +21,12 @@
24855  #include <linux/freezer.h>
24856  #include <linux/kthread.h>
24857  #include <linux/rcupdate.h>
24858 +#include <linux/delay.h>
24859  #include <linux/ftrace.h>
24860  #include <linux/smp.h>
24861  #include <linux/smpboot.h>
24862  #include <linux/tick.h>
24863 +#include <linux/locallock.h>
24864  #include <linux/irq.h>
24866  #define CREATE_TRACE_POINTS
24867 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
24868  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
24870  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
24871 +#ifdef CONFIG_PREEMPT_RT_FULL
24872 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
24873 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
24874 +#endif
24876  const char * const softirq_to_name[NR_SOFTIRQS] = {
24877         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
24878         "TASKLET", "SCHED", "HRTIMER", "RCU"
24879  };
24881 +#ifdef CONFIG_NO_HZ_COMMON
24882 +# ifdef CONFIG_PREEMPT_RT_FULL
24884 +struct softirq_runner {
24885 +       struct task_struct *runner[NR_SOFTIRQS];
24888 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
24890 +static inline void softirq_set_runner(unsigned int sirq)
24892 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24894 +       sr->runner[sirq] = current;
24897 +static inline void softirq_clr_runner(unsigned int sirq)
24899 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24901 +       sr->runner[sirq] = NULL;
24905 + * On preempt-rt a softirq running context might be blocked on a
24906 + * lock. There might be no other runnable task on this CPU because the
24907 + * lock owner runs on some other CPU. So we have to go into idle with
24908 + * the pending bit set. Therefore we need to check this, otherwise we
24909 + * warn about false positives, which confuses users and defeats the
24910 + * whole purpose of this test.
24911 + *
24912 + * This code is called with interrupts disabled.
24913 + */
24914 +void softirq_check_pending_idle(void)
24916 +       static int rate_limit;
24917 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24918 +       u32 warnpending;
24919 +       int i;
24921 +       if (rate_limit >= 10)
24922 +               return;
24924 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
24925 +       for (i = 0; i < NR_SOFTIRQS; i++) {
24926 +               struct task_struct *tsk = sr->runner[i];
24928 +               /*
24929 +                * The wakeup code in rtmutex.c wakes up the task
24930 +                * _before_ it sets pi_blocked_on to NULL under
24931 +                * tsk->pi_lock. So we need to check for both: state
24932 +                * and pi_blocked_on.
24933 +                */
24934 +               if (tsk) {
24935 +                       raw_spin_lock(&tsk->pi_lock);
24936 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
24937 +                               /* Clear all bits pending in that task */
24938 +                               warnpending &= ~(tsk->softirqs_raised);
24939 +                               warnpending &= ~(1 << i);
24940 +                       }
24941 +                       raw_spin_unlock(&tsk->pi_lock);
24942 +               }
24943 +       }
24945 +       if (warnpending) {
24946 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24947 +                      warnpending);
24948 +               rate_limit++;
24949 +       }
24951 +# else
24953 + * On !PREEMPT_RT we just printk rate limited:
24954 + */
24955 +void softirq_check_pending_idle(void)
24957 +       static int rate_limit;
24959 +       if (rate_limit < 10 &&
24960 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
24961 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24962 +                      local_softirq_pending());
24963 +               rate_limit++;
24964 +       }
24966 +# endif
24968 +#else /* !CONFIG_NO_HZ_COMMON */
24969 +static inline void softirq_set_runner(unsigned int sirq) { }
24970 +static inline void softirq_clr_runner(unsigned int sirq) { }
24971 +#endif
24973  /*
24974   * we cannot loop indefinitely here to avoid userspace starvation,
24975   * but we also don't want to introduce a worst case 1/HZ latency
24976 @@ -77,6 +175,79 @@ static void wakeup_softirqd(void)
24977                 wake_up_process(tsk);
24980 +#ifdef CONFIG_PREEMPT_RT_FULL
24981 +static void wakeup_timer_softirqd(void)
24983 +       /* Interrupts are disabled: no need to stop preemption */
24984 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
24986 +       if (tsk && tsk->state != TASK_RUNNING)
24987 +               wake_up_process(tsk);
24989 +#endif
24991 +static void handle_softirq(unsigned int vec_nr)
24993 +       struct softirq_action *h = softirq_vec + vec_nr;
24994 +       int prev_count;
24996 +       prev_count = preempt_count();
24998 +       kstat_incr_softirqs_this_cpu(vec_nr);
25000 +       trace_softirq_entry(vec_nr);
25001 +       h->action(h);
25002 +       trace_softirq_exit(vec_nr);
25003 +       if (unlikely(prev_count != preempt_count())) {
25004 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
25005 +                      vec_nr, softirq_to_name[vec_nr], h->action,
25006 +                      prev_count, preempt_count());
25007 +               preempt_count_set(prev_count);
25008 +       }
25011 +#ifndef CONFIG_PREEMPT_RT_FULL
25012 +static inline int ksoftirqd_softirq_pending(void)
25014 +       return local_softirq_pending();
25017 +static void handle_pending_softirqs(u32 pending)
25019 +       struct softirq_action *h = softirq_vec;
25020 +       int softirq_bit;
25022 +       local_irq_enable();
25024 +       h = softirq_vec;
25026 +       while ((softirq_bit = ffs(pending))) {
25027 +               unsigned int vec_nr;
25029 +               h += softirq_bit - 1;
25030 +               vec_nr = h - softirq_vec;
25031 +               handle_softirq(vec_nr);
25033 +               h++;
25034 +               pending >>= softirq_bit;
25035 +       }
25037 +       rcu_bh_qs();
25038 +       local_irq_disable();
25041 +static void run_ksoftirqd(unsigned int cpu)
25043 +       local_irq_disable();
25044 +       if (ksoftirqd_softirq_pending()) {
25045 +               __do_softirq();
25046 +               local_irq_enable();
25047 +               cond_resched_rcu_qs();
25048 +               return;
25049 +       }
25050 +       local_irq_enable();
25053  /*
25054   * preempt_count and SOFTIRQ_OFFSET usage:
25055   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
25056 @@ -116,9 +287,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
25058         if (preempt_count() == cnt) {
25059  #ifdef CONFIG_DEBUG_PREEMPT
25060 -               current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
25061 +               current->preempt_disable_ip = get_lock_parent_ip();
25062  #endif
25063 -               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
25064 +               trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
25065         }
25067  EXPORT_SYMBOL(__local_bh_disable_ip);
25068 @@ -232,10 +403,8 @@ asmlinkage __visible void __do_softirq(void)
25069         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
25070         unsigned long old_flags = current->flags;
25071         int max_restart = MAX_SOFTIRQ_RESTART;
25072 -       struct softirq_action *h;
25073         bool in_hardirq;
25074         __u32 pending;
25075 -       int softirq_bit;
25077         /*
25078          * Mask out PF_MEMALLOC s current task context is borrowed for the
25079 @@ -254,36 +423,7 @@ asmlinkage __visible void __do_softirq(void)
25080         /* Reset the pending bitmask before enabling irqs */
25081         set_softirq_pending(0);
25083 -       local_irq_enable();
25085 -       h = softirq_vec;
25087 -       while ((softirq_bit = ffs(pending))) {
25088 -               unsigned int vec_nr;
25089 -               int prev_count;
25091 -               h += softirq_bit - 1;
25093 -               vec_nr = h - softirq_vec;
25094 -               prev_count = preempt_count();
25096 -               kstat_incr_softirqs_this_cpu(vec_nr);
25098 -               trace_softirq_entry(vec_nr);
25099 -               h->action(h);
25100 -               trace_softirq_exit(vec_nr);
25101 -               if (unlikely(prev_count != preempt_count())) {
25102 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
25103 -                              vec_nr, softirq_to_name[vec_nr], h->action,
25104 -                              prev_count, preempt_count());
25105 -                       preempt_count_set(prev_count);
25106 -               }
25107 -               h++;
25108 -               pending >>= softirq_bit;
25109 -       }
25111 -       rcu_bh_qs();
25112 -       local_irq_disable();
25113 +       handle_pending_softirqs(pending);
25115         pending = local_softirq_pending();
25116         if (pending) {
25117 @@ -319,6 +459,310 @@ asmlinkage __visible void do_softirq(void)
25118         local_irq_restore(flags);
25122 + * This function must run with irqs disabled!
25123 + */
25124 +void raise_softirq_irqoff(unsigned int nr)
25126 +       __raise_softirq_irqoff(nr);
25128 +       /*
25129 +        * If we're in an interrupt or softirq, we're done
25130 +        * (this also catches softirq-disabled code). We will
25131 +        * actually run the softirq once we return from
25132 +        * the irq or softirq.
25133 +        *
25134 +        * Otherwise we wake up ksoftirqd to make sure we
25135 +        * schedule the softirq soon.
25136 +        */
25137 +       if (!in_interrupt())
25138 +               wakeup_softirqd();
25141 +void __raise_softirq_irqoff(unsigned int nr)
25143 +       trace_softirq_raise(nr);
25144 +       or_softirq_pending(1UL << nr);
25147 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
25148 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
25149 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
25151 +#else /* !PREEMPT_RT_FULL */
25154 + * On RT we serialize softirq execution with a cpu local lock per softirq
25155 + */
25156 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
25158 +void __init softirq_early_init(void)
25160 +       int i;
25162 +       for (i = 0; i < NR_SOFTIRQS; i++)
25163 +               local_irq_lock_init(local_softirq_locks[i]);
25166 +static void lock_softirq(int which)
25168 +       local_lock(local_softirq_locks[which]);
25171 +static void unlock_softirq(int which)
25173 +       local_unlock(local_softirq_locks[which]);
25176 +static void do_single_softirq(int which)
25178 +       unsigned long old_flags = current->flags;
25180 +       current->flags &= ~PF_MEMALLOC;
25181 +       vtime_account_irq_enter(current);
25182 +       current->flags |= PF_IN_SOFTIRQ;
25183 +       lockdep_softirq_enter();
25184 +       local_irq_enable();
25185 +       handle_softirq(which);
25186 +       local_irq_disable();
25187 +       lockdep_softirq_exit();
25188 +       current->flags &= ~PF_IN_SOFTIRQ;
25189 +       vtime_account_irq_enter(current);
25190 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
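do_single_softirq() relies on the per-softirq local locks declared just above. As a rough illustration of the primitive itself (not part of the patch, and assuming the <linux/locallock.h> included at the top of this file provides local_irq_lock_init(), local_lock() and local_unlock() with the semantics used here): on !RT a local lock reduces to preemption/interrupt disabling, while on RT it is a per-CPU sleeping lock, so the holder can be preempted and priority-boosted instead of stalling the CPU. The my_* names are illustrative.

	#include <linux/locallock.h>	/* added elsewhere in this patch */
	#include <linux/percpu.h>

	struct my_pcpu_stats { unsigned long events; };

	static DEFINE_PER_CPU(struct local_irq_lock, my_lock);
	static DEFINE_PER_CPU(struct my_pcpu_stats, my_stats);

	static void my_stats_init(void)
	{
		local_irq_lock_init(my_lock);		/* initializes the lock on every CPU */
	}

	static void my_count_event(void)
	{
		local_lock(my_lock);			/* serializes against this CPU only */
		this_cpu_inc(my_stats.events);
		local_unlock(my_lock);
	}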
25194 + * Called with interrupts disabled. Process softirqs which were raised
25195 + * in current context (or on behalf of ksoftirqd).
25196 + */
25197 +static void do_current_softirqs(void)
25199 +       while (current->softirqs_raised) {
25200 +               int i = __ffs(current->softirqs_raised);
25201 +               unsigned int pending, mask = (1U << i);
25203 +               current->softirqs_raised &= ~mask;
25204 +               local_irq_enable();
25206 +               /*
25207 +                * If the lock is contended, we boost the owner to
25208 +                * process the softirq or leave the critical section
25209 +                * now.
25210 +                */
25211 +               lock_softirq(i);
25212 +               local_irq_disable();
25213 +               softirq_set_runner(i);
25214 +               /*
25215 +                * Check the local_softirq_pending() bits to see
25216 +                * whether we still need to process this or if someone
25217 +                * else already took care of it.
25218 +                */
25219 +               pending = local_softirq_pending();
25220 +               if (pending & mask) {
25221 +                       set_softirq_pending(pending & ~mask);
25222 +                       do_single_softirq(i);
25223 +               }
25224 +               softirq_clr_runner(i);
25225 +               WARN_ON(current->softirq_nestcnt != 1);
25226 +               local_irq_enable();
25227 +               unlock_softirq(i);
25228 +               local_irq_disable();
25229 +       }
25232 +void __local_bh_disable(void)
25234 +       if (++current->softirq_nestcnt == 1)
25235 +               migrate_disable();
25237 +EXPORT_SYMBOL(__local_bh_disable);
25239 +void __local_bh_enable(void)
25241 +       if (WARN_ON(current->softirq_nestcnt == 0))
25242 +               return;
25244 +       local_irq_disable();
25245 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
25246 +               do_current_softirqs();
25247 +       local_irq_enable();
25249 +       if (--current->softirq_nestcnt == 0)
25250 +               migrate_enable();
25252 +EXPORT_SYMBOL(__local_bh_enable);
25254 +void _local_bh_enable(void)
25256 +       if (WARN_ON(current->softirq_nestcnt == 0))
25257 +               return;
25258 +       if (--current->softirq_nestcnt == 0)
25259 +               migrate_enable();
25261 +EXPORT_SYMBOL(_local_bh_enable);
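Taken together, the helpers above change what a bottom-half protected section means on RT: the local_bh_disable() wrapper (which the rest of this patch wires up to __local_bh_disable() for PREEMPT_RT_FULL) only bumps current->softirq_nestcnt and disables migration, and local_bh_enable() runs any softirqs raised meanwhile in the caller's own context. A small usage sketch under those assumptions; the my_* names are invented.

	#include <linux/bottom_half.h>

	/* hypothetical data shared between process context and a softirq handler */
	static int my_shared_state;

	static void my_update_shared_state(int v)
	{
		local_bh_disable();	/* on RT: migrate_disable + softirq serialization */
		my_shared_state = v;
		local_bh_enable();	/* may process softirqs raised in the meantime */
	}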
25263 +int in_serving_softirq(void)
25265 +       return current->flags & PF_IN_SOFTIRQ;
25267 +EXPORT_SYMBOL(in_serving_softirq);
25269 +/* Called with preemption disabled */
25270 +static void run_ksoftirqd(unsigned int cpu)
25272 +       local_irq_disable();
25273 +       current->softirq_nestcnt++;
25275 +       do_current_softirqs();
25276 +       current->softirq_nestcnt--;
25277 +       local_irq_enable();
25278 +       cond_resched_rcu_qs();
25282 + * Called from netif_rx_ni(). Preemption enabled, but migration
25283 + * disabled. So the cpu can't go away under us.
25284 + */
25285 +void thread_do_softirq(void)
25287 +       if (!in_serving_softirq() && current->softirqs_raised) {
25288 +               current->softirq_nestcnt++;
25289 +               do_current_softirqs();
25290 +               current->softirq_nestcnt--;
25291 +       }
25294 +static void do_raise_softirq_irqoff(unsigned int nr)
25296 +       unsigned int mask;
25298 +       mask = 1UL << nr;
25300 +       trace_softirq_raise(nr);
25301 +       or_softirq_pending(mask);
25303 +       /*
25304 +        * If we are not in a hard interrupt and inside a bh disabled
25305 +        * region, we simply raise the flag on current. local_bh_enable()
25306 +        * will make sure that the softirq is executed. Otherwise we
25307 +        * delegate it to ksoftirqd.
25308 +        */
25309 +       if (!in_irq() && current->softirq_nestcnt)
25310 +               current->softirqs_raised |= mask;
25311 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
25312 +               return;
25314 +       if (mask & TIMER_SOFTIRQS)
25315 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
25316 +       else
25317 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
25320 +static void wakeup_proper_softirq(unsigned int nr)
25322 +       if ((1UL << nr) & TIMER_SOFTIRQS)
25323 +               wakeup_timer_softirqd();
25324 +       else
25325 +               wakeup_softirqd();
25329 +void __raise_softirq_irqoff(unsigned int nr)
25331 +       do_raise_softirq_irqoff(nr);
25332 +       if (!in_irq() && !current->softirq_nestcnt)
25333 +               wakeup_proper_softirq(nr);
25337 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
25338 + */
25339 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
25341 +       unsigned int mask;
25343 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
25344 +                        !__this_cpu_read(ktimer_softirqd)))
25345 +               return;
25346 +       mask = 1UL << nr;
25348 +       trace_softirq_raise(nr);
25349 +       or_softirq_pending(mask);
25350 +       if (mask & TIMER_SOFTIRQS)
25351 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
25352 +       else
25353 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
25354 +       wakeup_proper_softirq(nr);
25358 + * This function must run with irqs disabled!
25359 + */
25360 +void raise_softirq_irqoff(unsigned int nr)
25362 +       do_raise_softirq_irqoff(nr);
25364 +       /*
25365 +        * If we're in a hard interrupt we let the irq return code deal
25366 +        * with the wakeup of ksoftirqd.
25367 +        */
25368 +       if (in_irq())
25369 +               return;
25370 +       /*
25371 +        * If we are in thread context but outside of a bh disabled
25372 +        * region, we need to wake ksoftirqd as well.
25373 +        *
25374 +        * CHECKME: Some of the places which do that could be wrapped
25375 +        * into local_bh_disable/enable pairs. Though it's unclear
25376 +        * whether this is worth the effort. To find those places just
25377 +        * raise a WARN() if the condition is met.
25378 +        */
25379 +       if (!current->softirq_nestcnt)
25380 +               wakeup_proper_softirq(nr);
25383 +static inline int ksoftirqd_softirq_pending(void)
25385 +       return current->softirqs_raised;
25388 +static inline void local_bh_disable_nort(void) { }
25389 +static inline void _local_bh_enable_nort(void) { }
25391 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
25393 +       /* Take over all but timer pending softirqs when starting */
25394 +       local_irq_disable();
25395 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
25396 +       local_irq_enable();
25399 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
25401 +       struct sched_param param = { .sched_priority = 1 };
25403 +       sched_setscheduler(current, SCHED_FIFO, &param);
25405 +       /* Take over timer pending softirqs when starting */
25406 +       local_irq_disable();
25407 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
25408 +       local_irq_enable();
25411 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
25412 +                                                   bool online)
25414 +       struct sched_param param = { .sched_priority = 0 };
25416 +       sched_setscheduler(current, SCHED_NORMAL, &param);
25419 +static int ktimer_softirqd_should_run(unsigned int cpu)
25421 +       return current->softirqs_raised;
25424 +#endif /* PREEMPT_RT_FULL */
25425  /*
25426   * Enter an interrupt context.
25427   */
25428 @@ -330,9 +774,9 @@ void irq_enter(void)
25429                  * Prevent raise_softirq from needlessly waking up ksoftirqd
25430                  * here, as softirq will be serviced on return from interrupt.
25431                  */
25432 -               local_bh_disable();
25433 +               local_bh_disable_nort();
25434                 tick_irq_enter();
25435 -               _local_bh_enable();
25436 +               _local_bh_enable_nort();
25437         }
25439         __irq_enter();
25440 @@ -340,6 +784,7 @@ void irq_enter(void)
25442  static inline void invoke_softirq(void)
25444 +#ifndef CONFIG_PREEMPT_RT_FULL
25445         if (!force_irqthreads) {
25446  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
25447                 /*
25448 @@ -359,6 +804,18 @@ static inline void invoke_softirq(void)
25449         } else {
25450                 wakeup_softirqd();
25451         }
25452 +#else /* PREEMPT_RT_FULL */
25453 +       unsigned long flags;
25455 +       local_irq_save(flags);
25456 +       if (__this_cpu_read(ksoftirqd) &&
25457 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
25458 +               wakeup_softirqd();
25459 +       if (__this_cpu_read(ktimer_softirqd) &&
25460 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
25461 +               wakeup_timer_softirqd();
25462 +       local_irq_restore(flags);
25463 +#endif
25466  static inline void tick_irq_exit(void)
25467 @@ -395,26 +852,6 @@ void irq_exit(void)
25468         trace_hardirq_exit(); /* must be last! */
25472 - * This function must run with irqs disabled!
25473 - */
25474 -inline void raise_softirq_irqoff(unsigned int nr)
25476 -       __raise_softirq_irqoff(nr);
25478 -       /*
25479 -        * If we're in an interrupt or softirq, we're done
25480 -        * (this also catches softirq-disabled code). We will
25481 -        * actually run the softirq once we return from
25482 -        * the irq or softirq.
25483 -        *
25484 -        * Otherwise we wake up ksoftirqd to make sure we
25485 -        * schedule the softirq soon.
25486 -        */
25487 -       if (!in_interrupt())
25488 -               wakeup_softirqd();
25491  void raise_softirq(unsigned int nr)
25493         unsigned long flags;
25494 @@ -424,12 +861,6 @@ void raise_softirq(unsigned int nr)
25495         local_irq_restore(flags);
25498 -void __raise_softirq_irqoff(unsigned int nr)
25500 -       trace_softirq_raise(nr);
25501 -       or_softirq_pending(1UL << nr);
25504  void open_softirq(int nr, void (*action)(struct softirq_action *))
25506         softirq_vec[nr].action = action;
25507 @@ -446,15 +877,45 @@ struct tasklet_head {
25508  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
25509  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
25511 +static inline void
25512 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
25514 +       if (tasklet_trylock(t)) {
25515 +again:
25516 +               /* We may have been preempted before tasklet_trylock
25517 +                * and __tasklet_action may have already run.
25518 +                * So double check the sched bit while the tasklet
25519 +                * is locked before adding it to the list.
25520 +                */
25521 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
25522 +                       t->next = NULL;
25523 +                       *head->tail = t;
25524 +                       head->tail = &(t->next);
25525 +                       raise_softirq_irqoff(nr);
25526 +                       tasklet_unlock(t);
25527 +               } else {
25528 +                       /* This is subtle. If we hit the corner case above
25529 +                        * It is possible that we get preempted right here,
25530 +                        * and another task has successfully called
25531 +                        * tasklet_schedule(), then this function, and
25532 +                        * failed on the trylock. Thus we must be sure
25533 +                        * before releasing the tasklet lock, that the
25534 +                        * SCHED_BIT is clear. Otherwise the tasklet
25535 +                        * may get its SCHED_BIT set, but not added to the
25536 +                        * list
25537 +                        */
25538 +                       if (!tasklet_tryunlock(t))
25539 +                               goto again;
25540 +               }
25541 +       }
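The scheduling path above hinges on the tasklet state word: tasklet_trylock() sets the RUN bit, and tasklet_tryunlock() only permits the exact RUN -> 0 transition, so a SCHED bit set concurrently forces another pass. Below is a user-space model of those two operations as a sketch (not part of the patch), using C11 atomics where the kernel uses test_and_set_bit() and cmpxchg() on t->state:

	#include <stdatomic.h>
	#include <stdbool.h>

	enum { ST_SCHED = 1u << 0, ST_RUN = 1u << 1, ST_PENDING = 1u << 2 };

	static bool trylock(_Atomic unsigned *state)
	{
		/* like tasklet_trylock(): fails if the tasklet is already running */
		return !(atomic_fetch_or(state, ST_RUN) & ST_RUN);
	}

	static bool tryunlock(_Atomic unsigned *state)
	{
		/* only the pure RUN -> 0 transition is allowed; if SCHED or PENDING
		 * was set meanwhile, the caller must handle that before dropping RUN */
		unsigned expected = ST_RUN;

		return atomic_compare_exchange_strong(state, &expected, 0);
	}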
25544  void __tasklet_schedule(struct tasklet_struct *t)
25546         unsigned long flags;
25548         local_irq_save(flags);
25549 -       t->next = NULL;
25550 -       *__this_cpu_read(tasklet_vec.tail) = t;
25551 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
25552 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
25553 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
25554         local_irq_restore(flags);
25556  EXPORT_SYMBOL(__tasklet_schedule);
25557 @@ -464,10 +925,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
25558         unsigned long flags;
25560         local_irq_save(flags);
25561 -       t->next = NULL;
25562 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
25563 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
25564 -       raise_softirq_irqoff(HI_SOFTIRQ);
25565 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
25566         local_irq_restore(flags);
25568  EXPORT_SYMBOL(__tasklet_hi_schedule);
25569 @@ -476,82 +934,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
25571         BUG_ON(!irqs_disabled());
25573 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
25574 -       __this_cpu_write(tasklet_hi_vec.head, t);
25575 -       __raise_softirq_irqoff(HI_SOFTIRQ);
25576 +       __tasklet_hi_schedule(t);
25578  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
25580 -static void tasklet_action(struct softirq_action *a)
25581 +void  tasklet_enable(struct tasklet_struct *t)
25583 -       struct tasklet_struct *list;
25584 +       if (!atomic_dec_and_test(&t->count))
25585 +               return;
25586 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
25587 +               tasklet_schedule(t);
25589 +EXPORT_SYMBOL(tasklet_enable);
25591 -       local_irq_disable();
25592 -       list = __this_cpu_read(tasklet_vec.head);
25593 -       __this_cpu_write(tasklet_vec.head, NULL);
25594 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
25595 -       local_irq_enable();
25596 +static void __tasklet_action(struct softirq_action *a,
25597 +                            struct tasklet_struct *list)
25599 +       int loops = 1000000;
25601         while (list) {
25602                 struct tasklet_struct *t = list;
25604                 list = list->next;
25606 -               if (tasklet_trylock(t)) {
25607 -                       if (!atomic_read(&t->count)) {
25608 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
25609 -                                                       &t->state))
25610 -                                       BUG();
25611 -                               t->func(t->data);
25612 -                               tasklet_unlock(t);
25613 -                               continue;
25614 -                       }
25615 -                       tasklet_unlock(t);
25616 +               /*
25617 +                * Should always succeed - after a tasklet got on the
25618 +                * list (after getting the SCHED bit set from 0 to 1),
25619 +                * nothing but the tasklet softirq it got queued to can
25620 +                * lock it:
25621 +                */
25622 +               if (!tasklet_trylock(t)) {
25623 +                       WARN_ON(1);
25624 +                       continue;
25625                 }
25627 -               local_irq_disable();
25628                 t->next = NULL;
25629 -               *__this_cpu_read(tasklet_vec.tail) = t;
25630 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
25631 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
25632 -               local_irq_enable();
25634 +               /*
25635 +                * If we cannot handle the tasklet because it's disabled,
25636 +                * mark it as pending. tasklet_enable() will later
25637 +                * re-schedule the tasklet.
25638 +                */
25639 +               if (unlikely(atomic_read(&t->count))) {
25640 +out_disabled:
25641 +                       /* implicit unlock: */
25642 +                       wmb();
25643 +                       t->state = TASKLET_STATEF_PENDING;
25644 +                       continue;
25645 +               }
25647 +               /*
25648 +                * From this point on the tasklet might be rescheduled
25649 +                * on another CPU, but it can only be added to another
25650 +                * CPU's tasklet list if we unlock the tasklet (which we
25651 +                * don't do yet).
25652 +                */
25653 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
25654 +                       WARN_ON(1);
25656 +again:
25657 +               t->func(t->data);
25659 +               /*
25660 +                * Try to unlock the tasklet. We must use cmpxchg, because
25661 +                * another CPU might have scheduled or disabled the tasklet.
25662 +                * We only allow the STATE_RUN -> 0 transition here.
25663 +                */
25664 +               while (!tasklet_tryunlock(t)) {
25665 +                       /*
25666 +                        * If it got disabled meanwhile, bail out:
25667 +                        */
25668 +                       if (atomic_read(&t->count))
25669 +                               goto out_disabled;
25670 +                       /*
25671 +                        * If it got scheduled meanwhile, re-execute
25672 +                        * the tasklet function:
25673 +                        */
25674 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
25675 +                               goto again;
25676 +                       if (!--loops) {
25677 +                               printk("hm, tasklet state: %08lx\n", t->state);
25678 +                               WARN_ON(1);
25679 +                               tasklet_unlock(t);
25680 +                               break;
25681 +                       }
25682 +               }
25683         }
25686 +static void tasklet_action(struct softirq_action *a)
25688 +       struct tasklet_struct *list;
25690 +       local_irq_disable();
25692 +       list = __this_cpu_read(tasklet_vec.head);
25693 +       __this_cpu_write(tasklet_vec.head, NULL);
25694 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
25696 +       local_irq_enable();
25698 +       __tasklet_action(a, list);
25701  static void tasklet_hi_action(struct softirq_action *a)
25703         struct tasklet_struct *list;
25705         local_irq_disable();
25707         list = __this_cpu_read(tasklet_hi_vec.head);
25708         __this_cpu_write(tasklet_hi_vec.head, NULL);
25709         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
25710 -       local_irq_enable();
25712 -       while (list) {
25713 -               struct tasklet_struct *t = list;
25715 -               list = list->next;
25717 -               if (tasklet_trylock(t)) {
25718 -                       if (!atomic_read(&t->count)) {
25719 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
25720 -                                                       &t->state))
25721 -                                       BUG();
25722 -                               t->func(t->data);
25723 -                               tasklet_unlock(t);
25724 -                               continue;
25725 -                       }
25726 -                       tasklet_unlock(t);
25727 -               }
25728 +       local_irq_enable();
25730 -               local_irq_disable();
25731 -               t->next = NULL;
25732 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
25733 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
25734 -               __raise_softirq_irqoff(HI_SOFTIRQ);
25735 -               local_irq_enable();
25736 -       }
25737 +       __tasklet_action(a, list);
25740  void tasklet_init(struct tasklet_struct *t,
25741 @@ -572,7 +1070,7 @@ void tasklet_kill(struct tasklet_struct *t)
25743         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
25744                 do {
25745 -                       yield();
25746 +                       msleep(1);
25747                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
25748         }
25749         tasklet_unlock_wait(t);
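Since tasklet_kill() now sleeps in msleep(1) instead of yield()ing, and tasklet_unlock_wait() does the same under PREEMPT_RT_FULL, both must only be called from preemptible process context. For reference, a conventional tasklet lifecycle against the unchanged public API (a sketch only; the my_* names and the probe/irq/remove split are illustrative):

	#include <linux/interrupt.h>

	static void my_tasklet_fn(unsigned long data)
	{
		/* runs in softirq context (thread context under PREEMPT_RT) */
	}

	static struct tasklet_struct my_tasklet;

	static int my_probe(void)
	{
		tasklet_init(&my_tasklet, my_tasklet_fn, 0);
		return 0;
	}

	static irqreturn_t my_irq(int irq, void *dev)
	{
		tasklet_schedule(&my_tasklet);	/* sets SCHED, queues via the path above */
		return IRQ_HANDLED;
	}

	static void my_remove(void)
	{
		tasklet_kill(&my_tasklet);	/* may sleep: process context only */
	}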
25750 @@ -646,25 +1144,26 @@ void __init softirq_init(void)
25751         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
25754 -static int ksoftirqd_should_run(unsigned int cpu)
25756 -       return local_softirq_pending();
25759 -static void run_ksoftirqd(unsigned int cpu)
25760 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
25761 +void tasklet_unlock_wait(struct tasklet_struct *t)
25763 -       local_irq_disable();
25764 -       if (local_softirq_pending()) {
25765 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
25766                 /*
25767 -                * We can safely run softirq on inline stack, as we are not deep
25768 -                * in the task stack here.
25769 +                * Hack for now to avoid this busy-loop:
25770                  */
25771 -               __do_softirq();
25772 -               local_irq_enable();
25773 -               cond_resched_rcu_qs();
25774 -               return;
25775 +#ifdef CONFIG_PREEMPT_RT_FULL
25776 +               msleep(1);
25777 +#else
25778 +               barrier();
25779 +#endif
25780         }
25781 -       local_irq_enable();
25783 +EXPORT_SYMBOL(tasklet_unlock_wait);
25784 +#endif
25786 +static int ksoftirqd_should_run(unsigned int cpu)
25788 +       return ksoftirqd_softirq_pending();
25791  #ifdef CONFIG_HOTPLUG_CPU
25792 @@ -746,16 +1245,31 @@ static struct notifier_block cpu_nfb = {
25794  static struct smp_hotplug_thread softirq_threads = {
25795         .store                  = &ksoftirqd,
25796 +       .setup                  = ksoftirqd_set_sched_params,
25797         .thread_should_run      = ksoftirqd_should_run,
25798         .thread_fn              = run_ksoftirqd,
25799         .thread_comm            = "ksoftirqd/%u",
25800  };
25802 +#ifdef CONFIG_PREEMPT_RT_FULL
25803 +static struct smp_hotplug_thread softirq_timer_threads = {
25804 +       .store                  = &ktimer_softirqd,
25805 +       .setup                  = ktimer_softirqd_set_sched_params,
25806 +       .cleanup                = ktimer_softirqd_clr_sched_params,
25807 +       .thread_should_run      = ktimer_softirqd_should_run,
25808 +       .thread_fn              = run_ksoftirqd,
25809 +       .thread_comm            = "ktimersoftd/%u",
25811 +#endif
25813  static __init int spawn_ksoftirqd(void)
25815         register_cpu_notifier(&cpu_nfb);
25817         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
25818 +#ifdef CONFIG_PREEMPT_RT_FULL
25819 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
25820 +#endif
25822         return 0;
25824 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
25825 index a3bbaee77c58..f84d3b45cda7 100644
25826 --- a/kernel/stop_machine.c
25827 +++ b/kernel/stop_machine.c
25828 @@ -37,7 +37,7 @@ struct cpu_stop_done {
25829  struct cpu_stopper {
25830         struct task_struct      *thread;
25832 -       spinlock_t              lock;
25833 +       raw_spinlock_t          lock;
25834         bool                    enabled;        /* is this stopper enabled? */
25835         struct list_head        works;          /* list of pending works */
25837 @@ -86,12 +86,12 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
25838         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
25839         unsigned long flags;
25841 -       spin_lock_irqsave(&stopper->lock, flags);
25842 +       raw_spin_lock_irqsave(&stopper->lock, flags);
25843         if (stopper->enabled)
25844                 __cpu_stop_queue_work(stopper, work);
25845         else
25846                 cpu_stop_signal_done(work->done, false);
25847 -       spin_unlock_irqrestore(&stopper->lock, flags);
25848 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
25851  /**
25852 @@ -224,8 +224,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
25853         int err;
25855         lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
25856 -       spin_lock_irq(&stopper1->lock);
25857 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
25858 +       raw_spin_lock_irq(&stopper1->lock);
25859 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
25861         err = -ENOENT;
25862         if (!stopper1->enabled || !stopper2->enabled)
25863 @@ -235,8 +235,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
25864         __cpu_stop_queue_work(stopper1, work1);
25865         __cpu_stop_queue_work(stopper2, work2);
25866  unlock:
25867 -       spin_unlock(&stopper2->lock);
25868 -       spin_unlock_irq(&stopper1->lock);
25869 +       raw_spin_unlock(&stopper2->lock);
25870 +       raw_spin_unlock_irq(&stopper1->lock);
25871         lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
25873         return err;
25874 @@ -258,7 +258,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
25875         struct cpu_stop_work work1, work2;
25876         struct multi_stop_data msdata;
25878 -       preempt_disable();
25879 +       preempt_disable_nort();
25880         msdata = (struct multi_stop_data){
25881                 .fn = fn,
25882                 .data = arg,
25883 @@ -278,11 +278,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
25884         if (cpu1 > cpu2)
25885                 swap(cpu1, cpu2);
25886         if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
25887 -               preempt_enable();
25888 +               preempt_enable_nort();
25889                 return -ENOENT;
25890         }
25892 -       preempt_enable();
25893 +       preempt_enable_nort();
25895         wait_for_completion(&done.completion);
25897 @@ -315,17 +315,20 @@ static DEFINE_MUTEX(stop_cpus_mutex);
25899  static void queue_stop_cpus_work(const struct cpumask *cpumask,
25900                                  cpu_stop_fn_t fn, void *arg,
25901 -                                struct cpu_stop_done *done)
25902 +                                struct cpu_stop_done *done, bool inactive)
25904         struct cpu_stop_work *work;
25905         unsigned int cpu;
25907         /*
25908 -        * Disable preemption while queueing to avoid getting
25909 -        * preempted by a stopper which might wait for other stoppers
25910 -        * to enter @fn which can lead to deadlock.
25911 +        * Make sure that all work is queued on all cpus before
25912 +        * any of the cpus can execute it.
25913          */
25914 -       lg_global_lock(&stop_cpus_lock);
25915 +       if (!inactive)
25916 +               lg_global_lock(&stop_cpus_lock);
25917 +       else
25918 +               lg_global_trylock_relax(&stop_cpus_lock);
25920         for_each_cpu(cpu, cpumask) {
25921                 work = &per_cpu(cpu_stopper.stop_work, cpu);
25922                 work->fn = fn;
25923 @@ -342,7 +345,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
25924         struct cpu_stop_done done;
25926         cpu_stop_init_done(&done, cpumask_weight(cpumask));
25927 -       queue_stop_cpus_work(cpumask, fn, arg, &done);
25928 +       queue_stop_cpus_work(cpumask, fn, arg, &done, false);
25929         wait_for_completion(&done.completion);
25930         return done.executed ? done.ret : -ENOENT;
25932 @@ -422,9 +425,9 @@ static int cpu_stop_should_run(unsigned int cpu)
25933         unsigned long flags;
25934         int run;
25936 -       spin_lock_irqsave(&stopper->lock, flags);
25937 +       raw_spin_lock_irqsave(&stopper->lock, flags);
25938         run = !list_empty(&stopper->works);
25939 -       spin_unlock_irqrestore(&stopper->lock, flags);
25940 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
25941         return run;
25944 @@ -436,13 +439,13 @@ static void cpu_stopper_thread(unsigned int cpu)
25946  repeat:
25947         work = NULL;
25948 -       spin_lock_irq(&stopper->lock);
25949 +       raw_spin_lock_irq(&stopper->lock);
25950         if (!list_empty(&stopper->works)) {
25951                 work = list_first_entry(&stopper->works,
25952                                         struct cpu_stop_work, list);
25953                 list_del_init(&work->list);
25954         }
25955 -       spin_unlock_irq(&stopper->lock);
25956 +       raw_spin_unlock_irq(&stopper->lock);
25958         if (work) {
25959                 cpu_stop_fn_t fn = work->fn;
25960 @@ -450,6 +453,16 @@ static void cpu_stopper_thread(unsigned int cpu)
25961                 struct cpu_stop_done *done = work->done;
25962                 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
25964 +               /*
25965 +                * Wait until the stopper has finished scheduling on all
25966 +                * cpus
25967 +                */
25968 +               lg_global_lock(&stop_cpus_lock);
25969 +               /*
25970 +                * Let other cpu threads continue as well
25971 +                */
25972 +               lg_global_unlock(&stop_cpus_lock);
25974                 /* cpu stop callbacks are not allowed to sleep */
25975                 preempt_disable();
25977 @@ -520,10 +533,12 @@ static int __init cpu_stop_init(void)
25978         for_each_possible_cpu(cpu) {
25979                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
25981 -               spin_lock_init(&stopper->lock);
25982 +               raw_spin_lock_init(&stopper->lock);
25983                 INIT_LIST_HEAD(&stopper->works);
25984         }
25986 +       lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
25988         BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
25989         stop_machine_unpark(raw_smp_processor_id());
25990         stop_machine_initialized = true;
25991 @@ -620,7 +635,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
25992         set_state(&msdata, MULTI_STOP_PREPARE);
25993         cpu_stop_init_done(&done, num_active_cpus());
25994         queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
25995 -                            &done);
25996 +                            &done, true);
25997         ret = multi_cpu_stop(&msdata);
25999         /* Busy wait for completion. */
26000 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
26001 index 17f7bcff1e02..120fc8932165 100644
26002 --- a/kernel/time/hrtimer.c
26003 +++ b/kernel/time/hrtimer.c
26004 @@ -48,11 +48,13 @@
26005  #include <linux/sched/rt.h>
26006  #include <linux/sched/deadline.h>
26007  #include <linux/timer.h>
26008 +#include <linux/kthread.h>
26009  #include <linux/freezer.h>
26011  #include <asm/uaccess.h>
26013  #include <trace/events/timer.h>
26014 +#include <trace/events/hist.h>
26016  #include "tick-internal.h"
26018 @@ -717,6 +719,44 @@ static void clock_was_set_work(struct work_struct *work)
26020  static DECLARE_WORK(hrtimer_work, clock_was_set_work);
26022 +#ifdef CONFIG_PREEMPT_RT_FULL
26024 + * RT cannot call schedule_work() from hard interrupt context.
26025 + * Need a thread to do the real work.
26026 + */
26027 +static struct task_struct *clock_set_delay_thread;
26028 +static bool do_clock_set_delay;
26030 +static int run_clock_set_delay(void *ignore)
26032 +       while (!kthread_should_stop()) {
26033 +               set_current_state(TASK_INTERRUPTIBLE);
26034 +               if (do_clock_set_delay) {
26035 +                       do_clock_set_delay = false;
26036 +                       schedule_work(&hrtimer_work);
26037 +               }
26038 +               schedule();
26039 +       }
26040 +       __set_current_state(TASK_RUNNING);
26041 +       return 0;
26044 +void clock_was_set_delayed(void)
26046 +       do_clock_set_delay = true;
26047 +       /* Make visible before waking up process */
26048 +       smp_wmb();
26049 +       wake_up_process(clock_set_delay_thread);
26052 +static __init int create_clock_set_delay_thread(void)
26054 +       clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd");
26055 +       BUG_ON(!clock_set_delay_thread);
26056 +       return 0;
26058 +early_initcall(create_clock_set_delay_thread);
26059 +#else /* PREEMPT_RT_FULL */
26060  /*
26061   * Called from timekeeping and resume code to reprogramm the hrtimer
26062   * interrupt device on all cpus.
26063 @@ -725,6 +765,7 @@ void clock_was_set_delayed(void)
26065         schedule_work(&hrtimer_work);
26067 +#endif
26069  #else
26071 @@ -734,11 +775,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
26072  static inline void hrtimer_switch_to_hres(void) { }
26073  static inline void
26074  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
26075 -static inline int hrtimer_reprogram(struct hrtimer *timer,
26076 -                                   struct hrtimer_clock_base *base)
26078 -       return 0;
26080 +static inline void hrtimer_reprogram(struct hrtimer *timer,
26081 +                                    struct hrtimer_clock_base *base) { }
26082  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
26083  static inline void retrigger_next_event(void *arg) { }
26085 @@ -870,6 +908,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
26087  EXPORT_SYMBOL_GPL(hrtimer_forward);
26089 +#ifdef CONFIG_PREEMPT_RT_BASE
26090 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
26092 +/**
26093 + * hrtimer_wait_for_timer - Wait for a running timer
26094 + *
26095 + * @timer:     timer to wait for
26096 + *
26097 + * If the timer's callback function is currently executing, the
26098 + * function waits on the waitqueue of the timer base. The
26099 + * waitqueue is woken up after the timer callback function has
26100 + * finished execution.
26101 + */
26102 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
26104 +       struct hrtimer_clock_base *base = timer->base;
26106 +       if (base && base->cpu_base && !timer->irqsafe)
26107 +               wait_event(base->cpu_base->wait,
26108 +                               !(hrtimer_callback_running(timer)));
26111 +#else
26112 +# define wake_up_timer_waiters(b)      do { } while (0)
26113 +#endif
26115  /*
26116   * enqueue_hrtimer - internal function to (re)start a timer
26117   *
26118 @@ -911,6 +975,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
26119         if (!(state & HRTIMER_STATE_ENQUEUED))
26120                 return;
26122 +       if (unlikely(!list_empty(&timer->cb_entry))) {
26123 +               list_del_init(&timer->cb_entry);
26124 +               return;
26125 +       }
26127         if (!timerqueue_del(&base->active, &timer->node))
26128                 cpu_base->active_bases &= ~(1 << base->index);
26130 @@ -1006,7 +1075,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
26131         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
26133         timer_stats_hrtimer_set_start_info(timer);
26134 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
26135 +       {
26136 +               ktime_t now = new_base->get_time();
26138 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
26139 +                       timer->praecox = now;
26140 +               else
26141 +                       timer->praecox = ktime_set(0, 0);
26142 +       }
26143 +#endif
26144         leftmost = enqueue_hrtimer(timer, new_base);
26145         if (!leftmost)
26146                 goto unlock;
26147 @@ -1078,7 +1156,7 @@ int hrtimer_cancel(struct hrtimer *timer)
26149                 if (ret >= 0)
26150                         return ret;
26151 -               cpu_relax();
26152 +               hrtimer_wait_for_timer(timer);
26153         }
26155  EXPORT_SYMBOL_GPL(hrtimer_cancel);
26156 @@ -1142,6 +1220,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
26158         base = hrtimer_clockid_to_base(clock_id);
26159         timer->base = &cpu_base->clock_base[base];
26160 +       INIT_LIST_HEAD(&timer->cb_entry);
26161         timerqueue_init(&timer->node);
26163  #ifdef CONFIG_TIMER_STATS
26164 @@ -1182,6 +1261,7 @@ bool hrtimer_active(const struct hrtimer *timer)
26165                 seq = raw_read_seqcount_begin(&cpu_base->seq);
26167                 if (timer->state != HRTIMER_STATE_INACTIVE ||
26168 +                   cpu_base->running_soft == timer ||
26169                     cpu_base->running == timer)
26170                         return true;
26172 @@ -1280,10 +1360,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
26173         cpu_base->running = NULL;
26176 -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
26177 +#ifdef CONFIG_PREEMPT_RT_BASE
26178 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
26179 +                                struct hrtimer_clock_base *base)
26181 +       int leftmost;
26183 +       if (restart != HRTIMER_NORESTART &&
26184 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
26186 +               leftmost = enqueue_hrtimer(timer, base);
26187 +               if (!leftmost)
26188 +                       return;
26189 +#ifdef CONFIG_HIGH_RES_TIMERS
26190 +               if (!hrtimer_is_hres_active(timer)) {
26191 +                       /*
26192 +                        * Kick to reschedule the next tick so it handles the
26193 +                        * new timer on the dynticks target.
26194 +                        */
26195 +                       if (base->cpu_base->nohz_active)
26196 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
26197 +               } else {
26199 +                       hrtimer_reprogram(timer, base);
26200 +               }
26201 +#endif
26202 +       }
26206 + * The changes in mainline which removed the callback modes from
26207 + * hrtimer are not yet working with -rt. The non-wakeup_process()
26208 + * based callbacks, which involve sleeping locks, need to be treated
26209 + * separately.
26210 + */
26211 +static void hrtimer_rt_run_pending(void)
26213 +       enum hrtimer_restart (*fn)(struct hrtimer *);
26214 +       struct hrtimer_cpu_base *cpu_base;
26215 +       struct hrtimer_clock_base *base;
26216 +       struct hrtimer *timer;
26217 +       int index, restart;
26219 +       local_irq_disable();
26220 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
26222 +       raw_spin_lock(&cpu_base->lock);
26224 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
26225 +               base = &cpu_base->clock_base[index];
26227 +               while (!list_empty(&base->expired)) {
26228 +                       timer = list_first_entry(&base->expired,
26229 +                                                struct hrtimer, cb_entry);
26231 +                       /*
26232 +                        * Same as the __run_hrtimer() function above,
26233 +                        * except that we run with interrupts enabled.
26234 +                        */
26235 +                       debug_deactivate(timer);
26236 +                       cpu_base->running_soft = timer;
26237 +                       raw_write_seqcount_barrier(&cpu_base->seq);
26239 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
26240 +                       timer_stats_account_hrtimer(timer);
26241 +                       fn = timer->function;
26243 +                       raw_spin_unlock_irq(&cpu_base->lock);
26244 +                       restart = fn(timer);
26245 +                       raw_spin_lock_irq(&cpu_base->lock);
26247 +                       hrtimer_rt_reprogram(restart, timer, base);
26248 +                       raw_write_seqcount_barrier(&cpu_base->seq);
26250 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
26251 +                       cpu_base->running_soft = NULL;
26252 +               }
26253 +       }
26255 +       raw_spin_unlock_irq(&cpu_base->lock);
26257 +       wake_up_timer_waiters(cpu_base);
26260 +static int hrtimer_rt_defer(struct hrtimer *timer)
26262 +       if (timer->irqsafe)
26263 +               return 0;
26265 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
26266 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
26267 +       return 1;
26270 +#else
26272 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
26274 +#endif
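A minimal sketch of the opt-out that the deferral code above checks for: a timer whose callback is safe in hard interrupt context sets ->irqsafe so hrtimer_rt_defer() leaves it alone, just as this patch does for the tick sched_timer, the broadcast bctimer and the hrtimer_sleeper. my_timer and my_timer_fn are illustrative names.

static struct hrtimer my_timer;

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
	/* runs from hrtimer_interrupt() even on RT: no sleeping locks here */
	return HRTIMER_NORESTART;
}

static void my_timer_setup(void)
{
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_timer.function = my_timer_fn;
	my_timer.irqsafe = 1;	/* skip the ->expired list / HRTIMER_SOFTIRQ path */
}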
26276 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
26278 +static int __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
26280         struct hrtimer_clock_base *base = cpu_base->clock_base;
26281         unsigned int active = cpu_base->active_bases;
26282 +       int raise = 0;
26284         for (; active; base++, active >>= 1) {
26285                 struct timerqueue_node *node;
26286 @@ -1299,6 +1481,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
26288                         timer = container_of(node, struct hrtimer, node);
26290 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
26291 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
26292 +                               timer->praecox : hrtimer_get_expires(timer),
26293 +                               basenow)),
26294 +                           current,
26295 +                           timer->function == hrtimer_wakeup ?
26296 +                           container_of(timer, struct hrtimer_sleeper,
26297 +                               timer)->task : NULL);
26299                         /*
26300                          * The immediate goal for using the softexpires is
26301                          * minimizing wakeups, not running timers at the
26302 @@ -1314,9 +1505,13 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
26303                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
26304                                 break;
26306 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
26307 +                       if (!hrtimer_rt_defer(timer))
26308 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
26309 +                       else
26310 +                               raise = 1;
26311                 }
26312         }
26313 +       return raise;
26316  #ifdef CONFIG_HIGH_RES_TIMERS
26317 @@ -1330,6 +1525,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
26318         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
26319         ktime_t expires_next, now, entry_time, delta;
26320         int retries = 0;
26321 +       int raise;
26323         BUG_ON(!cpu_base->hres_active);
26324         cpu_base->nr_events++;
26325 @@ -1348,7 +1544,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
26326          */
26327         cpu_base->expires_next.tv64 = KTIME_MAX;
26329 -       __hrtimer_run_queues(cpu_base, now);
26330 +       raise = __hrtimer_run_queues(cpu_base, now);
26332         /* Reevaluate the clock bases for the next expiry */
26333         expires_next = __hrtimer_get_next_event(cpu_base);
26334 @@ -1359,6 +1555,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
26335         cpu_base->expires_next = expires_next;
26336         cpu_base->in_hrtirq = 0;
26337         raw_spin_unlock(&cpu_base->lock);
26338 +       if (raise)
26339 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
26341         /* Reprogramming necessary ? */
26342         if (!tick_program_event(expires_next, 0)) {
26343 @@ -1438,6 +1636,7 @@ void hrtimer_run_queues(void)
26345         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
26346         ktime_t now;
26347 +       int raise;
26349         if (__hrtimer_hres_active(cpu_base))
26350                 return;
26351 @@ -1456,8 +1655,10 @@ void hrtimer_run_queues(void)
26353         raw_spin_lock(&cpu_base->lock);
26354         now = hrtimer_update_base(cpu_base);
26355 -       __hrtimer_run_queues(cpu_base, now);
26356 +       raise = __hrtimer_run_queues(cpu_base, now);
26357         raw_spin_unlock(&cpu_base->lock);
26358 +       if (raise)
26359 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
26362  /*
26363 @@ -1479,16 +1680,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
26364  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
26366         sl->timer.function = hrtimer_wakeup;
26367 +       sl->timer.irqsafe = 1;
26368         sl->task = task;
26370  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
26372 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
26373 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
26374 +                               unsigned long state)
26376         hrtimer_init_sleeper(t, current);
26378         do {
26379 -               set_current_state(TASK_INTERRUPTIBLE);
26380 +               set_current_state(state);
26381                 hrtimer_start_expires(&t->timer, mode);
26383                 if (likely(t->task))
26384 @@ -1530,7 +1733,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
26385                                 HRTIMER_MODE_ABS);
26386         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
26388 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
26389 +       /* cpu_chill() does not care about restart state. */
26390 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
26391                 goto out;
26393         rmtp = restart->nanosleep.rmtp;
26394 @@ -1547,8 +1751,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
26395         return ret;
26398 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
26399 -                      const enum hrtimer_mode mode, const clockid_t clockid)
26400 +static long
26401 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
26402 +                   const enum hrtimer_mode mode, const clockid_t clockid,
26403 +                   unsigned long state)
26405         struct restart_block *restart;
26406         struct hrtimer_sleeper t;
26407 @@ -1561,7 +1767,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
26409         hrtimer_init_on_stack(&t.timer, clockid, mode);
26410         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
26411 -       if (do_nanosleep(&t, mode))
26412 +       if (do_nanosleep(&t, mode, state))
26413                 goto out;
26415         /* Absolute timers do not update the rmtp value and restart: */
26416 @@ -1588,6 +1794,12 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
26417         return ret;
26420 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
26421 +                      const enum hrtimer_mode mode, const clockid_t clockid)
26423 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
26426  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
26427                 struct timespec __user *, rmtp)
26429 @@ -1602,6 +1814,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
26430         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
26433 +#ifdef CONFIG_PREEMPT_RT_FULL
26435 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
26436 + */
26437 +void cpu_chill(void)
26439 +       struct timespec tu = {
26440 +               .tv_nsec = NSEC_PER_MSEC,
26441 +       };
26442 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
26444 +       current->flags |= PF_NOFREEZE;
26445 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
26446 +                           TASK_UNINTERRUPTIBLE);
26447 +       if (!freeze_flag)
26448 +               current->flags &= ~PF_NOFREEZE;
26450 +EXPORT_SYMBOL(cpu_chill);
26451 +#endif
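A minimal sketch of the kind of call site cpu_chill() is meant for: a trylock retry loop that would otherwise spin with cpu_relax() and could starve a preempted lock holder on RT. example_lock and lock_with_chill() are illustrative names.

static DEFINE_SPINLOCK(example_lock);

static void lock_with_chill(void)
{
	while (!spin_trylock(&example_lock))
		cpu_chill();	/* sleep ~1 ms so the holder can run and release */
}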
26453  /*
26454   * Functions related to boot-time initialization:
26455   */
26456 @@ -1613,15 +1845,19 @@ static void init_hrtimers_cpu(int cpu)
26457         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
26458                 cpu_base->clock_base[i].cpu_base = cpu_base;
26459                 timerqueue_init_head(&cpu_base->clock_base[i].active);
26460 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
26461         }
26463         cpu_base->cpu = cpu;
26464         hrtimer_init_hres(cpu_base);
26465 +#ifdef CONFIG_PREEMPT_RT_BASE
26466 +       init_waitqueue_head(&cpu_base->wait);
26467 +#endif
26470  #ifdef CONFIG_HOTPLUG_CPU
26472 -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
26473 +static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
26474                                 struct hrtimer_clock_base *new_base)
26476         struct hrtimer *timer;
26477 @@ -1649,12 +1885,21 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
26478                  */
26479                 enqueue_hrtimer(timer, new_base);
26480         }
26481 +#ifdef CONFIG_PREEMPT_RT_BASE
26482 +       list_splice_tail(&old_base->expired, &new_base->expired);
26483 +       /*
26484 +        * Tell the caller to raise HRTIMER_SOFTIRQ.  We can't safely
26485 +        * acquire ktimersoftd->pi_lock while the base lock is held.
26486 +        */
26487 +       return !list_empty(&new_base->expired);
26488 +#endif
26489 +       return 0;
26492  static void migrate_hrtimers(int scpu)
26494         struct hrtimer_cpu_base *old_base, *new_base;
26495 -       int i;
26496 +       int i, raise = 0;
26498         BUG_ON(cpu_online(scpu));
26499         tick_cancel_sched_timer(scpu);
26500 @@ -1670,13 +1915,16 @@ static void migrate_hrtimers(int scpu)
26501         raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
26503         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
26504 -               migrate_hrtimer_list(&old_base->clock_base[i],
26505 -                                    &new_base->clock_base[i]);
26506 +               raise |= migrate_hrtimer_list(&old_base->clock_base[i],
26507 +                                             &new_base->clock_base[i]);
26508         }
26510         raw_spin_unlock(&old_base->lock);
26511         raw_spin_unlock(&new_base->lock);
26513 +       if (raise)
26514 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
26516         /* Check, if we got expired work to do */
26517         __hrtimer_peek_ahead_timers();
26518         local_irq_enable();
26519 @@ -1714,11 +1962,21 @@ static struct notifier_block hrtimers_nb = {
26520         .notifier_call = hrtimer_cpu_notify,
26521  };
26523 +#ifdef CONFIG_PREEMPT_RT_BASE
26524 +static void run_hrtimer_softirq(struct softirq_action *h)
26526 +       hrtimer_rt_run_pending();
26528 +#endif
26530  void __init hrtimers_init(void)
26532         hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
26533                           (void *)(long)smp_processor_id());
26534         register_cpu_notifier(&hrtimers_nb);
26535 +#ifdef CONFIG_PREEMPT_RT_BASE
26536 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
26537 +#endif
26540  /**
26541 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
26542 index 1d5c7204ddc9..184de6751180 100644
26543 --- a/kernel/time/itimer.c
26544 +++ b/kernel/time/itimer.c
26545 @@ -213,6 +213,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
26546                 /* We are sharing ->siglock with it_real_fn() */
26547                 if (hrtimer_try_to_cancel(timer) < 0) {
26548                         spin_unlock_irq(&tsk->sighand->siglock);
26549 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
26550                         goto again;
26551                 }
26552                 expires = timeval_to_ktime(value->it_value);
26553 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
26554 index 347fecf86a3f..2ede47408a3e 100644
26555 --- a/kernel/time/jiffies.c
26556 +++ b/kernel/time/jiffies.c
26557 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
26558         .max_cycles     = 10,
26559  };
26561 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
26562 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
26563 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
26565  #if (BITS_PER_LONG < 64)
26566  u64 get_jiffies_64(void)
26567 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
26568         u64 ret;
26570         do {
26571 -               seq = read_seqbegin(&jiffies_lock);
26572 +               seq = read_seqcount_begin(&jiffies_seq);
26573                 ret = jiffies_64;
26574 -       } while (read_seqretry(&jiffies_lock, seq));
26575 +       } while (read_seqcount_retry(&jiffies_seq, seq));
26576         return ret;
26578  EXPORT_SYMBOL(get_jiffies_64);
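For orientation, a minimal sketch of the split locking scheme introduced here: writers serialize on the raw spinlock and bump the seqcount, while readers retry on the seqcount alone, matching the tick-common.c, tick-sched.c and timekeeping.c conversions later in this patch. The example_* function names are illustrative.

static void example_jiffies_writer(unsigned long ticks)
{
	raw_spin_lock(&jiffies_lock);		/* raw lock: usable on RT */
	write_seqcount_begin(&jiffies_seq);
	do_timer(ticks);
	write_seqcount_end(&jiffies_seq);
	raw_spin_unlock(&jiffies_lock);
}

static u64 example_jiffies_reader(void)
{
	unsigned int seq;
	u64 ret;

	do {
		seq = read_seqcount_begin(&jiffies_seq);
		ret = jiffies_64;
	} while (read_seqcount_retry(&jiffies_seq, seq));
	return ret;
}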
26579 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
26580 index ab861771e37f..0f6868fd2de6 100644
26581 --- a/kernel/time/ntp.c
26582 +++ b/kernel/time/ntp.c
26583 @@ -10,6 +10,7 @@
26584  #include <linux/workqueue.h>
26585  #include <linux/hrtimer.h>
26586  #include <linux/jiffies.h>
26587 +#include <linux/kthread.h>
26588  #include <linux/math64.h>
26589  #include <linux/timex.h>
26590  #include <linux/time.h>
26591 @@ -562,10 +563,52 @@ static void sync_cmos_clock(struct work_struct *work)
26592                            &sync_cmos_work, timespec64_to_jiffies(&next));
26595 +#ifdef CONFIG_PREEMPT_RT_FULL
26597 + * RT cannot call schedule_delayed_work() from hard interrupt context.
26598 + * A thread is needed to do the real work.
26599 + */
26600 +static struct task_struct *cmos_delay_thread;
26601 +static bool do_cmos_delay;
26603 +static int run_cmos_delay(void *ignore)
26605 +       while (!kthread_should_stop()) {
26606 +               set_current_state(TASK_INTERRUPTIBLE);
26607 +               if (do_cmos_delay) {
26608 +                       do_cmos_delay = false;
26609 +                       queue_delayed_work(system_power_efficient_wq,
26610 +                                          &sync_cmos_work, 0);
26611 +               }
26612 +               schedule();
26613 +       }
26614 +       __set_current_state(TASK_RUNNING);
26615 +       return 0;
26618 +void ntp_notify_cmos_timer(void)
26620 +       do_cmos_delay = true;
26621 +       /* Make visible before waking up process */
26622 +       smp_wmb();
26623 +       wake_up_process(cmos_delay_thread);
26626 +static __init int create_cmos_delay_thread(void)
26628 +       cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd");
26629 +       BUG_ON(!cmos_delay_thread);
26630 +       return 0;
26632 +early_initcall(create_cmos_delay_thread);
26634 +#else
26636  void ntp_notify_cmos_timer(void)
26638         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
26640 +#endif /* CONFIG_PREEMPT_RT_FULL */
26642  #else
26643  void ntp_notify_cmos_timer(void) { }
26644 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
26645 index 80016b329d94..b7342b6e6a5a 100644
26646 --- a/kernel/time/posix-cpu-timers.c
26647 +++ b/kernel/time/posix-cpu-timers.c
26648 @@ -3,6 +3,7 @@
26649   */
26651  #include <linux/sched.h>
26652 +#include <linux/sched/rt.h>
26653  #include <linux/posix-timers.h>
26654  #include <linux/errno.h>
26655  #include <linux/math64.h>
26656 @@ -650,7 +651,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
26657         /*
26658          * Disarm any old timer after extracting its expiry time.
26659          */
26660 -       WARN_ON_ONCE(!irqs_disabled());
26661 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
26663         ret = 0;
26664         old_incr = timer->it.cpu.incr;
26665 @@ -1092,7 +1093,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
26666         /*
26667          * Now re-arm for the new expiry time.
26668          */
26669 -       WARN_ON_ONCE(!irqs_disabled());
26670 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
26671         arm_timer(timer);
26672         unlock_task_sighand(p, &flags);
26674 @@ -1183,13 +1184,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
26675   * already updated our counts.  We need to check if any timers fire now.
26676   * Interrupts are disabled.
26677   */
26678 -void run_posix_cpu_timers(struct task_struct *tsk)
26679 +static void __run_posix_cpu_timers(struct task_struct *tsk)
26681         LIST_HEAD(firing);
26682         struct k_itimer *timer, *next;
26683         unsigned long flags;
26685 -       WARN_ON_ONCE(!irqs_disabled());
26686 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
26688         /*
26689          * The fast path checks that there are no expired thread or thread
26690 @@ -1243,6 +1244,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
26691         }
26694 +#ifdef CONFIG_PREEMPT_RT_BASE
26695 +#include <linux/kthread.h>
26696 +#include <linux/cpu.h>
26697 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
26698 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
26700 +static int posix_cpu_timers_thread(void *data)
26702 +       int cpu = (long)data;
26704 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
26706 +       while (!kthread_should_stop()) {
26707 +               struct task_struct *tsk = NULL;
26708 +               struct task_struct *next = NULL;
26710 +               if (cpu_is_offline(cpu))
26711 +                       goto wait_to_die;
26713 +               /* grab task list */
26714 +               raw_local_irq_disable();
26715 +               tsk = per_cpu(posix_timer_tasklist, cpu);
26716 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
26717 +               raw_local_irq_enable();
26719 +               /* it's possible the list is empty; just return */
26720 +               if (!tsk) {
26721 +                       set_current_state(TASK_INTERRUPTIBLE);
26722 +                       schedule();
26723 +                       __set_current_state(TASK_RUNNING);
26724 +                       continue;
26725 +               }
26727 +               /* Process task list */
26728 +               while (1) {
26729 +                       /* save next */
26730 +                       next = tsk->posix_timer_list;
26732 +                       /* run the task's timers, clear its list pointer
26733 +                        * and drop our reference to it
26734 +                        */
26735 +                       __run_posix_cpu_timers(tsk);
26736 +                       tsk->posix_timer_list = NULL;
26737 +                       put_task_struct(tsk);
26739 +                       /* check if this is the last on the list */
26740 +                       if (next == tsk)
26741 +                               break;
26742 +                       tsk = next;
26743 +               }
26744 +       }
26745 +       return 0;
26747 +wait_to_die:
26748 +       /* Wait for kthread_stop */
26749 +       set_current_state(TASK_INTERRUPTIBLE);
26750 +       while (!kthread_should_stop()) {
26751 +               schedule();
26752 +               set_current_state(TASK_INTERRUPTIBLE);
26753 +       }
26754 +       __set_current_state(TASK_RUNNING);
26755 +       return 0;
26758 +static inline int __fastpath_timer_check(struct task_struct *tsk)
26760 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
26761 +       if (unlikely(tsk->exit_state))
26762 +               return 0;
26764 +       if (!task_cputime_zero(&tsk->cputime_expires))
26765 +                       return 1;
26767 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
26768 +                       return 1;
26770 +       return 0;
26773 +void run_posix_cpu_timers(struct task_struct *tsk)
26775 +       unsigned long cpu = smp_processor_id();
26776 +       struct task_struct *tasklist;
26778 +       BUG_ON(!irqs_disabled());
26779 +       if(!per_cpu(posix_timer_task, cpu))
26780 +               return;
26781 +       /* get per-cpu references */
26782 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
26784 +       /* check to see if we're already queued */
26785 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
26786 +               get_task_struct(tsk);
26787 +               if (tasklist) {
26788 +                       tsk->posix_timer_list = tasklist;
26789 +               } else {
26790 +                       /*
26791 +                        * The list is terminated by a self-pointing
26792 +                        * task_struct
26793 +                        */
26794 +                       tsk->posix_timer_list = tsk;
26795 +               }
26796 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
26798 +               wake_up_process(per_cpu(posix_timer_task, cpu));
26799 +       }
26803 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
26804 + * Here we can start up the necessary posix timer thread for the new CPU.
26805 + */
26806 +static int posix_cpu_thread_call(struct notifier_block *nfb,
26807 +                                unsigned long action, void *hcpu)
26809 +       int cpu = (long)hcpu;
26810 +       struct task_struct *p;
26811 +       struct sched_param param;
26813 +       switch (action) {
26814 +       case CPU_UP_PREPARE:
26815 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
26816 +                                       "posixcputmr/%d",cpu);
26817 +               if (IS_ERR(p))
26818 +                       return NOTIFY_BAD;
26819 +               p->flags |= PF_NOFREEZE;
26820 +               kthread_bind(p, cpu);
26821 +               /* Must be high prio to avoid getting starved */
26822 +               param.sched_priority = MAX_RT_PRIO-1;
26823 +               sched_setscheduler(p, SCHED_FIFO, &param);
26824 +               per_cpu(posix_timer_task,cpu) = p;
26825 +               break;
26826 +       case CPU_ONLINE:
26827 +               /* Strictly unnecessary, as the first user will wake it. */
26828 +               wake_up_process(per_cpu(posix_timer_task,cpu));
26829 +               break;
26830 +#ifdef CONFIG_HOTPLUG_CPU
26831 +       case CPU_UP_CANCELED:
26832 +               /* Unbind it from the offline cpu so it can run.  Fall through. */
26833 +               kthread_bind(per_cpu(posix_timer_task, cpu),
26834 +                            cpumask_any(cpu_online_mask));
26835 +               kthread_stop(per_cpu(posix_timer_task,cpu));
26836 +               per_cpu(posix_timer_task,cpu) = NULL;
26837 +               break;
26838 +       case CPU_DEAD:
26839 +               kthread_stop(per_cpu(posix_timer_task,cpu));
26840 +               per_cpu(posix_timer_task,cpu) = NULL;
26841 +               break;
26842 +#endif
26843 +       }
26844 +       return NOTIFY_OK;
26847 +/* Register at highest priority so that task migration (migrate_all_tasks)
26848 + * happens before everything else.
26849 + */
26850 +static struct notifier_block posix_cpu_thread_notifier = {
26851 +       .notifier_call = posix_cpu_thread_call,
26852 +       .priority = 10
26855 +static int __init posix_cpu_thread_init(void)
26857 +       void *hcpu = (void *)(long)smp_processor_id();
26858 +       /* Start one for boot CPU. */
26859 +       unsigned long cpu;
26861 +       /* init the per-cpu posix_timer_tasklets */
26862 +       for_each_possible_cpu(cpu)
26863 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
26865 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
26866 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
26867 +       register_cpu_notifier(&posix_cpu_thread_notifier);
26868 +       return 0;
26870 +early_initcall(posix_cpu_thread_init);
26871 +#else /* CONFIG_PREEMPT_RT_BASE */
26872 +void run_posix_cpu_timers(struct task_struct *tsk)
26874 +       __run_posix_cpu_timers(tsk);
26876 +#endif /* CONFIG_PREEMPT_RT_BASE */
26878  /*
26879   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
26880   * The tsk->sighand->siglock must be held by the caller.
26881 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
26882 index f2826c35e918..464a98155a0e 100644
26883 --- a/kernel/time/posix-timers.c
26884 +++ b/kernel/time/posix-timers.c
26885 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
26886  static struct pid *good_sigevent(sigevent_t * event)
26888         struct task_struct *rtn = current->group_leader;
26889 +       int sig = event->sigev_signo;
26891         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
26892                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
26893 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
26894                 return NULL;
26896         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
26897 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
26898 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
26899 +            sig_kernel_coredump(sig)))
26900                 return NULL;
26902         return task_pid(rtn);
26903 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
26904         return overrun;
26908 + * Protected by RCU!
26909 + */
26910 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
26912 +#ifdef CONFIG_PREEMPT_RT_FULL
26913 +       if (kc->timer_set == common_timer_set)
26914 +               hrtimer_wait_for_timer(&timr->it.real.timer);
26915 +       else
26916 +               /* FIXME: Whacky hack for posix-cpu-timers */
26917 +               schedule_timeout(1);
26918 +#endif
26921  /* Set a POSIX.1b interval timer. */
26922  /* timr->it_lock is taken. */
26923  static int
26924 @@ -903,6 +919,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
26925         if (!timr)
26926                 return -EINVAL;
26928 +       rcu_read_lock();
26929         kc = clockid_to_kclock(timr->it_clock);
26930         if (WARN_ON_ONCE(!kc || !kc->timer_set))
26931                 error = -EINVAL;
26932 @@ -911,9 +928,12 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
26934         unlock_timer(timr, flag);
26935         if (error == TIMER_RETRY) {
26936 +               timer_wait_for_callback(kc, timr);
26937                 rtn = NULL;     // We already got the old time...
26938 +               rcu_read_unlock();
26939                 goto retry;
26940         }
26941 +       rcu_read_unlock();
26943         if (old_setting && !error &&
26944             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
26945 @@ -951,10 +971,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
26946         if (!timer)
26947                 return -EINVAL;
26949 +       rcu_read_lock();
26950         if (timer_delete_hook(timer) == TIMER_RETRY) {
26951                 unlock_timer(timer, flags);
26952 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26953 +                                       timer);
26954 +               rcu_read_unlock();
26955                 goto retry_delete;
26956         }
26957 +       rcu_read_unlock();
26959         spin_lock(&current->sighand->siglock);
26960         list_del(&timer->list);
26961 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
26962  retry_delete:
26963         spin_lock_irqsave(&timer->it_lock, flags);
26965 +       /* On RT we can race with a deletion */
26966 +       if (!timer->it_signal) {
26967 +               unlock_timer(timer, flags);
26968 +               return;
26969 +       }
26971         if (timer_delete_hook(timer) == TIMER_RETRY) {
26972 +               rcu_read_lock();
26973                 unlock_timer(timer, flags);
26974 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26975 +                                       timer);
26976 +               rcu_read_unlock();
26977                 goto retry_delete;
26978         }
26979         list_del(&timer->list);
26980 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
26981 index 53d7184da0be..1b4ac3361c3f 100644
26982 --- a/kernel/time/tick-broadcast-hrtimer.c
26983 +++ b/kernel/time/tick-broadcast-hrtimer.c
26984 @@ -106,5 +106,6 @@ void tick_setup_hrtimer_broadcast(void)
26986         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26987         bctimer.function = bc_handler;
26988 +       bctimer.irqsafe = true;
26989         clockevents_register_device(&ce_broadcast_hrtimer);
26991 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
26992 index 4fcd99e12aa0..5a47f2e98faf 100644
26993 --- a/kernel/time/tick-common.c
26994 +++ b/kernel/time/tick-common.c
26995 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
26996  static void tick_periodic(int cpu)
26998         if (tick_do_timer_cpu == cpu) {
26999 -               write_seqlock(&jiffies_lock);
27000 +               raw_spin_lock(&jiffies_lock);
27001 +               write_seqcount_begin(&jiffies_seq);
27003                 /* Keep track of the next tick event */
27004                 tick_next_period = ktime_add(tick_next_period, tick_period);
27006                 do_timer(1);
27007 -               write_sequnlock(&jiffies_lock);
27008 +               write_seqcount_end(&jiffies_seq);
27009 +               raw_spin_unlock(&jiffies_lock);
27010                 update_wall_time();
27011         }
27013 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
27014                 ktime_t next;
27016                 do {
27017 -                       seq = read_seqbegin(&jiffies_lock);
27018 +                       seq = read_seqcount_begin(&jiffies_seq);
27019                         next = tick_next_period;
27020 -               } while (read_seqretry(&jiffies_lock, seq));
27021 +               } while (read_seqcount_retry(&jiffies_seq, seq));
27023                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
27025 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
27026 index e5d228f7224c..45e9456da50f 100644
27027 --- a/kernel/time/tick-sched.c
27028 +++ b/kernel/time/tick-sched.c
27029 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
27030                 return;
27032         /* Reevalute with jiffies_lock held */
27033 -       write_seqlock(&jiffies_lock);
27034 +       raw_spin_lock(&jiffies_lock);
27035 +       write_seqcount_begin(&jiffies_seq);
27037         delta = ktime_sub(now, last_jiffies_update);
27038         if (delta.tv64 >= tick_period.tv64) {
27039 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
27040                 /* Keep the tick_next_period variable up to date */
27041                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
27042         } else {
27043 -               write_sequnlock(&jiffies_lock);
27044 +               write_seqcount_end(&jiffies_seq);
27045 +               raw_spin_unlock(&jiffies_lock);
27046                 return;
27047         }
27048 -       write_sequnlock(&jiffies_lock);
27049 +       write_seqcount_end(&jiffies_seq);
27050 +       raw_spin_unlock(&jiffies_lock);
27051         update_wall_time();
27054 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
27056         ktime_t period;
27058 -       write_seqlock(&jiffies_lock);
27059 +       raw_spin_lock(&jiffies_lock);
27060 +       write_seqcount_begin(&jiffies_seq);
27061         /* Did we start the jiffies update yet ? */
27062         if (last_jiffies_update.tv64 == 0)
27063                 last_jiffies_update = tick_next_period;
27064         period = last_jiffies_update;
27065 -       write_sequnlock(&jiffies_lock);
27066 +       write_seqcount_end(&jiffies_seq);
27067 +       raw_spin_unlock(&jiffies_lock);
27068         return period;
27071 @@ -176,6 +181,11 @@ static bool can_stop_full_tick(void)
27072                 return false;
27073         }
27075 +       if (!arch_irq_work_has_interrupt()) {
27076 +               trace_tick_stop(0, "missing irq work interrupt\n");
27077 +               return false;
27078 +       }
27080         /* sched_clock_tick() needs us? */
27081  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
27082         /*
27083 @@ -204,6 +214,7 @@ static void nohz_full_kick_work_func(struct irq_work *work)
27085  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
27086         .func = nohz_full_kick_work_func,
27087 +       .flags = IRQ_WORK_HARD_IRQ,
27088  };
27090  /*
27091 @@ -583,10 +594,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
27093         /* Read jiffies and the time when jiffies were updated last */
27094         do {
27095 -               seq = read_seqbegin(&jiffies_lock);
27096 +               seq = read_seqcount_begin(&jiffies_seq);
27097                 basemono = last_jiffies_update.tv64;
27098                 basejiff = jiffies;
27099 -       } while (read_seqretry(&jiffies_lock, seq));
27100 +       } while (read_seqcount_retry(&jiffies_seq, seq));
27101         ts->last_jiffies = basejiff;
27103         /*
27104 @@ -768,14 +779,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
27105                 return false;
27107         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
27108 -               static int ratelimit;
27110 -               if (ratelimit < 10 &&
27111 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
27112 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
27113 -                               (unsigned int) local_softirq_pending());
27114 -                       ratelimit++;
27115 -               }
27116 +               softirq_check_pending_idle();
27117                 return false;
27118         }
27120 @@ -1115,6 +1119,7 @@ void tick_setup_sched_timer(void)
27121          * Emulate tick processing via per-CPU hrtimers:
27122          */
27123         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
27124 +       ts->sched_timer.irqsafe = 1;
27125         ts->sched_timer.function = tick_sched_timer;
27127         /* Get the next period (per cpu) */
27128 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
27129 index 6e4866834d26..6c3e5323d06b 100644
27130 --- a/kernel/time/timekeeping.c
27131 +++ b/kernel/time/timekeeping.c
27132 @@ -2091,8 +2091,10 @@ EXPORT_SYMBOL(hardpps);
27133   */
27134  void xtime_update(unsigned long ticks)
27136 -       write_seqlock(&jiffies_lock);
27137 +       raw_spin_lock(&jiffies_lock);
27138 +       write_seqcount_begin(&jiffies_seq);
27139         do_timer(ticks);
27140 -       write_sequnlock(&jiffies_lock);
27141 +       write_seqcount_end(&jiffies_seq);
27142 +       raw_spin_unlock(&jiffies_lock);
27143         update_wall_time();
27145 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
27146 index 704f595ce83f..763a3e5121ff 100644
27147 --- a/kernel/time/timekeeping.h
27148 +++ b/kernel/time/timekeeping.h
27149 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
27150  extern void do_timer(unsigned long ticks);
27151  extern void update_wall_time(void);
27153 -extern seqlock_t jiffies_lock;
27154 +extern raw_spinlock_t jiffies_lock;
27155 +extern seqcount_t jiffies_seq;
27157  #define CS_NAME_LEN    32
27159 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
27160 index 125407144c01..13652dafe83a 100644
27161 --- a/kernel/time/timer.c
27162 +++ b/kernel/time/timer.c
27163 @@ -80,6 +80,9 @@ struct tvec_root {
27164  struct tvec_base {
27165         spinlock_t lock;
27166         struct timer_list *running_timer;
27167 +#ifdef CONFIG_PREEMPT_RT_FULL
27168 +       wait_queue_head_t wait_for_running_timer;
27169 +#endif
27170         unsigned long timer_jiffies;
27171         unsigned long next_timer;
27172         unsigned long active_timers;
27173 @@ -777,6 +780,39 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
27174                 cpu_relax();
27175         }
27177 +#ifdef CONFIG_PREEMPT_RT_FULL
27178 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
27179 +                                                 struct tvec_base *old,
27180 +                                                 struct tvec_base *new)
27182 +       /*
27183 +        * We cannot do the below because we might be preempted and
27184 +        * then the preempter would see NULL and loop forever.
27185 +        */
27186 +       if (spin_trylock(&new->lock)) {
27187 +               WRITE_ONCE(timer->flags,
27188 +                          (timer->flags & ~TIMER_BASEMASK) | new->cpu);
27189 +               spin_unlock(&old->lock);
27190 +               return new;
27191 +       }
27192 +       return old;
27195 +#else
27196 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
27197 +                                                 struct tvec_base *old,
27198 +                                                 struct tvec_base *new)
27200 +       /* See the comment in lock_timer_base() */
27201 +       timer->flags |= TIMER_MIGRATING;
27203 +       spin_unlock(&old->lock);
27204 +       spin_lock(&new->lock);
27205 +       WRITE_ONCE(timer->flags,
27206 +                  (timer->flags & ~TIMER_BASEMASK) | new->cpu);
27207 +       return new;
27209 +#endif
27211  static inline int
27212  __mod_timer(struct timer_list *timer, unsigned long expires,
27213 @@ -807,16 +843,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
27214                  * handler yet has not finished. This also guarantees that
27215                  * the timer is serialized wrt itself.
27216                  */
27217 -               if (likely(base->running_timer != timer)) {
27218 -                       /* See the comment in lock_timer_base() */
27219 -                       timer->flags |= TIMER_MIGRATING;
27221 -                       spin_unlock(&base->lock);
27222 -                       base = new_base;
27223 -                       spin_lock(&base->lock);
27224 -                       WRITE_ONCE(timer->flags,
27225 -                                  (timer->flags & ~TIMER_BASEMASK) | base->cpu);
27226 -               }
27227 +               if (likely(base->running_timer != timer))
27228 +                       base = switch_timer_base(timer, base, new_base);
27229         }
27231         timer->expires = expires;
27232 @@ -1006,6 +1034,33 @@ void add_timer_on(struct timer_list *timer, int cpu)
27234  EXPORT_SYMBOL_GPL(add_timer_on);
27236 +#ifdef CONFIG_PREEMPT_RT_FULL
27238 + * Wait for a running timer
27239 + */
27240 +static void wait_for_running_timer(struct timer_list *timer)
27242 +       struct tvec_base *base;
27243 +       u32 tf = timer->flags;
27245 +       if (tf & TIMER_MIGRATING)
27246 +               return;
27248 +       base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
27249 +       wait_event(base->wait_for_running_timer,
27250 +                  base->running_timer != timer);
27253 +# define wakeup_timer_waiters(b)       wake_up_all(&(b)->wait_for_running_timer)
27254 +#else
27255 +static inline void wait_for_running_timer(struct timer_list *timer)
27257 +       cpu_relax();
27260 +# define wakeup_timer_waiters(b)       do { } while (0)
27261 +#endif
27263  /**
27264   * del_timer - deactivate a timer.
27265   * @timer: the timer to be deactivated
27266 @@ -1063,7 +1118,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
27268  EXPORT_SYMBOL(try_to_del_timer_sync);
27270 -#ifdef CONFIG_SMP
27271 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
27272  /**
27273   * del_timer_sync - deactivate a timer and wait for the handler to finish.
27274   * @timer: the timer to be deactivated
27275 @@ -1123,7 +1178,7 @@ int del_timer_sync(struct timer_list *timer)
27276                 int ret = try_to_del_timer_sync(timer);
27277                 if (ret >= 0)
27278                         return ret;
27279 -               cpu_relax();
27280 +               wait_for_running_timer(timer);
27281         }
27283  EXPORT_SYMBOL(del_timer_sync);
27284 @@ -1248,16 +1303,18 @@ static inline void __run_timers(struct tvec_base *base)
27285                         if (irqsafe) {
27286                                 spin_unlock(&base->lock);
27287                                 call_timer_fn(timer, fn, data);
27288 +                               base->running_timer = NULL;
27289                                 spin_lock(&base->lock);
27290                         } else {
27291                                 spin_unlock_irq(&base->lock);
27292                                 call_timer_fn(timer, fn, data);
27293 +                               base->running_timer = NULL;
27294                                 spin_lock_irq(&base->lock);
27295                         }
27296                 }
27297         }
27298 -       base->running_timer = NULL;
27299         spin_unlock_irq(&base->lock);
27300 +       wakeup_timer_waiters(base);
27303  #ifdef CONFIG_NO_HZ_COMMON
27304 @@ -1390,6 +1447,14 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
27305         if (cpu_is_offline(smp_processor_id()))
27306                 return expires;
27308 +#ifdef CONFIG_PREEMPT_RT_FULL
27309 +       /*
27310 +        * On PREEMPT_RT we cannot sleep here. As a result we can't take
27311 +        * the base lock to check when the next timer is pending, so
27312 +        * we assume the next timer expires in the next jiffy.
27313 +        */
27314 +       return basem + TICK_NSEC;
27315 +#endif
27316         spin_lock(&base->lock);
27317         if (base->active_timers) {
27318                 if (time_before_eq(base->next_timer, base->timer_jiffies))
27319 @@ -1416,13 +1481,13 @@ void update_process_times(int user_tick)
27321         /* Note: this timer irq context must be accounted for as well. */
27322         account_process_tick(p, user_tick);
27323 +       scheduler_tick();
27324         run_local_timers();
27325         rcu_check_callbacks(user_tick);
27326 -#ifdef CONFIG_IRQ_WORK
27327 +#if defined(CONFIG_IRQ_WORK)
27328         if (in_irq())
27329                 irq_work_tick();
27330  #endif
27331 -       scheduler_tick();
27332         run_posix_cpu_timers(p);
27335 @@ -1433,6 +1498,8 @@ static void run_timer_softirq(struct softirq_action *h)
27337         struct tvec_base *base = this_cpu_ptr(&tvec_bases);
27339 +       irq_work_tick_soft();
27341         if (time_after_eq(jiffies, base->timer_jiffies))
27342                 __run_timers(base);
27344 @@ -1589,7 +1656,7 @@ static void migrate_timers(int cpu)
27346         BUG_ON(cpu_online(cpu));
27347         old_base = per_cpu_ptr(&tvec_bases, cpu);
27348 -       new_base = get_cpu_ptr(&tvec_bases);
27349 +       new_base = get_local_ptr(&tvec_bases);
27350         /*
27351          * The caller is globally serialized and nobody else
27352          * takes two locks at once, deadlock is not possible.
27353 @@ -1613,7 +1680,7 @@ static void migrate_timers(int cpu)
27355         spin_unlock(&old_base->lock);
27356         spin_unlock_irq(&new_base->lock);
27357 -       put_cpu_ptr(&tvec_bases);
27358 +       put_local_ptr(&tvec_bases);
27361  static int timer_cpu_notify(struct notifier_block *self,
27362 @@ -1645,6 +1712,9 @@ static void __init init_timer_cpu(int cpu)
27364         base->cpu = cpu;
27365         spin_lock_init(&base->lock);
27366 +#ifdef CONFIG_PREEMPT_RT_FULL
27367 +       init_waitqueue_head(&base->wait_for_running_timer);
27368 +#endif
27370         base->timer_jiffies = jiffies;
27371         base->next_timer = base->timer_jiffies;
27372 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
27373 index e45db6b0d878..364ccd0eb57b 100644
27374 --- a/kernel/trace/Kconfig
27375 +++ b/kernel/trace/Kconfig
27376 @@ -187,6 +187,24 @@ config IRQSOFF_TRACER
27377           enabled. This option and the preempt-off timing option can be
27378           used together or separately.)
27380 +config INTERRUPT_OFF_HIST
27381 +       bool "Interrupts-off Latency Histogram"
27382 +       depends on IRQSOFF_TRACER
27383 +       help
27384 +         This option generates continuously updated histograms (one per cpu)
27385 +         of the duration of time periods with interrupts disabled. The
27386 +         histograms are disabled by default. To enable them, write a non-zero
27387 +         number to
27389 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
27391 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
27392 +         per cpu) are generated that accumulate the duration of time periods
27393 +         when both interrupts and preemption are disabled. The histogram data
27394 +         will be located in the debug file system at
27396 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
27398  config PREEMPT_TRACER
27399         bool "Preemption-off Latency Tracer"
27400         default n
27401 @@ -211,6 +229,24 @@ config PREEMPT_TRACER
27402           enabled. This option and the irqs-off timing option can be
27403           used together or separately.)
27405 +config PREEMPT_OFF_HIST
27406 +       bool "Preemption-off Latency Histogram"
27407 +       depends on PREEMPT_TRACER
27408 +       help
27409 +         This option generates continuously updated histograms (one per cpu)
27410 +         of the duration of time periods with preemption disabled. The
27411 +         histograms are disabled by default. To enable them, write a non-zero
27412 +         number to
27414 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
27416 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
27417 +         per cpu) are generated that accumulate the duration of time periods
27418 +         when both interrupts and preemption are disabled. The histogram data
27419 +         will be located in the debug file system at
27421 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
27423  config SCHED_TRACER
27424         bool "Scheduling Latency Tracer"
27425         select GENERIC_TRACER
27426 @@ -221,6 +257,74 @@ config SCHED_TRACER
27427           This tracer tracks the latency of the highest priority task
27428           to be scheduled in, starting from the point it has woken up.
27430 +config WAKEUP_LATENCY_HIST
27431 +       bool "Scheduling Latency Histogram"
27432 +       depends on SCHED_TRACER
27433 +       help
27434 +         This option generates continuously updated histograms (one per cpu)
27435 +         of the scheduling latency of the highest priority task.
27436 +         The histograms are disabled by default. To enable them, write a
27437 +         non-zero number to
27439 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
27441 +         Two different algorithms are used, one to determine the latency of
27442 +         processes that exclusively use the highest priority of the system and
27443 +         another one to determine the latency of processes that share the
27444 +         highest system priority with other processes. The former is used to
27445 +         improve hardware and system software, the latter to optimize the
27446 +         priority design of a given system. The histogram data will be
27447 +         located in the debug file system at
27449 +             /sys/kernel/debug/tracing/latency_hist/wakeup
27451 +         and
27453 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
27455 +         If both Scheduling Latency Histogram and Missed Timer Offsets
27456 +         Histogram are selected, additional histogram data will be collected
27457 +         that contains, in addition to the wakeup latency, the timer latency in
27458 +         case the wakeup was triggered by an expired timer. These histograms
27459 +         are available in the
27461 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
27463 +         directory. They reflect the apparent interrupt and scheduling latency
27464 +         and are best suited to determining the worst-case latency of a given
27465 +         system. To enable these histograms, write a non-zero number to
27467 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
27469 +config MISSED_TIMER_OFFSETS_HIST
27470 +       depends on HIGH_RES_TIMERS
27471 +       select GENERIC_TRACER
27472 +       bool "Missed Timer Offsets Histogram"
27473 +       help
27474 +         Generate a histogram of missed timer offsets in microseconds. The
27475 +         histograms are disabled by default. To enable them, write a non-zero
27476 +         number to
27478 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
27480 +         The histogram data will be located in the debug file system at
27482 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
27484 +         If both Scheduling Latency Histogram and Missed Timer Offsets
27485 +         Histogram are selected, additional histogram data will be collected
27486 +         that contains, in addition to the wakeup latency, the timer latency in
27487 +         case the wakeup was triggered by an expired timer. These histograms
27488 +         are available in the
27490 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
27492 +         directory. They reflect the apparent interrupt and scheduling latency
27493 +         and are best suited to determining the worst-case latency of a given
27494 +         system. To enable these histograms, write a non-zero number to
27496 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
27498  config ENABLE_DEFAULT_TRACERS
27499         bool "Trace process context switches and events"
27500         depends on !GENERIC_TRACER
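As a usage illustration for the help texts above, a small user-space sketch that turns on the wakeup latency histogram by writing a non-zero value to the control file quoted there; it assumes debugfs is mounted at /sys/kernel/debug.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *ctl =
		"/sys/kernel/debug/tracing/latency_hist/enable/wakeup";
	int fd = open(ctl, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	return close(fd) ? 1 : 0;
}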
27501 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
27502 index 05ea5167e6bb..bc08c67301ae 100644
27503 --- a/kernel/trace/Makefile
27504 +++ b/kernel/trace/Makefile
27505 @@ -40,6 +40,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
27506  obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
27507  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
27508  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
27509 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
27510 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
27511 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
27512 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
27513  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
27514  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
27515  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
27516 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
27517 new file mode 100644
27518 index 000000000000..7f6ee70dea41
27519 --- /dev/null
27520 +++ b/kernel/trace/latency_hist.c
27521 @@ -0,0 +1,1178 @@
27523 + * kernel/trace/latency_hist.c
27524 + *
27525 + * Add support for histograms of preemption-off latency and
27526 + * interrupt-off latency and wakeup latency, it depends on
27527 + * Real-Time Preemption Support.
27528 + *
27529 + *  Copyright (C) 2005 MontaVista Software, Inc.
27530 + *  Yi Yang <yyang@ch.mvista.com>
27531 + *
27532 + *  Converted to work with the new latency tracer.
27533 + *  Copyright (C) 2008 Red Hat, Inc.
27534 + *    Steven Rostedt <srostedt@redhat.com>
27535 + *
27536 + */
27537 +#include <linux/module.h>
27538 +#include <linux/debugfs.h>
27539 +#include <linux/seq_file.h>
27540 +#include <linux/percpu.h>
27541 +#include <linux/kallsyms.h>
27542 +#include <linux/uaccess.h>
27543 +#include <linux/sched.h>
27544 +#include <linux/sched/rt.h>
27545 +#include <linux/slab.h>
27546 +#include <linux/atomic.h>
27547 +#include <asm/div64.h>
27549 +#include "trace.h"
27550 +#include <trace/events/sched.h>
27552 +#define NSECS_PER_USECS 1000L
27554 +#define CREATE_TRACE_POINTS
27555 +#include <trace/events/hist.h>
27557 +enum {
27558 +       IRQSOFF_LATENCY = 0,
27559 +       PREEMPTOFF_LATENCY,
27560 +       PREEMPTIRQSOFF_LATENCY,
27561 +       WAKEUP_LATENCY,
27562 +       WAKEUP_LATENCY_SHAREDPRIO,
27563 +       MISSED_TIMER_OFFSETS,
27564 +       TIMERANDWAKEUP_LATENCY,
27565 +       MAX_LATENCY_TYPE,
27568 +#define MAX_ENTRY_NUM 10240
27570 +struct hist_data {
27571 +       atomic_t hist_mode; /* 0 log, 1 don't log */
27572 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
27573 +       long min_lat;
27574 +       long max_lat;
27575 +       unsigned long long below_hist_bound_samples;
27576 +       unsigned long long above_hist_bound_samples;
27577 +       long long accumulate_lat;
27578 +       unsigned long long total_samples;
27579 +       unsigned long long hist_array[MAX_ENTRY_NUM];
27582 +struct enable_data {
27583 +       int latency_type;
27584 +       int enabled;
27587 +static char *latency_hist_dir_root = "latency_hist";
27589 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27590 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
27591 +static char *irqsoff_hist_dir = "irqsoff";
27592 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
27593 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
27594 +#endif
27596 +#ifdef CONFIG_PREEMPT_OFF_HIST
27597 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
27598 +static char *preemptoff_hist_dir = "preemptoff";
27599 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
27600 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
27601 +#endif
27603 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
27604 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
27605 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
27606 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
27607 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
27608 +#endif
27610 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
27611 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
27612 +static struct enable_data preemptirqsoff_enabled_data = {
27613 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
27614 +       .enabled = 0,
27616 +#endif
27618 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27619 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27620 +struct maxlatproc_data {
27621 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
27622 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
27623 +       int pid;
27624 +       int current_pid;
27625 +       int prio;
27626 +       int current_prio;
27627 +       long latency;
27628 +       long timeroffset;
27629 +       cycle_t timestamp;
27631 +#endif
27633 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27634 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
27635 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
27636 +static char *wakeup_latency_hist_dir = "wakeup";
27637 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
27638 +static notrace void probe_wakeup_latency_hist_start(void *v,
27639 +       struct task_struct *p);
27640 +static notrace void probe_wakeup_latency_hist_stop(void *v,
27641 +       bool preempt, struct task_struct *prev, struct task_struct *next);
27642 +static notrace void probe_sched_migrate_task(void *,
27643 +       struct task_struct *task, int cpu);
27644 +static struct enable_data wakeup_latency_enabled_data = {
27645 +       .latency_type = WAKEUP_LATENCY,
27646 +       .enabled = 0,
27648 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
27649 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
27650 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
27651 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
27652 +static unsigned long wakeup_pid;
27653 +#endif
27655 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27656 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
27657 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
27658 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
27659 +       long long offset, struct task_struct *curr, struct task_struct *task);
27660 +static struct enable_data missed_timer_offsets_enabled_data = {
27661 +       .latency_type = MISSED_TIMER_OFFSETS,
27662 +       .enabled = 0,
27664 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
27665 +static unsigned long missed_timer_offsets_pid;
27666 +#endif
27668 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27669 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27670 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
27671 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
27672 +static struct enable_data timerandwakeup_enabled_data = {
27673 +       .latency_type = TIMERANDWAKEUP_LATENCY,
27674 +       .enabled = 0,
27676 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
27677 +#endif
27679 +void notrace latency_hist(int latency_type, int cpu, long latency,
27680 +                         long timeroffset, cycle_t stop,
27681 +                         struct task_struct *p)
27683 +       struct hist_data *my_hist;
27684 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27685 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27686 +       struct maxlatproc_data *mp = NULL;
27687 +#endif
27689 +       if (!cpu_possible(cpu) || latency_type < 0 ||
27690 +           latency_type >= MAX_LATENCY_TYPE)
27691 +               return;
27693 +       switch (latency_type) {
27694 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27695 +       case IRQSOFF_LATENCY:
27696 +               my_hist = &per_cpu(irqsoff_hist, cpu);
27697 +               break;
27698 +#endif
27699 +#ifdef CONFIG_PREEMPT_OFF_HIST
27700 +       case PREEMPTOFF_LATENCY:
27701 +               my_hist = &per_cpu(preemptoff_hist, cpu);
27702 +               break;
27703 +#endif
27704 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
27705 +       case PREEMPTIRQSOFF_LATENCY:
27706 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
27707 +               break;
27708 +#endif
27709 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27710 +       case WAKEUP_LATENCY:
27711 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
27712 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
27713 +               break;
27714 +       case WAKEUP_LATENCY_SHAREDPRIO:
27715 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
27716 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
27717 +               break;
27718 +#endif
27719 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27720 +       case MISSED_TIMER_OFFSETS:
27721 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
27722 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
27723 +               break;
27724 +#endif
27725 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27726 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27727 +       case TIMERANDWAKEUP_LATENCY:
27728 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
27729 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
27730 +               break;
27731 +#endif
27733 +       default:
27734 +               return;
27735 +       }
27737 +       latency += my_hist->offset;
27739 +       if (atomic_read(&my_hist->hist_mode) == 0)
27740 +               return;
27742 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
27743 +               if (latency < 0)
27744 +                       my_hist->below_hist_bound_samples++;
27745 +               else
27746 +                       my_hist->above_hist_bound_samples++;
27747 +       } else
27748 +               my_hist->hist_array[latency]++;
27750 +       if (unlikely(latency > my_hist->max_lat ||
27751 +           my_hist->min_lat == LONG_MAX)) {
27752 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27753 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27754 +               if (latency_type == WAKEUP_LATENCY ||
27755 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
27756 +                   latency_type == MISSED_TIMER_OFFSETS ||
27757 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
27758 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
27759 +                       strncpy(mp->current_comm, current->comm,
27760 +                           sizeof(mp->current_comm));
27761 +                       mp->pid = task_pid_nr(p);
27762 +                       mp->current_pid = task_pid_nr(current);
27763 +                       mp->prio = p->prio;
27764 +                       mp->current_prio = current->prio;
27765 +                       mp->latency = latency;
27766 +                       mp->timeroffset = timeroffset;
27767 +                       mp->timestamp = stop;
27768 +               }
27769 +#endif
27770 +               my_hist->max_lat = latency;
27771 +       }
27772 +       if (unlikely(latency < my_hist->min_lat))
27773 +               my_hist->min_lat = latency;
27774 +       my_hist->total_samples++;
27775 +       my_hist->accumulate_lat += latency;
27778 +static void *l_start(struct seq_file *m, loff_t *pos)
27780 +       loff_t *index_ptr = NULL;
27781 +       loff_t index = *pos;
27782 +       struct hist_data *my_hist = m->private;
27784 +       if (index == 0) {
27785 +               char minstr[32], avgstr[32], maxstr[32];
27787 +               atomic_dec(&my_hist->hist_mode);
27789 +               if (likely(my_hist->total_samples)) {
27790 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
27791 +                           my_hist->total_samples);
27792 +                       snprintf(minstr, sizeof(minstr), "%ld",
27793 +                           my_hist->min_lat - my_hist->offset);
27794 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
27795 +                           avg - my_hist->offset);
27796 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
27797 +                           my_hist->max_lat - my_hist->offset);
27798 +               } else {
27799 +                       strcpy(minstr, "<undef>");
27800 +                       strcpy(avgstr, minstr);
27801 +                       strcpy(maxstr, minstr);
27802 +               }
27804 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
27805 +                          "#Average latency: %s microseconds\n"
27806 +                          "#Maximum latency: %s microseconds\n"
27807 +                          "#Total samples: %llu\n"
27808 +                          "#There are %llu samples lower than %ld"
27809 +                          " microseconds.\n"
27810 +                          "#There are %llu samples greater or equal"
27811 +                          " than %ld microseconds.\n"
27812 +                          "#usecs\t%16s\n",
27813 +                          minstr, avgstr, maxstr,
27814 +                          my_hist->total_samples,
27815 +                          my_hist->below_hist_bound_samples,
27816 +                          -my_hist->offset,
27817 +                          my_hist->above_hist_bound_samples,
27818 +                          MAX_ENTRY_NUM - my_hist->offset,
27819 +                          "samples");
27820 +       }
27821 +       if (index < MAX_ENTRY_NUM) {
27822 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
27823 +               if (index_ptr)
27824 +                       *index_ptr = index;
27825 +       }
27827 +       return index_ptr;
27830 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
27832 +       loff_t *index_ptr = p;
27833 +       struct hist_data *my_hist = m->private;
27835 +       if (++*pos >= MAX_ENTRY_NUM) {
27836 +               atomic_inc(&my_hist->hist_mode);
27837 +               return NULL;
27838 +       }
27839 +       *index_ptr = *pos;
27840 +       return index_ptr;
27843 +static void l_stop(struct seq_file *m, void *p)
27845 +       kfree(p);
27848 +static int l_show(struct seq_file *m, void *p)
27850 +       int index = *(loff_t *) p;
27851 +       struct hist_data *my_hist = m->private;
27853 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
27854 +           my_hist->hist_array[index]);
27855 +       return 0;
27858 +static const struct seq_operations latency_hist_seq_op = {
27859 +       .start = l_start,
27860 +       .next  = l_next,
27861 +       .stop  = l_stop,
27862 +       .show  = l_show
27865 +static int latency_hist_open(struct inode *inode, struct file *file)
27867 +       int ret;
27869 +       ret = seq_open(file, &latency_hist_seq_op);
27870 +       if (!ret) {
27871 +               struct seq_file *seq = file->private_data;
27872 +               seq->private = inode->i_private;
27873 +       }
27874 +       return ret;
27877 +static const struct file_operations latency_hist_fops = {
27878 +       .open = latency_hist_open,
27879 +       .read = seq_read,
27880 +       .llseek = seq_lseek,
27881 +       .release = seq_release,
27884 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27885 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27886 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
27888 +       mp->comm[0] = mp->current_comm[0] = '\0';
27889 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
27890 +           mp->latency = mp->timeroffset = -1;
27891 +       mp->timestamp = 0;
27893 +#endif
27895 +static void hist_reset(struct hist_data *hist)
27897 +       atomic_dec(&hist->hist_mode);
27899 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
27900 +       hist->below_hist_bound_samples = 0ULL;
27901 +       hist->above_hist_bound_samples = 0ULL;
27902 +       hist->min_lat = LONG_MAX;
27903 +       hist->max_lat = LONG_MIN;
27904 +       hist->total_samples = 0ULL;
27905 +       hist->accumulate_lat = 0LL;
27907 +       atomic_inc(&hist->hist_mode);
27910 +static ssize_t
27911 +latency_hist_reset(struct file *file, const char __user *a,
27912 +                  size_t size, loff_t *off)
27914 +       int cpu;
27915 +       struct hist_data *hist = NULL;
27916 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27917 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27918 +       struct maxlatproc_data *mp = NULL;
27919 +#endif
27920 +       off_t latency_type = (off_t) file->private_data;
27922 +       for_each_online_cpu(cpu) {
27924 +               switch (latency_type) {
27925 +#ifdef CONFIG_PREEMPT_OFF_HIST
27926 +               case PREEMPTOFF_LATENCY:
27927 +                       hist = &per_cpu(preemptoff_hist, cpu);
27928 +                       break;
27929 +#endif
27930 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27931 +               case IRQSOFF_LATENCY:
27932 +                       hist = &per_cpu(irqsoff_hist, cpu);
27933 +                       break;
27934 +#endif
27935 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27936 +               case PREEMPTIRQSOFF_LATENCY:
27937 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
27938 +                       break;
27939 +#endif
27940 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27941 +               case WAKEUP_LATENCY:
27942 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
27943 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
27944 +                       break;
27945 +               case WAKEUP_LATENCY_SHAREDPRIO:
27946 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
27947 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
27948 +                       break;
27949 +#endif
27950 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27951 +               case MISSED_TIMER_OFFSETS:
27952 +                       hist = &per_cpu(missed_timer_offsets, cpu);
27953 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
27954 +                       break;
27955 +#endif
27956 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27957 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27958 +               case TIMERANDWAKEUP_LATENCY:
27959 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
27960 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
27961 +                       break;
27962 +#endif
27963 +               }
27965 +               hist_reset(hist);
27966 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27967 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27968 +               if (latency_type == WAKEUP_LATENCY ||
27969 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
27970 +                   latency_type == MISSED_TIMER_OFFSETS ||
27971 +                   latency_type == TIMERANDWAKEUP_LATENCY)
27972 +                       clear_maxlatprocdata(mp);
27973 +#endif
27974 +       }
27976 +       return size;
27979 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27980 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27981 +static ssize_t
27982 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
27984 +       char buf[64];
27985 +       int r;
27986 +       unsigned long *this_pid = file->private_data;
27988 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
27989 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
27992 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
27993 +                     size_t cnt, loff_t *ppos)
27995 +       char buf[64];
27996 +       unsigned long pid;
27997 +       unsigned long *this_pid = file->private_data;
27999 +       if (cnt >= sizeof(buf))
28000 +               return -EINVAL;
28002 +       if (copy_from_user(&buf, ubuf, cnt))
28003 +               return -EFAULT;
28005 +       buf[cnt] = '\0';
28007 +       if (kstrtoul(buf, 10, &pid))
28008 +               return -EINVAL;
28010 +       *this_pid = pid;
28012 +       return cnt;
28014 +#endif
28016 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
28017 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
28018 +static ssize_t
28019 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
28021 +       int r;
28022 +       struct maxlatproc_data *mp = file->private_data;
28023 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
28024 +       unsigned long long t;
28025 +       unsigned long usecs, secs;
28026 +       char *buf;
28028 +       if (mp->pid == -1 || mp->current_pid == -1) {
28029 +               buf = "(none)\n";
28030 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
28031 +                   strlen(buf));
28032 +       }
28034 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
28035 +       if (buf == NULL)
28036 +               return -ENOMEM;
28038 +       t = ns2usecs(mp->timestamp);
28039 +       usecs = do_div(t, USEC_PER_SEC);
28040 +       secs = (unsigned long) t;
28041 +       r = snprintf(buf, strmaxlen,
28042 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
28043 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
28044 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
28045 +           secs, usecs);
28046 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
28047 +       kfree(buf);
28048 +       return r;
28050 +#endif
28052 +static ssize_t
28053 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
28055 +       char buf[64];
28056 +       struct enable_data *ed = file->private_data;
28057 +       int r;
28059 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
28060 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
28063 +static ssize_t
28064 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
28066 +       char buf[64];
28067 +       long enable;
28068 +       struct enable_data *ed = file->private_data;
28070 +       if (cnt >= sizeof(buf))
28071 +               return -EINVAL;
28073 +       if (copy_from_user(&buf, ubuf, cnt))
28074 +               return -EFAULT;
28076 +       buf[cnt] = 0;
28078 +       if (kstrtoul(buf, 10, &enable))
28079 +               return -EINVAL;
28081 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
28082 +               return cnt;
28084 +       if (enable) {
28085 +               int ret;
28087 +               switch (ed->latency_type) {
28088 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
28089 +               case PREEMPTIRQSOFF_LATENCY:
28090 +                       ret = register_trace_preemptirqsoff_hist(
28091 +                           probe_preemptirqsoff_hist, NULL);
28092 +                       if (ret) {
28093 +                               pr_info("wakeup trace: Couldn't assign "
28094 +                                   "probe_preemptirqsoff_hist "
28095 +                                   "to trace_preemptirqsoff_hist\n");
28096 +                               return ret;
28097 +                       }
28098 +                       break;
28099 +#endif
28100 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28101 +               case WAKEUP_LATENCY:
28102 +                       ret = register_trace_sched_wakeup(
28103 +                           probe_wakeup_latency_hist_start, NULL);
28104 +                       if (ret) {
28105 +                               pr_info("wakeup trace: Couldn't assign "
28106 +                                   "probe_wakeup_latency_hist_start "
28107 +                                   "to trace_sched_wakeup\n");
28108 +                               return ret;
28109 +                       }
28110 +                       ret = register_trace_sched_wakeup_new(
28111 +                           probe_wakeup_latency_hist_start, NULL);
28112 +                       if (ret) {
28113 +                               pr_info("wakeup trace: Couldn't assign "
28114 +                                   "probe_wakeup_latency_hist_start "
28115 +                                   "to trace_sched_wakeup_new\n");
28116 +                               unregister_trace_sched_wakeup(
28117 +                                   probe_wakeup_latency_hist_start, NULL);
28118 +                               return ret;
28119 +                       }
28120 +                       ret = register_trace_sched_switch(
28121 +                           probe_wakeup_latency_hist_stop, NULL);
28122 +                       if (ret) {
28123 +                               pr_info("wakeup trace: Couldn't assign "
28124 +                                   "probe_wakeup_latency_hist_stop "
28125 +                                   "to trace_sched_switch\n");
28126 +                               unregister_trace_sched_wakeup(
28127 +                                   probe_wakeup_latency_hist_start, NULL);
28128 +                               unregister_trace_sched_wakeup_new(
28129 +                                   probe_wakeup_latency_hist_start, NULL);
28130 +                               return ret;
28131 +                       }
28132 +                       ret = register_trace_sched_migrate_task(
28133 +                           probe_sched_migrate_task, NULL);
28134 +                       if (ret) {
28135 +                               pr_info("wakeup trace: Couldn't assign "
28136 +                                   "probe_sched_migrate_task "
28137 +                                   "to trace_sched_migrate_task\n");
28138 +                               unregister_trace_sched_wakeup(
28139 +                                   probe_wakeup_latency_hist_start, NULL);
28140 +                               unregister_trace_sched_wakeup_new(
28141 +                                   probe_wakeup_latency_hist_start, NULL);
28142 +                               unregister_trace_sched_switch(
28143 +                                   probe_wakeup_latency_hist_stop, NULL);
28144 +                               return ret;
28145 +                       }
28146 +                       break;
28147 +#endif
28148 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28149 +               case MISSED_TIMER_OFFSETS:
28150 +                       ret = register_trace_hrtimer_interrupt(
28151 +                           probe_hrtimer_interrupt, NULL);
28152 +                       if (ret) {
28153 +                               pr_info("wakeup trace: Couldn't assign "
28154 +                                   "probe_hrtimer_interrupt "
28155 +                                   "to trace_hrtimer_interrupt\n");
28156 +                               return ret;
28157 +                       }
28158 +                       break;
28159 +#endif
28160 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
28161 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
28162 +               case TIMERANDWAKEUP_LATENCY:
28163 +                       if (!wakeup_latency_enabled_data.enabled ||
28164 +                           !missed_timer_offsets_enabled_data.enabled)
28165 +                               return -EINVAL;
28166 +                       break;
28167 +#endif
28168 +               default:
28169 +                       break;
28170 +               }
28171 +       } else {
28172 +               switch (ed->latency_type) {
28173 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
28174 +               case PREEMPTIRQSOFF_LATENCY:
28175 +                       {
28176 +                               int cpu;
28178 +                               unregister_trace_preemptirqsoff_hist(
28179 +                                   probe_preemptirqsoff_hist, NULL);
28180 +                               for_each_online_cpu(cpu) {
28181 +#ifdef CONFIG_INTERRUPT_OFF_HIST
28182 +                                       per_cpu(hist_irqsoff_counting,
28183 +                                           cpu) = 0;
28184 +#endif
28185 +#ifdef CONFIG_PREEMPT_OFF_HIST
28186 +                                       per_cpu(hist_preemptoff_counting,
28187 +                                           cpu) = 0;
28188 +#endif
28189 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
28190 +                                       per_cpu(hist_preemptirqsoff_counting,
28191 +                                           cpu) = 0;
28192 +#endif
28193 +                               }
28194 +                       }
28195 +                       break;
28196 +#endif
28197 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28198 +               case WAKEUP_LATENCY:
28199 +                       {
28200 +                               int cpu;
28202 +                               unregister_trace_sched_wakeup(
28203 +                                   probe_wakeup_latency_hist_start, NULL);
28204 +                               unregister_trace_sched_wakeup_new(
28205 +                                   probe_wakeup_latency_hist_start, NULL);
28206 +                               unregister_trace_sched_switch(
28207 +                                   probe_wakeup_latency_hist_stop, NULL);
28208 +                               unregister_trace_sched_migrate_task(
28209 +                                   probe_sched_migrate_task, NULL);
28211 +                               for_each_online_cpu(cpu) {
28212 +                                       per_cpu(wakeup_task, cpu) = NULL;
28213 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
28214 +                               }
28215 +                       }
28216 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28217 +                       timerandwakeup_enabled_data.enabled = 0;
28218 +#endif
28219 +                       break;
28220 +#endif
28221 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28222 +               case MISSED_TIMER_OFFSETS:
28223 +                       unregister_trace_hrtimer_interrupt(
28224 +                           probe_hrtimer_interrupt, NULL);
28225 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28226 +                       timerandwakeup_enabled_data.enabled = 0;
28227 +#endif
28228 +                       break;
28229 +#endif
28230 +               default:
28231 +                       break;
28232 +               }
28233 +       }
28234 +       ed->enabled = enable;
28235 +       return cnt;
28238 +static const struct file_operations latency_hist_reset_fops = {
28239 +       .open = tracing_open_generic,
28240 +       .write = latency_hist_reset,
28243 +static const struct file_operations enable_fops = {
28244 +       .open = tracing_open_generic,
28245 +       .read = show_enable,
28246 +       .write = do_enable,
28249 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
28250 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
28251 +static const struct file_operations pid_fops = {
28252 +       .open = tracing_open_generic,
28253 +       .read = show_pid,
28254 +       .write = do_pid,
28257 +static const struct file_operations maxlatproc_fops = {
28258 +       .open = tracing_open_generic,
28259 +       .read = show_maxlatproc,
28261 +#endif
28263 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
28264 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
28265 +       int starthist)
28267 +       int cpu = raw_smp_processor_id();
28268 +       int time_set = 0;
28270 +       if (starthist) {
28271 +               cycle_t uninitialized_var(start);
28273 +               if (!preempt_count() && !irqs_disabled())
28274 +                       return;
28276 +#ifdef CONFIG_INTERRUPT_OFF_HIST
28277 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
28278 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
28279 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
28280 +                       start = ftrace_now(cpu);
28281 +                       time_set++;
28282 +                       per_cpu(hist_irqsoff_start, cpu) = start;
28283 +               }
28284 +#endif
28286 +#ifdef CONFIG_PREEMPT_OFF_HIST
28287 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
28288 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
28289 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
28290 +                       if (!(time_set++))
28291 +                               start = ftrace_now(cpu);
28292 +                       per_cpu(hist_preemptoff_start, cpu) = start;
28293 +               }
28294 +#endif
28296 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
28297 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
28298 +                   per_cpu(hist_preemptoff_counting, cpu) &&
28299 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
28300 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
28301 +                       if (!time_set)
28302 +                               start = ftrace_now(cpu);
28303 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
28304 +               }
28305 +#endif
28306 +       } else {
28307 +               cycle_t uninitialized_var(stop);
28309 +#ifdef CONFIG_INTERRUPT_OFF_HIST
28310 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
28311 +                   per_cpu(hist_irqsoff_counting, cpu)) {
28312 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
28314 +                       stop = ftrace_now(cpu);
28315 +                       time_set++;
28316 +                       if (start) {
28317 +                               long latency = ((long) (stop - start)) /
28318 +                                   NSECS_PER_USECS;
28320 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
28321 +                                   stop, NULL);
28322 +                       }
28323 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
28324 +               }
28325 +#endif
28327 +#ifdef CONFIG_PREEMPT_OFF_HIST
28328 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
28329 +                   per_cpu(hist_preemptoff_counting, cpu)) {
28330 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
28332 +                       if (!(time_set++))
28333 +                               stop = ftrace_now(cpu);
28334 +                       if (start) {
28335 +                               long latency = ((long) (stop - start)) /
28336 +                                   NSECS_PER_USECS;
28338 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
28339 +                                   0, stop, NULL);
28340 +                       }
28341 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
28342 +               }
28343 +#endif
28345 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
28346 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
28347 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
28348 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
28349 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
28351 +                       if (!time_set)
28352 +                               stop = ftrace_now(cpu);
28353 +                       if (start) {
28354 +                               long latency = ((long) (stop - start)) /
28355 +                                   NSECS_PER_USECS;
28357 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
28358 +                                   latency, 0, stop, NULL);
28359 +                       }
28360 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
28361 +               }
28362 +#endif
28363 +       }
28365 +#endif
28367 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28368 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
28369 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
28370 +       int cpu)
28372 +       int old_cpu = task_cpu(task);
28374 +       if (cpu != old_cpu) {
28375 +               unsigned long flags;
28376 +               struct task_struct *cpu_wakeup_task;
28378 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
28380 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
28381 +               if (task == cpu_wakeup_task) {
28382 +                       put_task_struct(cpu_wakeup_task);
28383 +                       per_cpu(wakeup_task, old_cpu) = NULL;
28384 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
28385 +                       get_task_struct(cpu_wakeup_task);
28386 +               }
28388 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
28389 +       }
28392 +static notrace void probe_wakeup_latency_hist_start(void *v,
28393 +       struct task_struct *p)
28395 +       unsigned long flags;
28396 +       struct task_struct *curr = current;
28397 +       int cpu = task_cpu(p);
28398 +       struct task_struct *cpu_wakeup_task;
28400 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
28402 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
28404 +       if (wakeup_pid) {
28405 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
28406 +                   p->prio == curr->prio)
28407 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
28408 +               if (likely(wakeup_pid != task_pid_nr(p)))
28409 +                       goto out;
28410 +       } else {
28411 +               if (likely(!rt_task(p)) ||
28412 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
28413 +                   p->prio > curr->prio)
28414 +                       goto out;
28415 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
28416 +                   p->prio == curr->prio)
28417 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
28418 +       }
28420 +       if (cpu_wakeup_task)
28421 +               put_task_struct(cpu_wakeup_task);
28422 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
28423 +       get_task_struct(cpu_wakeup_task);
28424 +       cpu_wakeup_task->preempt_timestamp_hist =
28425 +               ftrace_now(raw_smp_processor_id());
28426 +out:
28427 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
28430 +static notrace void probe_wakeup_latency_hist_stop(void *v,
28431 +       bool preempt, struct task_struct *prev, struct task_struct *next)
28433 +       unsigned long flags;
28434 +       int cpu = task_cpu(next);
28435 +       long latency;
28436 +       cycle_t stop;
28437 +       struct task_struct *cpu_wakeup_task;
28439 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
28441 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
28443 +       if (cpu_wakeup_task == NULL)
28444 +               goto out;
28446 +       /* Already running? */
28447 +       if (unlikely(current == cpu_wakeup_task))
28448 +               goto out_reset;
28450 +       if (next != cpu_wakeup_task) {
28451 +               if (next->prio < cpu_wakeup_task->prio)
28452 +                       goto out_reset;
28454 +               if (next->prio == cpu_wakeup_task->prio)
28455 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
28457 +               goto out;
28458 +       }
28460 +       if (current->prio == cpu_wakeup_task->prio)
28461 +               per_cpu(wakeup_sharedprio, cpu) = 1;
28463 +       /*
28464 +        * The task we are waiting for is about to be switched to.
28465 +        * Calculate the latency and store it in the histogram.
28466 +        */
28467 +       stop = ftrace_now(raw_smp_processor_id());
28469 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
28470 +           NSECS_PER_USECS;
28472 +       if (per_cpu(wakeup_sharedprio, cpu)) {
28473 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
28474 +                   next);
28475 +               per_cpu(wakeup_sharedprio, cpu) = 0;
28476 +       } else {
28477 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
28478 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28479 +               if (timerandwakeup_enabled_data.enabled) {
28480 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
28481 +                           next->timer_offset + latency, next->timer_offset,
28482 +                           stop, next);
28483 +               }
28484 +#endif
28485 +       }
28487 +out_reset:
28488 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28489 +       next->timer_offset = 0;
28490 +#endif
28491 +       put_task_struct(cpu_wakeup_task);
28492 +       per_cpu(wakeup_task, cpu) = NULL;
28493 +out:
28494 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
28496 +#endif
28498 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28499 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
28500 +       long long latency_ns, struct task_struct *curr,
28501 +       struct task_struct *task)
28503 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
28504 +           (task->prio < curr->prio ||
28505 +           (task->prio == curr->prio &&
28506 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
28507 +               long latency;
28508 +               cycle_t now;
28510 +               if (missed_timer_offsets_pid) {
28511 +                       if (likely(missed_timer_offsets_pid !=
28512 +                           task_pid_nr(task)))
28513 +                               return;
28514 +               }
28516 +               now = ftrace_now(cpu);
28517 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
28518 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
28519 +                   task);
28520 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28521 +               task->timer_offset = latency;
28522 +#endif
28523 +       }
28525 +#endif
28527 +static __init int latency_hist_init(void)
28529 +       struct dentry *latency_hist_root = NULL;
28530 +       struct dentry *dentry;
28531 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28532 +       struct dentry *dentry_sharedprio;
28533 +#endif
28534 +       struct dentry *entry;
28535 +       struct dentry *enable_root;
28536 +       int i = 0;
28537 +       struct hist_data *my_hist;
28538 +       char name[64];
28539 +       char *cpufmt = "CPU%d";
28540 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
28541 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
28542 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
28543 +       struct maxlatproc_data *mp = NULL;
28544 +#endif
28546 +       dentry = tracing_init_dentry();
28547 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
28548 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
28550 +#ifdef CONFIG_INTERRUPT_OFF_HIST
28551 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
28552 +       for_each_possible_cpu(i) {
28553 +               sprintf(name, cpufmt, i);
28554 +               entry = debugfs_create_file(name, 0444, dentry,
28555 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
28556 +               my_hist = &per_cpu(irqsoff_hist, i);
28557 +               atomic_set(&my_hist->hist_mode, 1);
28558 +               my_hist->min_lat = LONG_MAX;
28559 +       }
28560 +       entry = debugfs_create_file("reset", 0644, dentry,
28561 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
28562 +#endif
28564 +#ifdef CONFIG_PREEMPT_OFF_HIST
28565 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
28566 +           latency_hist_root);
28567 +       for_each_possible_cpu(i) {
28568 +               sprintf(name, cpufmt, i);
28569 +               entry = debugfs_create_file(name, 0444, dentry,
28570 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
28571 +               my_hist = &per_cpu(preemptoff_hist, i);
28572 +               atomic_set(&my_hist->hist_mode, 1);
28573 +               my_hist->min_lat = LONG_MAX;
28574 +       }
28575 +       entry = debugfs_create_file("reset", 0644, dentry,
28576 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
28577 +#endif
28579 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
28580 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
28581 +           latency_hist_root);
28582 +       for_each_possible_cpu(i) {
28583 +               sprintf(name, cpufmt, i);
28584 +               entry = debugfs_create_file(name, 0444, dentry,
28585 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
28586 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
28587 +               atomic_set(&my_hist->hist_mode, 1);
28588 +               my_hist->min_lat = LONG_MAX;
28589 +       }
28590 +       entry = debugfs_create_file("reset", 0644, dentry,
28591 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
28592 +#endif
28594 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
28595 +       entry = debugfs_create_file("preemptirqsoff", 0644,
28596 +           enable_root, (void *)&preemptirqsoff_enabled_data,
28597 +           &enable_fops);
28598 +#endif
28600 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28601 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
28602 +           latency_hist_root);
28603 +       dentry_sharedprio = debugfs_create_dir(
28604 +           wakeup_latency_hist_dir_sharedprio, dentry);
28605 +       for_each_possible_cpu(i) {
28606 +               sprintf(name, cpufmt, i);
28608 +               entry = debugfs_create_file(name, 0444, dentry,
28609 +                   &per_cpu(wakeup_latency_hist, i),
28610 +                   &latency_hist_fops);
28611 +               my_hist = &per_cpu(wakeup_latency_hist, i);
28612 +               atomic_set(&my_hist->hist_mode, 1);
28613 +               my_hist->min_lat = LONG_MAX;
28615 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
28616 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
28617 +                   &latency_hist_fops);
28618 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
28619 +               atomic_set(&my_hist->hist_mode, 1);
28620 +               my_hist->min_lat = LONG_MAX;
28622 +               sprintf(name, cpufmt_maxlatproc, i);
28624 +               mp = &per_cpu(wakeup_maxlatproc, i);
28625 +               entry = debugfs_create_file(name, 0444, dentry, mp,
28626 +                   &maxlatproc_fops);
28627 +               clear_maxlatprocdata(mp);
28629 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
28630 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
28631 +                   &maxlatproc_fops);
28632 +               clear_maxlatprocdata(mp);
28633 +       }
28634 +       entry = debugfs_create_file("pid", 0644, dentry,
28635 +           (void *)&wakeup_pid, &pid_fops);
28636 +       entry = debugfs_create_file("reset", 0644, dentry,
28637 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
28638 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
28639 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
28640 +       entry = debugfs_create_file("wakeup", 0644,
28641 +           enable_root, (void *)&wakeup_latency_enabled_data,
28642 +           &enable_fops);
28643 +#endif
28645 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28646 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
28647 +           latency_hist_root);
28648 +       for_each_possible_cpu(i) {
28649 +               sprintf(name, cpufmt, i);
28650 +               entry = debugfs_create_file(name, 0444, dentry,
28651 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
28652 +               my_hist = &per_cpu(missed_timer_offsets, i);
28653 +               atomic_set(&my_hist->hist_mode, 1);
28654 +               my_hist->min_lat = LONG_MAX;
28656 +               sprintf(name, cpufmt_maxlatproc, i);
28657 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
28658 +               entry = debugfs_create_file(name, 0444, dentry, mp,
28659 +                   &maxlatproc_fops);
28660 +               clear_maxlatprocdata(mp);
28661 +       }
28662 +       entry = debugfs_create_file("pid", 0644, dentry,
28663 +           (void *)&missed_timer_offsets_pid, &pid_fops);
28664 +       entry = debugfs_create_file("reset", 0644, dentry,
28665 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
28666 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
28667 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
28668 +           &enable_fops);
28669 +#endif
28671 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
28672 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
28673 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
28674 +           latency_hist_root);
28675 +       for_each_possible_cpu(i) {
28676 +               sprintf(name, cpufmt, i);
28677 +               entry = debugfs_create_file(name, 0444, dentry,
28678 +                   &per_cpu(timerandwakeup_latency_hist, i),
28679 +                   &latency_hist_fops);
28680 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
28681 +               atomic_set(&my_hist->hist_mode, 1);
28682 +               my_hist->min_lat = LONG_MAX;
28684 +               sprintf(name, cpufmt_maxlatproc, i);
28685 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
28686 +               entry = debugfs_create_file(name, 0444, dentry, mp,
28687 +                   &maxlatproc_fops);
28688 +               clear_maxlatprocdata(mp);
28689 +       }
28690 +       entry = debugfs_create_file("reset", 0644, dentry,
28691 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
28692 +       entry = debugfs_create_file("timerandwakeup", 0644,
28693 +           enable_root, (void *)&timerandwakeup_enabled_data,
28694 +           &enable_fops);
28695 +#endif
28696 +       return 0;
28699 +device_initcall(latency_hist_init);
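To summarize the bucketing that latency_hist() above implements and that
l_show() prints, here is a small stand-alone model (a sketch only, not kernel
code): a sample in microseconds is shifted by the per-histogram offset and used
as an index into a fixed array of MAX_ENTRY_NUM buckets, while samples falling
outside the array only bump the below/above counters, exactly as reported in
the '#' header emitted by l_start().

    /* Stand-alone model of the bucketing done by latency_hist() above. */
    #include <stdio.h>

    #define MAX_ENTRY_NUM 10240

    struct hist_model {
            long offset;            /* 0, or MAX_ENTRY_NUM/2 for a bipolar scale */
            unsigned long long below, above;
            unsigned long long bucket[MAX_ENTRY_NUM];
    };

    static void record(struct hist_model *h, long latency_us)
    {
            long idx = latency_us + h->offset;

            if (idx < 0)
                    h->below++;
            else if (idx >= MAX_ENTRY_NUM)
                    h->above++;
            else
                    h->bucket[idx]++;
    }

    int main(void)
    {
            static struct hist_model h;     /* offset 0, as for the irqsoff histogram */
            long samples[] = { 3, 3, 7, 12000, -1 };
            unsigned long i;

            for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                    record(&h, samples[i]);

            /* prints: bucket[3]=2 above=1 below=1 */
            printf("bucket[3]=%llu above=%llu below=%llu\n",
                   h.bucket[3], h.above, h.below);
            return 0;
    }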
28700 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
28701 index 8aef4e63ac57..cdb7742283e5 100644
28702 --- a/kernel/trace/trace.c
28703 +++ b/kernel/trace/trace.c
28704 @@ -1652,6 +1652,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
28705         struct task_struct *tsk = current;
28707         entry->preempt_count            = pc & 0xff;
28708 +       entry->preempt_lazy_count       = preempt_lazy_count();
28709         entry->pid                      = (tsk) ? tsk->pid : 0;
28710         entry->flags =
28711  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
28712 @@ -1661,8 +1662,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
28713  #endif
28714                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
28715                 ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
28716 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
28717 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
28718 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
28719                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
28721 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
28723  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
28725 @@ -2555,14 +2559,17 @@ get_total_entries(struct trace_buffer *buf,
28727  static void print_lat_help_header(struct seq_file *m)
28729 -       seq_puts(m, "#                  _------=> CPU#            \n"
28730 -                   "#                 / _-----=> irqs-off        \n"
28731 -                   "#                | / _----=> need-resched    \n"
28732 -                   "#                || / _---=> hardirq/softirq \n"
28733 -                   "#                ||| / _--=> preempt-depth   \n"
28734 -                   "#                |||| /     delay            \n"
28735 -                   "#  cmd     pid   ||||| time  |   caller      \n"
28736 -                   "#     \\   /      |||||  \\    |   /         \n");
28737 +       seq_puts(m, "#                  _--------=> CPU#              \n"
28738 +                   "#                 / _-------=> irqs-off          \n"
28739 +                   "#                | / _------=> need-resched      \n"
28740 +                   "#                || / _-----=> need-resched_lazy \n"
28741 +                   "#                ||| / _----=> hardirq/softirq   \n"
28742 +                   "#                |||| / _---=> preempt-depth     \n"
28743 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
28744 +                   "#                |||||| / _-=> migrate-disable   \n"
28745 +                   "#                ||||||| /     delay             \n"
28746 +                   "# cmd     pid    |||||||| time   |  caller       \n"
28747 +                   "#     \\   /      ||||||||   \\    |  /            \n");
28750  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
28751 @@ -2588,11 +2595,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
28752         print_event_info(buf, m);
28753         seq_puts(m, "#                              _-----=> irqs-off\n"
28754                     "#                             / _----=> need-resched\n"
28755 -                   "#                            | / _---=> hardirq/softirq\n"
28756 -                   "#                            || / _--=> preempt-depth\n"
28757 -                   "#                            ||| /     delay\n"
28758 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
28759 -                   "#              | |       |   ||||       |         |\n");
28760 +                   "#                            |/  _-----=> need-resched_lazy\n"
28761 +                   "#                            || / _---=> hardirq/softirq\n"
28762 +                   "#                            ||| / _--=> preempt-depth\n"
28763 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
28764 +                   "#                            ||||| / _-=> migrate-disable   \n"
28765 +                   "#                            |||||| /    delay\n"
28766 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
28767 +                   "#              | |       |   |||||||      |         |\n");
28770  void
28771 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
28772 index 919d9d07686f..3bf86ece683c 100644
28773 --- a/kernel/trace/trace.h
28774 +++ b/kernel/trace/trace.h
28775 @@ -117,6 +117,7 @@ struct kretprobe_trace_entry_head {
28776   *  NEED_RESCHED       - reschedule is requested
28777   *  HARDIRQ            - inside an interrupt handler
28778   *  SOFTIRQ            - inside a softirq handler
28779 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
28780   */
28781  enum trace_flag_type {
28782         TRACE_FLAG_IRQS_OFF             = 0x01,
28783 @@ -125,6 +126,7 @@ enum trace_flag_type {
28784         TRACE_FLAG_HARDIRQ              = 0x08,
28785         TRACE_FLAG_SOFTIRQ              = 0x10,
28786         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
28787 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x40,
28788  };
28790  #define TRACE_BUF_SIZE         1024
28791 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
28792 index 996f0fd34312..5bd79b347398 100644
28793 --- a/kernel/trace/trace_events.c
28794 +++ b/kernel/trace/trace_events.c
28795 @@ -188,6 +188,8 @@ static int trace_define_common_fields(void)
28796         __common_field(unsigned char, flags);
28797         __common_field(unsigned char, preempt_count);
28798         __common_field(int, pid);
28799 +       __common_field(unsigned short, migrate_disable);
28800 +       __common_field(unsigned short, padding);
28802         return ret;
28804 @@ -244,6 +246,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
28806         local_save_flags(fbuffer->flags);
28807         fbuffer->pc = preempt_count();
28808 +       /*
28809 +        * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
28810 +        * preemption (adding one to the preempt_count). Since we are
28811 +        * interested in the preempt_count at the time the tracepoint was
28812 +        * hit, we need to subtract one to offset the increment.
28813 +        */
28814 +       if (IS_ENABLED(CONFIG_PREEMPT))
28815 +               fbuffer->pc--;
28816         fbuffer->trace_file = trace_file;
28818         fbuffer->event =
28819 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
28820 index be3222b7d72e..553e71254ad6 100644
28821 --- a/kernel/trace/trace_irqsoff.c
28822 +++ b/kernel/trace/trace_irqsoff.c
28823 @@ -13,6 +13,7 @@
28824  #include <linux/uaccess.h>
28825  #include <linux/module.h>
28826  #include <linux/ftrace.h>
28827 +#include <trace/events/hist.h>
28829  #include "trace.h"
28831 @@ -424,11 +425,13 @@ void start_critical_timings(void)
28833         if (preempt_trace() || irq_trace())
28834                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
28835 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
28837  EXPORT_SYMBOL_GPL(start_critical_timings);
28839  void stop_critical_timings(void)
28841 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
28842         if (preempt_trace() || irq_trace())
28843                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
28845 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
28846  #ifdef CONFIG_PROVE_LOCKING
28847  void time_hardirqs_on(unsigned long a0, unsigned long a1)
28849 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
28850         if (!preempt_trace() && irq_trace())
28851                 stop_critical_timing(a0, a1);
28853 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
28855         if (!preempt_trace() && irq_trace())
28856                 start_critical_timing(a0, a1);
28857 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
28860  #else /* !CONFIG_PROVE_LOCKING */
28861 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
28862   */
28863  void trace_hardirqs_on(void)
28865 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
28866         if (!preempt_trace() && irq_trace())
28867                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
28869 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
28871         if (!preempt_trace() && irq_trace())
28872                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
28873 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
28875  EXPORT_SYMBOL(trace_hardirqs_off);
28877  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
28879 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
28880         if (!preempt_trace() && irq_trace())
28881                 stop_critical_timing(CALLER_ADDR0, caller_addr);
28883 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
28885         if (!preempt_trace() && irq_trace())
28886                 start_critical_timing(CALLER_ADDR0, caller_addr);
28887 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
28889  EXPORT_SYMBOL(trace_hardirqs_off_caller);
28891 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
28892  #ifdef CONFIG_PREEMPT_TRACER
28893  void trace_preempt_on(unsigned long a0, unsigned long a1)
28895 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
28896         if (preempt_trace() && !irq_trace())
28897                 stop_critical_timing(a0, a1);
28900  void trace_preempt_off(unsigned long a0, unsigned long a1)
28902 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
28903         if (preempt_trace() && !irq_trace())
28904                 start_critical_timing(a0, a1);
28906 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
28907 index 282982195e09..9f19d839a756 100644
28908 --- a/kernel/trace/trace_output.c
28909 +++ b/kernel/trace/trace_output.c
28910 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
28912         char hardsoft_irq;
28913         char need_resched;
28914 +       char need_resched_lazy;
28915         char irqs_off;
28916         int hardirq;
28917         int softirq;
28918 @@ -413,6 +414,8 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
28919                 need_resched = '.';
28920                 break;
28921         }
28922 +       need_resched_lazy =
28923 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
28925         hardsoft_irq =
28926                 (hardirq && softirq) ? 'H' :
28927 @@ -420,14 +423,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
28928                 softirq ? 's' :
28929                 '.';
28931 -       trace_seq_printf(s, "%c%c%c",
28932 -                        irqs_off, need_resched, hardsoft_irq);
28933 +       trace_seq_printf(s, "%c%c%c%c",
28934 +                        irqs_off, need_resched, need_resched_lazy,
28935 +                        hardsoft_irq);
28937         if (entry->preempt_count)
28938                 trace_seq_printf(s, "%x", entry->preempt_count);
28939         else
28940                 trace_seq_putc(s, '.');
28942 +       if (entry->preempt_lazy_count)
28943 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
28944 +       else
28945 +               trace_seq_putc(s, '.');
28947 +       if (entry->migrate_disable)
28948 +               trace_seq_printf(s, "%x", entry->migrate_disable);
28949 +       else
28950 +               trace_seq_putc(s, '.');
28952         return !trace_seq_has_overflowed(s);
28955 diff --git a/kernel/user.c b/kernel/user.c
28956 index b069ccbfb0b0..1a2e88e98b5e 100644
28957 --- a/kernel/user.c
28958 +++ b/kernel/user.c
28959 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
28960         if (!up)
28961                 return;
28963 -       local_irq_save(flags);
28964 +       local_irq_save_nort(flags);
28965         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
28966                 free_user(up, flags);
28967         else
28968 -               local_irq_restore(flags);
28969 +               local_irq_restore_nort(flags);
28972  struct user_struct *alloc_uid(kuid_t uid)
28973 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
28974 index c1e0b5f429b6..fa2e079cc314 100644
28975 --- a/kernel/watchdog.c
28976 +++ b/kernel/watchdog.c
28977 @@ -299,6 +299,8 @@ static int is_softlockup(unsigned long touch_ts)
28979  #ifdef CONFIG_HARDLOCKUP_DETECTOR
28981 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
28983  static struct perf_event_attr wd_hw_attr = {
28984         .type           = PERF_TYPE_HARDWARE,
28985         .config         = PERF_COUNT_HW_CPU_CYCLES,
28986 @@ -332,6 +334,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
28987                 /* only print hardlockups once */
28988                 if (__this_cpu_read(hard_watchdog_warn) == true)
28989                         return;
28990 +               /*
28991 +                * If early-printk is enabled then make sure we do not
28992 +                * lock up in printk() and kill console logging:
28993 +                */
28994 +               printk_kill();
28996 +               raw_spin_lock(&watchdog_output_lock);
28998                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
28999                 print_modules();
29000 @@ -349,8 +358,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
29001                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
29002                         trigger_allbutself_cpu_backtrace();
29004 +               raw_spin_unlock(&watchdog_output_lock);
29005                 if (hardlockup_panic)
29006 -                       panic("Hard LOCKUP");
29007 +                       nmi_panic(regs, "Hard LOCKUP");
29009                 __this_cpu_write(hard_watchdog_warn, true);
29010                 return;
29011 @@ -496,6 +506,7 @@ static void watchdog_enable(unsigned int cpu)
29012         /* kick off the timer for the hardlockup detector */
29013         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
29014         hrtimer->function = watchdog_timer_fn;
29015 +       hrtimer->irqsafe = 1;
29017         /* Enable the perf event */
29018         watchdog_nmi_enable(cpu);
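Three RT-related adjustments meet in the watchdog: the hardlockup report is serialized with a raw spinlock so simultaneous NMIs on several CPUs do not interleave their output (a raw lock because this context must never sleep, even on RT), the panic is routed through nmi_panic() so a CPU that hits the NMI while another CPU is already panicking does not deadlock, and the watchdog hrtimer is marked irqsafe so it keeps firing from hard interrupt context on RT, where ordinary hrtimers are deferred to softirq. The serialization pattern in isolation, with placeholder names:

    static DEFINE_RAW_SPINLOCK(report_lock);        /* placeholder */

    static void report_hard_lockup(int cpu)
    {
            raw_spin_lock(&report_lock);            /* NMI-safe: never sleeps */
            pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
            print_modules();
            raw_spin_unlock(&report_lock);
    }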
29019 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
29020 index 85555eb4d3cb..79f33789f330 100644
29021 --- a/kernel/workqueue.c
29022 +++ b/kernel/workqueue.c
29023 @@ -48,6 +48,8 @@
29024  #include <linux/nodemask.h>
29025  #include <linux/moduleparam.h>
29026  #include <linux/uaccess.h>
29027 +#include <linux/locallock.h>
29028 +#include <linux/delay.h>
29030  #include "workqueue_internal.h"
29032 @@ -122,11 +124,16 @@ enum {
29033   *    cpu or grabbing pool->lock is enough for read access.  If
29034   *    POOL_DISASSOCIATED is set, it's identical to L.
29035   *
29036 + *    On RT we need the extra protection via rt_lock_idle_list() for
29037 + *    the list manipulations against read access from
29038 + *    wq_worker_sleeping(). All other places are nicely serialized via
29039 + *    pool->lock.
29040 + *
29041   * A: pool->attach_mutex protected.
29042   *
29043   * PL: wq_pool_mutex protected.
29044   *
29045 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
29046 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
29047   *
29048   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
29049   *
29050 @@ -135,7 +142,7 @@ enum {
29051   *
29052   * WQ: wq->mutex protected.
29053   *
29054 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
29055 + * WR: wq->mutex protected for writes.  RCU protected for reads.
29056   *
29057   * MD: wq_mayday_lock protected.
29058   */
29059 @@ -183,7 +190,7 @@ struct worker_pool {
29060         atomic_t                nr_running ____cacheline_aligned_in_smp;
29062         /*
29063 -        * Destruction of pool is sched-RCU protected to allow dereferences
29064 +        * Destruction of pool is RCU protected to allow dereferences
29065          * from get_work_pool().
29066          */
29067         struct rcu_head         rcu;
29068 @@ -212,7 +219,7 @@ struct pool_workqueue {
29069         /*
29070          * Release of unbound pwq is punted to system_wq.  See put_pwq()
29071          * and pwq_unbound_release_workfn() for details.  pool_workqueue
29072 -        * itself is also sched-RCU protected so that the first pwq can be
29073 +        * itself is also RCU protected so that the first pwq can be
29074          * determined without grabbing wq->mutex.
29075          */
29076         struct work_struct      unbound_release_work;
29077 @@ -332,6 +339,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
29078  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
29079  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
29081 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
29083  static int worker_thread(void *__worker);
29084  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
29086 @@ -339,20 +348,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
29087  #include <trace/events/workqueue.h>
29089  #define assert_rcu_or_pool_mutex()                                     \
29090 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
29091 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
29092                          !lockdep_is_held(&wq_pool_mutex),              \
29093 -                        "sched RCU or wq_pool_mutex should be held")
29094 +                        "RCU or wq_pool_mutex should be held")
29096  #define assert_rcu_or_wq_mutex(wq)                                     \
29097 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
29098 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
29099                          !lockdep_is_held(&wq->mutex),                  \
29100 -                        "sched RCU or wq->mutex should be held")
29101 +                        "RCU or wq->mutex should be held")
29103  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
29104 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
29105 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
29106                          !lockdep_is_held(&wq->mutex) &&                \
29107                          !lockdep_is_held(&wq_pool_mutex),              \
29108 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
29109 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
29111  #define for_each_cpu_worker_pool(pool, cpu)                            \
29112         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
29113 @@ -364,7 +373,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
29114   * @pool: iteration cursor
29115   * @pi: integer used for iteration
29116   *
29117 - * This must be called either with wq_pool_mutex held or sched RCU read
29118 + * This must be called either with wq_pool_mutex held or RCU read
29119   * locked.  If the pool needs to be used beyond the locking in effect, the
29120   * caller is responsible for guaranteeing that the pool stays online.
29121   *
29122 @@ -396,7 +405,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
29123   * @pwq: iteration cursor
29124   * @wq: the target workqueue
29125   *
29126 - * This must be called either with wq->mutex held or sched RCU read locked.
29127 + * This must be called either with wq->mutex held or RCU read locked.
29128   * If the pwq needs to be used beyond the locking in effect, the caller is
29129   * responsible for guaranteeing that the pwq stays online.
29130   *
29131 @@ -408,6 +417,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
29132                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
29133                 else
29135 +#ifdef CONFIG_PREEMPT_RT_BASE
29136 +static inline void rt_lock_idle_list(struct worker_pool *pool)
29138 +       preempt_disable();
29140 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
29142 +       preempt_enable();
29144 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
29145 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
29146 +#else
29147 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
29148 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
29149 +static inline void sched_lock_idle_list(struct worker_pool *pool)
29151 +       spin_lock_irq(&pool->lock);
29153 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
29155 +       spin_unlock_irq(&pool->lock);
29157 +#endif
29160  #ifdef CONFIG_DEBUG_OBJECTS_WORK
29162  static struct debug_obj_descr work_debug_descr;
29163 @@ -558,7 +592,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
29164   * @wq: the target workqueue
29165   * @node: the node ID
29166   *
29167 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
29168 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
29169   * read locked.
29170   * If the pwq needs to be used beyond the locking in effect, the caller is
29171   * responsible for guaranteeing that the pwq stays online.
29172 @@ -702,8 +736,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
29173   * @work: the work item of interest
29174   *
29175   * Pools are created and destroyed under wq_pool_mutex, and allows read
29176 - * access under sched-RCU read lock.  As such, this function should be
29177 - * called under wq_pool_mutex or with preemption disabled.
29178 + * access under RCU read lock.  As such, this function should be
29179 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
29180   *
29181   * All fields of the returned pool are accessible as long as the above
29182   * mentioned locking is in effect.  If the returned pool needs to be used
29183 @@ -840,51 +874,44 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
29184   */
29185  static void wake_up_worker(struct worker_pool *pool)
29187 -       struct worker *worker = first_idle_worker(pool);
29188 +       struct worker *worker;
29190 +       rt_lock_idle_list(pool);
29192 +       worker = first_idle_worker(pool);
29194         if (likely(worker))
29195                 wake_up_process(worker->task);
29197 +       rt_unlock_idle_list(pool);
29200  /**
29201 - * wq_worker_waking_up - a worker is waking up
29202 - * @task: task waking up
29203 - * @cpu: CPU @task is waking up to
29204 + * wq_worker_running - a worker is running again
29205 + * @task: task returning from sleep
29206   *
29207 - * This function is called during try_to_wake_up() when a worker is
29208 - * being awoken.
29209 - *
29210 - * CONTEXT:
29211 - * spin_lock_irq(rq->lock)
29212 + * This function is called when a worker returns from schedule()
29213   */
29214 -void wq_worker_waking_up(struct task_struct *task, int cpu)
29215 +void wq_worker_running(struct task_struct *task)
29217         struct worker *worker = kthread_data(task);
29219 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
29220 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
29221 +       if (!worker->sleeping)
29222 +               return;
29223 +       if (!(worker->flags & WORKER_NOT_RUNNING))
29224                 atomic_inc(&worker->pool->nr_running);
29225 -       }
29226 +       worker->sleeping = 0;
29229  /**
29230   * wq_worker_sleeping - a worker is going to sleep
29231   * @task: task going to sleep
29232 - * @cpu: CPU in question, must be the current CPU number
29233 - *
29234 - * This function is called during schedule() when a busy worker is
29235 - * going to sleep.  Worker on the same cpu can be woken up by
29236 - * returning pointer to its task.
29237 - *
29238 - * CONTEXT:
29239 - * spin_lock_irq(rq->lock)
29240 - *
29241 - * Return:
29242 - * Worker task on @cpu to wake up, %NULL if none.
29243 + * This function is called from schedule() when a busy worker is
29244 + * going to sleep.
29245   */
29246 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
29247 +void wq_worker_sleeping(struct task_struct *task)
29249 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
29250 +       struct worker *worker = kthread_data(task);
29251         struct worker_pool *pool;
29253         /*
29254 @@ -893,29 +920,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
29255          * checking NOT_RUNNING.
29256          */
29257         if (worker->flags & WORKER_NOT_RUNNING)
29258 -               return NULL;
29259 +               return;
29261         pool = worker->pool;
29263 -       /* this can only happen on the local cpu */
29264 -       if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
29265 -               return NULL;
29266 +       if (WARN_ON_ONCE(worker->sleeping))
29267 +               return;
29269 +       worker->sleeping = 1;
29271         /*
29272          * The counterpart of the following dec_and_test, implied mb,
29273          * worklist not empty test sequence is in insert_work().
29274          * Please read comment there.
29275 -        *
29276 -        * NOT_RUNNING is clear.  This means that we're bound to and
29277 -        * running on the local cpu w/ rq lock held and preemption
29278 -        * disabled, which in turn means that none else could be
29279 -        * manipulating idle_list, so dereferencing idle_list without pool
29280 -        * lock is safe.
29281          */
29282         if (atomic_dec_and_test(&pool->nr_running) &&
29283 -           !list_empty(&pool->worklist))
29284 -               to_wakeup = first_idle_worker(pool);
29285 -       return to_wakeup ? to_wakeup->task : NULL;
29286 +           !list_empty(&pool->worklist)) {
29287 +               sched_lock_idle_list(pool);
29288 +               wake_up_worker(pool);
29289 +               sched_unlock_idle_list(pool);
29290 +       }
29293  /**
29294 @@ -1109,12 +1133,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
29296         if (pwq) {
29297                 /*
29298 -                * As both pwqs and pools are sched-RCU protected, the
29299 +                * As both pwqs and pools are RCU protected, the
29300                  * following lock operations are safe.
29301                  */
29302 -               spin_lock_irq(&pwq->pool->lock);
29303 +               rcu_read_lock();
29304 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
29305                 put_pwq(pwq);
29306 -               spin_unlock_irq(&pwq->pool->lock);
29307 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
29308 +               rcu_read_unlock();
29309         }
29312 @@ -1216,7 +1242,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
29313         struct worker_pool *pool;
29314         struct pool_workqueue *pwq;
29316 -       local_irq_save(*flags);
29317 +       local_lock_irqsave(pendingb_lock, *flags);
29319         /* try to steal the timer if it exists */
29320         if (is_dwork) {
29321 @@ -1235,6 +1261,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
29322         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
29323                 return 0;
29325 +       rcu_read_lock();
29326         /*
29327          * The queueing is in progress, or it is already queued. Try to
29328          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
29329 @@ -1273,14 +1300,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
29330                 set_work_pool_and_keep_pending(work, pool->id);
29332                 spin_unlock(&pool->lock);
29333 +               rcu_read_unlock();
29334                 return 1;
29335         }
29336         spin_unlock(&pool->lock);
29337  fail:
29338 -       local_irq_restore(*flags);
29339 +       rcu_read_unlock();
29340 +       local_unlock_irqrestore(pendingb_lock, *flags);
29341         if (work_is_canceling(work))
29342                 return -ENOENT;
29343 -       cpu_relax();
29344 +       cpu_chill();
29345         return -EAGAIN;
29348 @@ -1349,7 +1378,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
29349          * queued or lose PENDING.  Grabbing PENDING and queueing should
29350          * happen with IRQ disabled.
29351          */
29352 -       WARN_ON_ONCE(!irqs_disabled());
29353 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
29355         debug_work_activate(work);
29357 @@ -1357,6 +1386,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
29358         if (unlikely(wq->flags & __WQ_DRAINING) &&
29359             WARN_ON_ONCE(!is_chained_work(wq)))
29360                 return;
29362 +       rcu_read_lock();
29363  retry:
29364         if (req_cpu == WORK_CPU_UNBOUND)
29365                 cpu = raw_smp_processor_id();
29366 @@ -1413,10 +1444,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
29367         /* pwq determined, queue */
29368         trace_workqueue_queue_work(req_cpu, pwq, work);
29370 -       if (WARN_ON(!list_empty(&work->entry))) {
29371 -               spin_unlock(&pwq->pool->lock);
29372 -               return;
29373 -       }
29374 +       if (WARN_ON(!list_empty(&work->entry)))
29375 +               goto out;
29377         pwq->nr_in_flight[pwq->work_color]++;
29378         work_flags = work_color_to_flags(pwq->work_color);
29379 @@ -1432,7 +1461,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
29381         insert_work(pwq, work, worklist, work_flags);
29383 +out:
29384         spin_unlock(&pwq->pool->lock);
29385 +       rcu_read_unlock();
29388  /**
29389 @@ -1452,14 +1483,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
29390         bool ret = false;
29391         unsigned long flags;
29393 -       local_irq_save(flags);
29394 +       local_lock_irqsave(pendingb_lock, flags);
29396         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
29397                 __queue_work(cpu, wq, work);
29398                 ret = true;
29399         }
29401 -       local_irq_restore(flags);
29402 +       local_unlock_irqrestore(pendingb_lock, flags);
29403         return ret;
29405  EXPORT_SYMBOL(queue_work_on);
29406 @@ -1527,14 +1558,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
29407         unsigned long flags;
29409         /* read the comment in __queue_work() */
29410 -       local_irq_save(flags);
29411 +       local_lock_irqsave(pendingb_lock, flags);
29413         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
29414                 __queue_delayed_work(cpu, wq, dwork, delay);
29415                 ret = true;
29416         }
29418 -       local_irq_restore(flags);
29419 +       local_unlock_irqrestore(pendingb_lock, flags);
29420         return ret;
29422  EXPORT_SYMBOL(queue_delayed_work_on);
29423 @@ -1569,7 +1600,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
29425         if (likely(ret >= 0)) {
29426                 __queue_delayed_work(cpu, wq, dwork, delay);
29427 -               local_irq_restore(flags);
29428 +               local_unlock_irqrestore(pendingb_lock, flags);
29429         }
29431         /* -ENOENT from try_to_grab_pending() becomes %true */
29432 @@ -1602,7 +1633,9 @@ static void worker_enter_idle(struct worker *worker)
29433         worker->last_active = jiffies;
29435         /* idle_list is LIFO */
29436 +       rt_lock_idle_list(pool);
29437         list_add(&worker->entry, &pool->idle_list);
29438 +       rt_unlock_idle_list(pool);
29440         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
29441                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
29442 @@ -1635,7 +1668,9 @@ static void worker_leave_idle(struct worker *worker)
29443                 return;
29444         worker_clr_flags(worker, WORKER_IDLE);
29445         pool->nr_idle--;
29446 +       rt_lock_idle_list(pool);
29447         list_del_init(&worker->entry);
29448 +       rt_unlock_idle_list(pool);
29451  static struct worker *alloc_worker(int node)
29452 @@ -1801,7 +1836,9 @@ static void destroy_worker(struct worker *worker)
29453         pool->nr_workers--;
29454         pool->nr_idle--;
29456 +       rt_lock_idle_list(pool);
29457         list_del_init(&worker->entry);
29458 +       rt_unlock_idle_list(pool);
29459         worker->flags |= WORKER_DIE;
29460         wake_up_process(worker->task);
29462 @@ -2711,14 +2748,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
29464         might_sleep();
29466 -       local_irq_disable();
29467 +       rcu_read_lock();
29468         pool = get_work_pool(work);
29469         if (!pool) {
29470 -               local_irq_enable();
29471 +               rcu_read_unlock();
29472                 return false;
29473         }
29475 -       spin_lock(&pool->lock);
29476 +       spin_lock_irq(&pool->lock);
29477         /* see the comment in try_to_grab_pending() with the same code */
29478         pwq = get_work_pwq(work);
29479         if (pwq) {
29480 @@ -2745,10 +2782,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
29481         else
29482                 lock_map_acquire_read(&pwq->wq->lockdep_map);
29483         lock_map_release(&pwq->wq->lockdep_map);
29485 +       rcu_read_unlock();
29486         return true;
29487  already_gone:
29488         spin_unlock_irq(&pool->lock);
29489 +       rcu_read_unlock();
29490         return false;
29493 @@ -2835,7 +2873,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
29495         /* tell other tasks trying to grab @work to back off */
29496         mark_work_canceling(work);
29497 -       local_irq_restore(flags);
29498 +       local_unlock_irqrestore(pendingb_lock, flags);
29500         flush_work(work);
29501         clear_work_data(work);
29502 @@ -2890,10 +2928,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
29503   */
29504  bool flush_delayed_work(struct delayed_work *dwork)
29506 -       local_irq_disable();
29507 +       local_lock_irq(pendingb_lock);
29508         if (del_timer_sync(&dwork->timer))
29509                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
29510 -       local_irq_enable();
29511 +       local_unlock_irq(pendingb_lock);
29512         return flush_work(&dwork->work);
29514  EXPORT_SYMBOL(flush_delayed_work);
29515 @@ -2928,7 +2966,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
29517         set_work_pool_and_clear_pending(&dwork->work,
29518                                         get_work_pool_id(&dwork->work));
29519 -       local_irq_restore(flags);
29520 +       local_unlock_irqrestore(pendingb_lock, flags);
29521         return ret;
29523  EXPORT_SYMBOL(cancel_delayed_work);
29524 @@ -3155,7 +3193,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
29525   * put_unbound_pool - put a worker_pool
29526   * @pool: worker_pool to put
29527   *
29528 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
29529 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
29530   * safe manner.  get_unbound_pool() calls this function on its failure path
29531   * and this function should be able to release pools which went through,
29532   * successfully or not, init_worker_pool().
29533 @@ -3209,8 +3247,8 @@ static void put_unbound_pool(struct worker_pool *pool)
29534         del_timer_sync(&pool->idle_timer);
29535         del_timer_sync(&pool->mayday_timer);
29537 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
29538 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
29539 +       /* RCU protected to allow dereferences from get_work_pool() */
29540 +       call_rcu(&pool->rcu, rcu_free_pool);
29543  /**
29544 @@ -3317,14 +3355,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
29545         put_unbound_pool(pool);
29546         mutex_unlock(&wq_pool_mutex);
29548 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
29549 +       call_rcu(&pwq->rcu, rcu_free_pwq);
29551         /*
29552          * If we're the last pwq going away, @wq is already dead and no one
29553          * is gonna access it anymore.  Schedule RCU free.
29554          */
29555         if (is_last)
29556 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
29557 +               call_rcu(&wq->rcu, rcu_free_wq);
29560  /**
29561 @@ -3991,7 +4029,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
29562                  * The base ref is never dropped on per-cpu pwqs.  Directly
29563                  * schedule RCU free.
29564                  */
29565 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
29566 +               call_rcu(&wq->rcu, rcu_free_wq);
29567         } else {
29568                 /*
29569                  * We're the sole accessor of @wq at this point.  Directly
29570 @@ -4085,7 +4123,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
29571         struct pool_workqueue *pwq;
29572         bool ret;
29574 -       rcu_read_lock_sched();
29575 +       rcu_read_lock();
29576 +       preempt_disable();
29578         if (cpu == WORK_CPU_UNBOUND)
29579                 cpu = smp_processor_id();
29580 @@ -4096,7 +4135,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
29581                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
29583         ret = !list_empty(&pwq->delayed_works);
29584 -       rcu_read_unlock_sched();
29585 +       preempt_enable();
29586 +       rcu_read_unlock();
29588         return ret;
29590 @@ -4122,15 +4162,15 @@ unsigned int work_busy(struct work_struct *work)
29591         if (work_pending(work))
29592                 ret |= WORK_BUSY_PENDING;
29594 -       local_irq_save(flags);
29595 +       rcu_read_lock();
29596         pool = get_work_pool(work);
29597         if (pool) {
29598 -               spin_lock(&pool->lock);
29599 +               spin_lock_irqsave(&pool->lock, flags);
29600                 if (find_worker_executing_work(pool, work))
29601                         ret |= WORK_BUSY_RUNNING;
29602 -               spin_unlock(&pool->lock);
29603 +               spin_unlock_irqrestore(&pool->lock, flags);
29604         }
29605 -       local_irq_restore(flags);
29606 +       rcu_read_unlock();
29608         return ret;
29610 @@ -4319,7 +4359,7 @@ void show_workqueue_state(void)
29611         unsigned long flags;
29612         int pi;
29614 -       rcu_read_lock_sched();
29615 +       rcu_read_lock();
29617         pr_info("Showing busy workqueues and worker pools:\n");
29619 @@ -4370,7 +4410,7 @@ void show_workqueue_state(void)
29620                 spin_unlock_irqrestore(&pool->lock, flags);
29621         }
29623 -       rcu_read_unlock_sched();
29624 +       rcu_read_unlock();
29627  /*
29628 @@ -4731,16 +4771,16 @@ bool freeze_workqueues_busy(void)
29629                  * nr_active is monotonically decreasing.  It's safe
29630                  * to peek without lock.
29631                  */
29632 -               rcu_read_lock_sched();
29633 +               rcu_read_lock();
29634                 for_each_pwq(pwq, wq) {
29635                         WARN_ON_ONCE(pwq->nr_active < 0);
29636                         if (pwq->nr_active) {
29637                                 busy = true;
29638 -                               rcu_read_unlock_sched();
29639 +                               rcu_read_unlock();
29640                                 goto out_unlock;
29641                         }
29642                 }
29643 -               rcu_read_unlock_sched();
29644 +               rcu_read_unlock();
29645         }
29646  out_unlock:
29647         mutex_unlock(&wq_pool_mutex);
29648 @@ -4930,7 +4970,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
29649         const char *delim = "";
29650         int node, written = 0;
29652 -       rcu_read_lock_sched();
29653 +       get_online_cpus();
29654 +       rcu_read_lock();
29655         for_each_node(node) {
29656                 written += scnprintf(buf + written, PAGE_SIZE - written,
29657                                      "%s%d:%d", delim, node,
29658 @@ -4938,7 +4979,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
29659                 delim = " ";
29660         }
29661         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
29662 -       rcu_read_unlock_sched();
29663 +       rcu_read_unlock();
29664 +       put_online_cpus();
29666         return written;
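The workqueue conversion follows a few recurring moves. Every local_irq_save()/restore() pair guarding the PENDING bit becomes the pendingb_lock local lock; sched-RCU protection of pools and pwqs becomes plain RCU (rcu_read_lock() plus call_rcu()), because on RT the read sides can no longer rely on running with preemption disabled; the idle_list gets the extra rt_lock_idle_list() protection described in the comment above so wq_worker_sleeping() can look at it without pool->lock; and the retry loop in try_to_grab_pending() uses cpu_chill() instead of cpu_relax() so a high-priority RT task does not spin against the owner it is waiting for. The local-lock move in isolation, as a minimal sketch with hypothetical names (my_lock, my_count):

    static DEFINE_LOCAL_IRQ_LOCK(my_lock);          /* hypothetical */
    static DEFINE_PER_CPU(unsigned long, my_count); /* hypothetical */

    static void touch_percpu_state(void)
    {
            unsigned long flags;

            /* !RT: behaves like local_irq_save(); RT: per-CPU sleeping lock */
            local_lock_irqsave(my_lock, flags);
            __this_cpu_inc(my_count);
            local_unlock_irqrestore(my_lock, flags);
    }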
29668 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
29669 index 3fa9c146fccb..42d1e3974554 100644
29670 --- a/kernel/workqueue_internal.h
29671 +++ b/kernel/workqueue_internal.h
29672 @@ -44,6 +44,7 @@ struct worker {
29673         unsigned long           last_active;    /* L: last active timestamp */
29674         unsigned int            flags;          /* X: flags */
29675         int                     id;             /* I: worker id */
29676 +       int                     sleeping;       /* None */
29678         /*
29679          * Opaque string set with work_set_desc().  Printed out with task
29680 @@ -69,7 +70,7 @@ static inline struct worker *current_wq_worker(void)
29681   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
29682   * sched/core.c and workqueue.c.
29683   */
29684 -void wq_worker_waking_up(struct task_struct *task, int cpu);
29685 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
29686 +void wq_worker_running(struct task_struct *task);
29687 +void wq_worker_sleeping(struct task_struct *task);
29689  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
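The scheduler hooks change shape: instead of wq_worker_waking_up() being called from try_to_wake_up() under the runqueue lock and wq_worker_sleeping() returning a task to wake, the hooks are now called around schedule() in the worker's own context. That is why struct worker grows the sleeping flag and why wake_up_worker() needs its own idle_list protection. A hedged sketch of the caller side; the matching sched/core.c hunk lives elsewhere in this patch, and the function names here are placeholders:

    static void worker_sleep_hook_sketch(struct task_struct *tsk)
    {
            if (tsk->flags & PF_WQ_WORKER)
                    wq_worker_sleeping(tsk);        /* before __schedule() */
    }

    static void worker_wakeup_hook_sketch(struct task_struct *tsk)
    {
            if (tsk->flags & PF_WQ_WORKER)
                    wq_worker_running(tsk);         /* after returning from schedule() */
    }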
29690 diff --git a/lib/Kconfig b/lib/Kconfig
29691 index 1a48744253d7..f75de578cca8 100644
29692 --- a/lib/Kconfig
29693 +++ b/lib/Kconfig
29694 @@ -397,6 +397,7 @@ config CHECK_SIGNATURE
29696  config CPUMASK_OFFSTACK
29697         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
29698 +       depends on !PREEMPT_RT_FULL
29699         help
29700           Use dynamic allocation for cpumask_var_t, instead of putting
29701           them on the stack.  This is a bit more expensive, but avoids
29702 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
29703 index 547f7f923dbc..8fcdbc2fc6d0 100644
29704 --- a/lib/debugobjects.c
29705 +++ b/lib/debugobjects.c
29706 @@ -309,7 +309,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
29707         struct debug_obj *obj;
29708         unsigned long flags;
29710 -       fill_pool();
29711 +#ifdef CONFIG_PREEMPT_RT_FULL
29712 +       if (preempt_count() == 0 && !irqs_disabled())
29713 +#endif
29714 +               fill_pool();
29716         db = get_bucket((unsigned long) addr);
29718 diff --git a/lib/idr.c b/lib/idr.c
29719 index 6098336df267..9decbe914595 100644
29720 --- a/lib/idr.c
29721 +++ b/lib/idr.c
29722 @@ -30,6 +30,7 @@
29723  #include <linux/idr.h>
29724  #include <linux/spinlock.h>
29725  #include <linux/percpu.h>
29726 +#include <linux/locallock.h>
29728  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
29729  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
29730 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
29731  static DEFINE_PER_CPU(int, idr_preload_cnt);
29732  static DEFINE_SPINLOCK(simple_ida_lock);
29734 +#ifdef CONFIG_PREEMPT_RT_FULL
29735 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
29737 +static inline void idr_preload_lock(void)
29739 +       local_lock(idr_lock);
29742 +static inline void idr_preload_unlock(void)
29744 +       local_unlock(idr_lock);
29747 +void idr_preload_end(void)
29749 +       idr_preload_unlock();
29751 +EXPORT_SYMBOL(idr_preload_end);
29752 +#else
29753 +static inline void idr_preload_lock(void)
29755 +       preempt_disable();
29758 +static inline void idr_preload_unlock(void)
29760 +       preempt_enable();
29762 +#endif
29765  /* the maximum ID which can be allocated given idr->layers */
29766  static int idr_max(int layers)
29768 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
29769          * context.  See idr_preload() for details.
29770          */
29771         if (!in_interrupt()) {
29772 -               preempt_disable();
29773 +               idr_preload_lock();
29774                 new = __this_cpu_read(idr_preload_head);
29775                 if (new) {
29776                         __this_cpu_write(idr_preload_head, new->ary[0]);
29777                         __this_cpu_dec(idr_preload_cnt);
29778                         new->ary[0] = NULL;
29779                 }
29780 -               preempt_enable();
29781 +               idr_preload_unlock();
29782                 if (new)
29783                         return new;
29784         }
29785 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
29786         idr_mark_full(pa, id);
29790  /**
29791   * idr_preload - preload for idr_alloc()
29792   * @gfp_mask: allocation mask to use for preloading
29793 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
29794         WARN_ON_ONCE(in_interrupt());
29795         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
29797 -       preempt_disable();
29798 +       idr_preload_lock();
29800         /*
29801          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
29802 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
29803         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
29804                 struct idr_layer *new;
29806 -               preempt_enable();
29807 +               idr_preload_unlock();
29808                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
29809 -               preempt_disable();
29810 +               idr_preload_lock();
29811                 if (!new)
29812                         break;
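On RT a bare preempt_disable() cannot pin the per-CPU preload cache, because the section has to stay preemptible; preloading is therefore wrapped in a local lock and idr_preload_end() becomes a real exported function instead of a header-inline preempt_enable(). Callers keep the usual pairing; an illustrative user with placeholder names (my_idr, my_lock, obj):

    static int assign_id(struct idr *my_idr, spinlock_t *my_lock, void *obj)
    {
            int id;

            idr_preload(GFP_KERNEL);        /* takes idr_lock on RT, else disables preemption */
            spin_lock(my_lock);
            id = idr_alloc(my_idr, obj, 0, 0, GFP_NOWAIT);
            spin_unlock(my_lock);
            idr_preload_end();              /* drops idr_lock / re-enables preemption */

            return id;
    }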
29814 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
29815 index 872a15a2a637..b93a6103fa4d 100644
29816 --- a/lib/locking-selftest.c
29817 +++ b/lib/locking-selftest.c
29818 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
29819  #include "locking-selftest-spin-hardirq.h"
29820  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
29822 +#ifndef CONFIG_PREEMPT_RT_FULL
29824  #include "locking-selftest-rlock-hardirq.h"
29825  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
29827 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
29828  #include "locking-selftest-wlock-softirq.h"
29829  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
29831 +#endif
29833  #undef E1
29834  #undef E2
29836 +#ifndef CONFIG_PREEMPT_RT_FULL
29837  /*
29838   * Enabling hardirqs with a softirq-safe lock held:
29839   */
29840 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
29841  #undef E1
29842  #undef E2
29844 +#endif
29846  /*
29847   * Enabling irqs with an irq-safe lock held:
29848   */
29849 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
29850  #include "locking-selftest-spin-hardirq.h"
29851  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
29853 +#ifndef CONFIG_PREEMPT_RT_FULL
29855  #include "locking-selftest-rlock-hardirq.h"
29856  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
29858 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
29859  #include "locking-selftest-wlock-softirq.h"
29860  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
29862 +#endif
29864  #undef E1
29865  #undef E2
29867 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
29868  #include "locking-selftest-spin-hardirq.h"
29869  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
29871 +#ifndef CONFIG_PREEMPT_RT_FULL
29873  #include "locking-selftest-rlock-hardirq.h"
29874  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
29876 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
29877  #include "locking-selftest-wlock-softirq.h"
29878  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
29880 +#endif
29882  #undef E1
29883  #undef E2
29884  #undef E3
29885 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
29886  #include "locking-selftest-spin-hardirq.h"
29887  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
29889 +#ifndef CONFIG_PREEMPT_RT_FULL
29891  #include "locking-selftest-rlock-hardirq.h"
29892  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
29894 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
29895  #include "locking-selftest-wlock-softirq.h"
29896  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
29898 +#endif
29900  #undef E1
29901  #undef E2
29902  #undef E3
29904 +#ifndef CONFIG_PREEMPT_RT_FULL
29906  /*
29907   * read-lock / write-lock irq inversion.
29908   *
29909 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
29910  #undef E2
29911  #undef E3
29913 +#endif
29915 +#ifndef CONFIG_PREEMPT_RT_FULL
29917  /*
29918   * read-lock / write-lock recursion that is actually safe.
29919   */
29920 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
29921  #undef E2
29922  #undef E3
29924 +#endif
29926  /*
29927   * read-lock / write-lock recursion that is unsafe.
29928   */
29929 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
29931         printk("  --------------------------------------------------------------------------\n");
29933 +#ifndef CONFIG_PREEMPT_RT_FULL
29934         /*
29935          * irq-context testcases:
29936          */
29937 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
29939         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
29940  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
29941 +#else
29942 +       /* On -rt, we only do hardirq context test for raw spinlock */
29943 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
29944 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
29946 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
29947 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
29949 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
29950 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
29951 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
29952 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
29953 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
29954 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
29956 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
29957 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
29958 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
29959 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
29960 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
29961 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
29962 +#endif
29964         ww_tests();
29966 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
29967 index 6d40944960de..822a2c027e72 100644
29968 --- a/lib/percpu_ida.c
29969 +++ b/lib/percpu_ida.c
29970 @@ -26,6 +26,9 @@
29971  #include <linux/string.h>
29972  #include <linux/spinlock.h>
29973  #include <linux/percpu_ida.h>
29974 +#include <linux/locallock.h>
29976 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
29978  struct percpu_ida_cpu {
29979         /*
29980 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
29981         unsigned long flags;
29982         int tag;
29984 -       local_irq_save(flags);
29985 +       local_lock_irqsave(irq_off_lock, flags);
29986         tags = this_cpu_ptr(pool->tag_cpu);
29988         /* Fastpath */
29989         tag = alloc_local_tag(tags);
29990         if (likely(tag >= 0)) {
29991 -               local_irq_restore(flags);
29992 +               local_unlock_irqrestore(irq_off_lock, flags);
29993                 return tag;
29994         }
29996 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
29998                 if (!tags->nr_free)
29999                         alloc_global_tags(pool, tags);
30001                 if (!tags->nr_free)
30002                         steal_tags(pool, tags);
30004 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
30005                 }
30007                 spin_unlock(&pool->lock);
30008 -               local_irq_restore(flags);
30009 +               local_unlock_irqrestore(irq_off_lock, flags);
30011                 if (tag >= 0 || state == TASK_RUNNING)
30012                         break;
30013 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
30015                 schedule();
30017 -               local_irq_save(flags);
30018 +               local_lock_irqsave(irq_off_lock, flags);
30019                 tags = this_cpu_ptr(pool->tag_cpu);
30020         }
30021         if (state != TASK_RUNNING)
30022 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
30024         BUG_ON(tag >= pool->nr_tags);
30026 -       local_irq_save(flags);
30027 +       local_lock_irqsave(irq_off_lock, flags);
30028         tags = this_cpu_ptr(pool->tag_cpu);
30030         spin_lock(&tags->lock);
30031 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
30032                 spin_unlock(&pool->lock);
30033         }
30035 -       local_irq_restore(flags);
30036 +       local_unlock_irqrestore(irq_off_lock, flags);
30038  EXPORT_SYMBOL_GPL(percpu_ida_free);
30040 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
30041         struct percpu_ida_cpu *remote;
30042         unsigned cpu, i, err = 0;
30044 -       local_irq_save(flags);
30045 +       local_lock_irqsave(irq_off_lock, flags);
30046         for_each_possible_cpu(cpu) {
30047                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
30048                 spin_lock(&remote->lock);
30049 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
30050         }
30051         spin_unlock(&pool->lock);
30052  out:
30053 -       local_irq_restore(flags);
30054 +       local_unlock_irqrestore(irq_off_lock, flags);
30055         return err;
30057  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
30058 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
30059 index 6b79e9026e24..44bf36a396a9 100644
30060 --- a/lib/radix-tree.c
30061 +++ b/lib/radix-tree.c
30062 @@ -34,7 +34,7 @@
30063  #include <linux/bitops.h>
30064  #include <linux/rcupdate.h>
30065  #include <linux/preempt.h>             /* in_interrupt() */
30067 +#include <linux/locallock.h>
30069  /*
30070   * The height_to_maxindex array needs to be one deeper than the maximum
30071 @@ -69,6 +69,7 @@ struct radix_tree_preload {
30072         struct radix_tree_node *nodes;
30073  };
30074  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
30075 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
30077  static inline void *ptr_to_indirect(void *ptr)
30079 @@ -196,13 +197,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
30080                  * succeed in getting a node here (and never reach
30081                  * kmem_cache_alloc)
30082                  */
30083 -               rtp = this_cpu_ptr(&radix_tree_preloads);
30084 +               rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
30085                 if (rtp->nr) {
30086                         ret = rtp->nodes;
30087                         rtp->nodes = ret->private_data;
30088                         ret->private_data = NULL;
30089                         rtp->nr--;
30090                 }
30091 +               put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
30092                 /*
30093                  * Update the allocation stack trace as this is more useful
30094                  * for debugging.
30095 @@ -257,14 +259,14 @@ static int __radix_tree_preload(gfp_t gfp_mask)
30096         struct radix_tree_node *node;
30097         int ret = -ENOMEM;
30099 -       preempt_disable();
30100 +       local_lock(radix_tree_preloads_lock);
30101         rtp = this_cpu_ptr(&radix_tree_preloads);
30102         while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
30103 -               preempt_enable();
30104 +               local_unlock(radix_tree_preloads_lock);
30105                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
30106                 if (node == NULL)
30107                         goto out;
30108 -               preempt_disable();
30109 +               local_lock(radix_tree_preloads_lock);
30110                 rtp = this_cpu_ptr(&radix_tree_preloads);
30111                 if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
30112                         node->private_data = rtp->nodes;
30113 @@ -306,11 +308,17 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
30114         if (gfpflags_allow_blocking(gfp_mask))
30115                 return __radix_tree_preload(gfp_mask);
30116         /* Preloading doesn't help anything with this gfp mask, skip it */
30117 -       preempt_disable();
30118 +       local_lock(radix_tree_preloads_lock);
30119         return 0;
30121  EXPORT_SYMBOL(radix_tree_maybe_preload);
30123 +void radix_tree_preload_end(void)
30125 +       local_unlock(radix_tree_preloads_lock);
30127 +EXPORT_SYMBOL(radix_tree_preload_end);
30129  /*
30130   *     Return the maximum key which can be store into a
30131   *     radix tree with height HEIGHT.
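The radix-tree preload cache gets the same treatment as the IDR above: the per-CPU node cache is guarded by radix_tree_preloads_lock, and radix_tree_preload_end(), previously a header-inline preempt_enable(), becomes an out-of-line exported function so it can drop the local lock. Note that radix_tree_maybe_preload() still returns with the lock held even when it decides not to preload, so the end call must always be paired. An illustrative caller with placeholder names:

    static DEFINE_SPINLOCK(cache_lock);             /* placeholder tree lock */

    static int cache_add(struct radix_tree_root *root, unsigned long index, void *item)
    {
            int err = radix_tree_maybe_preload(GFP_KERNEL);

            if (err)
                    return err;
            spin_lock(&cache_lock);
            err = radix_tree_insert(root, index, item);
            spin_unlock(&cache_lock);
            radix_tree_preload_end();               /* now drops radix_tree_preloads_lock */
            return err;
    }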
30132 diff --git a/lib/rbtree.c b/lib/rbtree.c
30133 index 1356454e36de..d15d6c4327f1 100644
30134 --- a/lib/rbtree.c
30135 +++ b/lib/rbtree.c
30136 @@ -23,6 +23,7 @@
30138  #include <linux/rbtree_augmented.h>
30139  #include <linux/export.h>
30140 +#include <linux/rcupdate.h>
30142  /*
30143   * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
30144 @@ -590,3 +591,13 @@ struct rb_node *rb_first_postorder(const struct rb_root *root)
30145         return rb_left_deepest_node(root->rb_node);
30147  EXPORT_SYMBOL(rb_first_postorder);
30149 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
30150 +                                   struct rb_node **rb_link)
30152 +       node->__rb_parent_color = (unsigned long)parent;
30153 +       node->rb_left = node->rb_right = NULL;
30155 +       rcu_assign_pointer(*rb_link, node);
30157 +EXPORT_SYMBOL(rb_link_node_rcu);
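rb_link_node_rcu() is the RCU-aware counterpart of rb_link_node(): it publishes the new node with rcu_assign_pointer(), so a reader walking child pointers under RCU never observes the node before its parent pointer, colour and NULL children are set up. It only covers the linking step; rebalancing in rb_insert_color() still assumes readers are excluded or tolerant, as in latched-tree style usage. A writer-side sketch with placeholder types:

    struct item {                           /* placeholder */
            struct rb_node node;
            unsigned long  key;
    };

    static void item_insert(struct rb_root *root, struct item *new)
    {
            struct rb_node **link = &root->rb_node, *parent = NULL;

            while (*link) {
                    struct item *cur = rb_entry(*link, struct item, node);

                    parent = *link;
                    link = new->key < cur->key ? &(*link)->rb_left
                                               : &(*link)->rb_right;
            }
            rb_link_node_rcu(&new->node, parent, link);     /* rcu_assign_pointer() publish */
            rb_insert_color(&new->node, root);
    }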
30158 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
30159 index bafa9933fa76..ebe3b7edd086 100644
30160 --- a/lib/scatterlist.c
30161 +++ b/lib/scatterlist.c
30162 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
30163                         flush_kernel_dcache_page(miter->page);
30165                 if (miter->__flags & SG_MITER_ATOMIC) {
30166 -                       WARN_ON_ONCE(preemptible());
30167 +                       WARN_ON_ONCE(!pagefault_disabled());
30168                         kunmap_atomic(miter->addr);
30169                 } else
30170                         kunmap(miter->page);
30171 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
30172         if (!sg_miter_skip(&miter, skip))
30173                 return false;
30175 -       local_irq_save(flags);
30176 +       local_irq_save_nort(flags);
30178         while (sg_miter_next(&miter) && offset < buflen) {
30179                 unsigned int len;
30180 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
30182         sg_miter_stop(&miter);
30184 -       local_irq_restore(flags);
30185 +       local_irq_restore_nort(flags);
30186         return offset;
30188  EXPORT_SYMBOL(sg_copy_buffer);
30189 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
30190 index 1afec32de6f2..11fa431046a8 100644
30191 --- a/lib/smp_processor_id.c
30192 +++ b/lib/smp_processor_id.c
30193 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
30194         if (!printk_ratelimit())
30195                 goto out_enable;
30197 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
30198 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
30199 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
30200 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
30201 +               current->comm, current->pid);
30203         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
30204         dump_stack();
30205 diff --git a/mm/Kconfig b/mm/Kconfig
30206 index 97a4e06b15c0..9614351e68b8 100644
30207 --- a/mm/Kconfig
30208 +++ b/mm/Kconfig
30209 @@ -392,7 +392,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
30211  config TRANSPARENT_HUGEPAGE
30212         bool "Transparent Hugepage Support"
30213 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
30214 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
30215         select COMPACTION
30216         help
30217           Transparent Hugepages allows the kernel to use huge pages and
30218 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
30219 index a988d4ef39da..f2c2ee1d5191 100644
30220 --- a/mm/backing-dev.c
30221 +++ b/mm/backing-dev.c
30222 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
30224         unsigned long flags;
30226 -       local_irq_save(flags);
30227 +       local_irq_save_nort(flags);
30228         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
30229 -               local_irq_restore(flags);
30230 +               local_irq_restore_nort(flags);
30231                 return;
30232         }
30234 diff --git a/mm/compaction.c b/mm/compaction.c
30235 index b6f145ed7ae1..03cac7f6768a 100644
30236 --- a/mm/compaction.c
30237 +++ b/mm/compaction.c
30238 @@ -1450,10 +1450,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
30239                                 cc->migrate_pfn & ~((1UL << cc->order) - 1);
30241                         if (cc->last_migrated_pfn < current_block_start) {
30242 -                               cpu = get_cpu();
30243 +                               cpu = get_cpu_light();
30244 +                               local_lock_irq(swapvec_lock);
30245                                 lru_add_drain_cpu(cpu);
30246 +                               local_unlock_irq(swapvec_lock);
30247                                 drain_local_pages(zone);
30248 -                               put_cpu();
30249 +                               put_cpu_light();
30250                                 /* No more flushing until we migrate again */
30251                                 cc->last_migrated_pfn = 0;
30252                         }
30253 diff --git a/mm/filemap.c b/mm/filemap.c
30254 index 69f75c77c098..b203169ca0b4 100644
30255 --- a/mm/filemap.c
30256 +++ b/mm/filemap.c
30257 @@ -144,9 +144,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
30258                  * node->private_list is protected by
30259                  * mapping->tree_lock.
30260                  */
30261 -               if (!list_empty(&node->private_list))
30262 -                       list_lru_del(&workingset_shadow_nodes,
30263 +               if (!list_empty(&node->private_list)) {
30264 +                       local_lock(workingset_shadow_lock);
30265 +                       list_lru_del(&__workingset_shadow_nodes,
30266                                      &node->private_list);
30267 +                       local_unlock(workingset_shadow_lock);
30268 +               }
30269         }
30270         return 0;
30272 @@ -218,7 +221,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
30273         if (!workingset_node_pages(node) &&
30274             list_empty(&node->private_list)) {
30275                 node->private_data = mapping;
30276 -               list_lru_add(&workingset_shadow_nodes, &node->private_list);
30277 +               local_lock(workingset_shadow_lock);
30278 +               list_lru_add(&__workingset_shadow_nodes, &node->private_list);
30279 +               local_unlock(workingset_shadow_lock);
30280         }
30283 diff --git a/mm/highmem.c b/mm/highmem.c
30284 index 123bcd3ed4f2..16e8cf26d38a 100644
30285 --- a/mm/highmem.c
30286 +++ b/mm/highmem.c
30287 @@ -29,10 +29,11 @@
30288  #include <linux/kgdb.h>
30289  #include <asm/tlbflush.h>
30292 +#ifndef CONFIG_PREEMPT_RT_FULL
30293  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
30294  DEFINE_PER_CPU(int, __kmap_atomic_idx);
30295  #endif
30296 +#endif
30298  /*
30299   * Virtual_count is not a pure "count".
30300 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
30301  unsigned long totalhigh_pages __read_mostly;
30302  EXPORT_SYMBOL(totalhigh_pages);
30305 +#ifndef CONFIG_PREEMPT_RT_FULL
30306  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
30307 +#endif
30309  unsigned int nr_free_highpages (void)
30311 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
30312 index e25b93a4267d..1c619267d9da 100644
30313 --- a/mm/memcontrol.c
30314 +++ b/mm/memcontrol.c
30315 @@ -67,6 +67,8 @@
30316  #include <net/sock.h>
30317  #include <net/ip.h>
30318  #include <net/tcp_memcontrol.h>
30319 +#include <linux/locallock.h>
30321  #include "slab.h"
30323  #include <asm/uaccess.h>
30324 @@ -87,6 +89,7 @@ int do_swap_account __read_mostly;
30325  #define do_swap_account                0
30326  #endif
30328 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
30329  static const char * const mem_cgroup_stat_names[] = {
30330         "cache",
30331         "rss",
30332 @@ -1922,14 +1925,17 @@ static void drain_local_stock(struct work_struct *dummy)
30333   */
30334  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
30336 -       struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
30337 +       struct memcg_stock_pcp *stock;
30338 +       int cpu = get_cpu_light();
30340 +       stock = &per_cpu(memcg_stock, cpu);
30342         if (stock->cached != memcg) { /* reset if necessary */
30343                 drain_stock(stock);
30344                 stock->cached = memcg;
30345         }
30346         stock->nr_pages += nr_pages;
30347 -       put_cpu_var(memcg_stock);
30348 +       put_cpu_light();
30351  /*
30352 @@ -1945,7 +1951,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
30353                 return;
30354         /* Notify other cpus that system-wide "drain" is running */
30355         get_online_cpus();
30356 -       curcpu = get_cpu();
30357 +       curcpu = get_cpu_light();
30358         for_each_online_cpu(cpu) {
30359                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
30360                 struct mem_cgroup *memcg;
30361 @@ -1962,7 +1968,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
30362                                 schedule_work_on(cpu, &stock->work);
30363                 }
30364         }
30365 -       put_cpu();
30366 +       put_cpu_light();
30367         put_online_cpus();
30368         mutex_unlock(&percpu_charge_mutex);
30370 @@ -4691,12 +4697,12 @@ static int mem_cgroup_move_account(struct page *page,
30372         ret = 0;
30374 -       local_irq_disable();
30375 +       local_lock_irq(event_lock);
30376         mem_cgroup_charge_statistics(to, page, nr_pages);
30377         memcg_check_events(to, page);
30378         mem_cgroup_charge_statistics(from, page, -nr_pages);
30379         memcg_check_events(from, page);
30380 -       local_irq_enable();
30381 +       local_unlock_irq(event_lock);
30382  out_unlock:
30383         unlock_page(page);
30384  out:
30385 @@ -5486,10 +5492,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
30386                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
30387         }
30389 -       local_irq_disable();
30390 +       local_lock_irq(event_lock);
30391         mem_cgroup_charge_statistics(memcg, page, nr_pages);
30392         memcg_check_events(memcg, page);
30393 -       local_irq_enable();
30394 +       local_unlock_irq(event_lock);
30396         if (do_swap_account && PageSwapCache(page)) {
30397                 swp_entry_t entry = { .val = page_private(page) };
30398 @@ -5545,14 +5551,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
30399                 memcg_oom_recover(memcg);
30400         }
30402 -       local_irq_save(flags);
30403 +       local_lock_irqsave(event_lock, flags);
30404         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
30405         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
30406         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
30407         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
30408         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
30409         memcg_check_events(memcg, dummy_page);
30410 -       local_irq_restore(flags);
30411 +       local_unlock_irqrestore(event_lock, flags);
30413         if (!mem_cgroup_is_root(memcg))
30414                 css_put_many(&memcg->css, nr_pages);
30415 @@ -5762,6 +5768,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
30417         struct mem_cgroup *memcg, *swap_memcg;
30418         unsigned short oldid;
30419 +       unsigned long flags;
30421         VM_BUG_ON_PAGE(PageLRU(page), page);
30422         VM_BUG_ON_PAGE(page_count(page), page);
30423 @@ -5802,12 +5809,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
30424          * important here to have the interrupts disabled because it is the
30425          * only synchronisation we have for udpating the per-CPU variables.
30426          */
30427 +       local_lock_irqsave(event_lock, flags);
30428 +#ifndef CONFIG_PREEMPT_RT_BASE
30429         VM_BUG_ON(!irqs_disabled());
30430 +#endif
30431         mem_cgroup_charge_statistics(memcg, page, -1);
30432         memcg_check_events(memcg, page);
30434         if (!mem_cgroup_is_root(memcg))
30435                 css_put(&memcg->css);
30436 +       local_unlock_irqrestore(event_lock, flags);
30439  /**
30440 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
30441 index 6f4d27c5bb32..5cd25c745a8f 100644
30442 --- a/mm/mmu_context.c
30443 +++ b/mm/mmu_context.c
30444 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
30445         struct task_struct *tsk = current;
30447         task_lock(tsk);
30448 +       preempt_disable_rt();
30449         active_mm = tsk->active_mm;
30450         if (active_mm != mm) {
30451                 atomic_inc(&mm->mm_count);
30452 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
30453         }
30454         tsk->mm = mm;
30455         switch_mm(active_mm, mm, tsk);
30456 +       preempt_enable_rt();
30457         task_unlock(tsk);
30458  #ifdef finish_arch_post_lock_switch
30459         finish_arch_post_lock_switch();
30460 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
30461 index 3c70f03d91ec..e1377f157652 100644
30462 --- a/mm/page_alloc.c
30463 +++ b/mm/page_alloc.c
30464 @@ -60,6 +60,7 @@
30465  #include <linux/page_ext.h>
30466  #include <linux/hugetlb.h>
30467  #include <linux/sched/rt.h>
30468 +#include <linux/locallock.h>
30469  #include <linux/page_owner.h>
30470  #include <linux/kthread.h>
30472 @@ -264,6 +265,18 @@ EXPORT_SYMBOL(nr_node_ids);
30473  EXPORT_SYMBOL(nr_online_nodes);
30474  #endif
30476 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
30478 +#ifdef CONFIG_PREEMPT_RT_BASE
30479 +# define cpu_lock_irqsave(cpu, flags)          \
30480 +       local_lock_irqsave_on(pa_lock, flags, cpu)
30481 +# define cpu_unlock_irqrestore(cpu, flags)     \
30482 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
30483 +#else
30484 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
30485 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
30486 +#endif
30488  int page_group_by_mobility_disabled __read_mostly;
30490  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
30491 @@ -820,7 +833,7 @@ static inline int free_pages_check(struct page *page)
30494  /*
30495 - * Frees a number of pages from the PCP lists
30496 + * Frees a number of pages which have been collected from the pcp lists.
30497   * Assumes all pages on list are in same zone, and of same order.
30498   * count is the number of pages to free.
30499   *
30500 @@ -831,18 +844,53 @@ static inline int free_pages_check(struct page *page)
30501   * pinned" detection logic.
30502   */
30503  static void free_pcppages_bulk(struct zone *zone, int count,
30504 -                                       struct per_cpu_pages *pcp)
30505 +                              struct list_head *list)
30507 -       int migratetype = 0;
30508 -       int batch_free = 0;
30509         int to_free = count;
30510         unsigned long nr_scanned;
30511 +       unsigned long flags;
30513 +       spin_lock_irqsave(&zone->lock, flags);
30515 -       spin_lock(&zone->lock);
30516         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
30517         if (nr_scanned)
30518                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
30520 +       while (!list_empty(list)) {
30521 +               struct page *page = list_first_entry(list, struct page, lru);
30522 +               int mt; /* migratetype of the to-be-freed page */
30524 +               /* must delete as __free_one_page list manipulates */
30525 +               list_del(&page->lru);
30527 +               mt = get_pcppage_migratetype(page);
30528 +               /* MIGRATE_ISOLATE page should not go to pcplists */
30529 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
30530 +               /* Pageblock could have been isolated meanwhile */
30531 +               if (unlikely(has_isolate_pageblock(zone)))
30532 +                       mt = get_pageblock_migratetype(page);
30534 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
30535 +               trace_mm_page_pcpu_drain(page, 0, mt);
30536 +               to_free--;
30537 +       }
30538 +       WARN_ON(to_free != 0);
30539 +       spin_unlock_irqrestore(&zone->lock, flags);
30543 + * Moves a number of pages from the PCP lists to free list which
30544 + * is freed outside of the locked region.
30545 + *
30546 + * Assumes all pages on list are in same zone, and of same order.
30547 + * count is the number of pages to free.
30548 + */
30549 +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
30550 +                             struct list_head *dst)
30552 +       int migratetype = 0;
30553 +       int batch_free = 0;
30555         while (to_free) {
30556                 struct page *page;
30557                 struct list_head *list;
30558 @@ -858,7 +906,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
30559                         batch_free++;
30560                         if (++migratetype == MIGRATE_PCPTYPES)
30561                                 migratetype = 0;
30562 -                       list = &pcp->lists[migratetype];
30563 +                       list = &src->lists[migratetype];
30564                 } while (list_empty(list));
30566                 /* This is the only non-empty list. Free them all. */
30567 @@ -866,24 +914,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
30568                         batch_free = to_free;
30570                 do {
30571 -                       int mt; /* migratetype of the to-be-freed page */
30573 -                       page = list_entry(list->prev, struct page, lru);
30574 -                       /* must delete as __free_one_page list manipulates */
30575 +                       page = list_last_entry(list, struct page, lru);
30576                         list_del(&page->lru);
30578 -                       mt = get_pcppage_migratetype(page);
30579 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
30580 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
30581 -                       /* Pageblock could have been isolated meanwhile */
30582 -                       if (unlikely(has_isolate_pageblock(zone)))
30583 -                               mt = get_pageblock_migratetype(page);
30585 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
30586 -                       trace_mm_page_pcpu_drain(page, 0, mt);
30587 +                       list_add(&page->lru, dst);
30588                 } while (--to_free && --batch_free && !list_empty(list));
30589         }
30590 -       spin_unlock(&zone->lock);
30593  static void free_one_page(struct zone *zone,
30594 @@ -892,7 +928,9 @@ static void free_one_page(struct zone *zone,
30595                                 int migratetype)
30597         unsigned long nr_scanned;
30598 -       spin_lock(&zone->lock);
30599 +       unsigned long flags;
30601 +       spin_lock_irqsave(&zone->lock, flags);
30602         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
30603         if (nr_scanned)
30604                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
30605 @@ -902,7 +940,7 @@ static void free_one_page(struct zone *zone,
30606                 migratetype = get_pfnblock_migratetype(page, pfn);
30607         }
30608         __free_one_page(page, pfn, zone, order, migratetype);
30609 -       spin_unlock(&zone->lock);
30610 +       spin_unlock_irqrestore(&zone->lock, flags);
30613  static int free_tail_pages_check(struct page *head_page, struct page *page)
30614 @@ -1053,10 +1091,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
30615                 return;
30617         migratetype = get_pfnblock_migratetype(page, pfn);
30618 -       local_irq_save(flags);
30619 +       local_lock_irqsave(pa_lock, flags);
30620         __count_vm_events(PGFREE, 1 << order);
30621         free_one_page(page_zone(page), page, pfn, order, migratetype);
30622 -       local_irq_restore(flags);
30623 +       local_unlock_irqrestore(pa_lock, flags);
30626  static void __init __free_pages_boot_core(struct page *page,
30627 @@ -1925,16 +1963,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
30628  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
30630         unsigned long flags;
30631 +       LIST_HEAD(dst);
30632         int to_drain, batch;
30634 -       local_irq_save(flags);
30635 +       local_lock_irqsave(pa_lock, flags);
30636         batch = READ_ONCE(pcp->batch);
30637         to_drain = min(pcp->count, batch);
30638         if (to_drain > 0) {
30639 -               free_pcppages_bulk(zone, to_drain, pcp);
30640 +               isolate_pcp_pages(to_drain, pcp, &dst);
30641                 pcp->count -= to_drain;
30642         }
30643 -       local_irq_restore(flags);
30644 +       local_unlock_irqrestore(pa_lock, flags);
30645 +       free_pcppages_bulk(zone, to_drain, &dst);
30647  #endif
30649 @@ -1950,16 +1990,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
30650         unsigned long flags;
30651         struct per_cpu_pageset *pset;
30652         struct per_cpu_pages *pcp;
30653 +       LIST_HEAD(dst);
30654 +       int count;
30656 -       local_irq_save(flags);
30657 +       cpu_lock_irqsave(cpu, flags);
30658         pset = per_cpu_ptr(zone->pageset, cpu);
30660         pcp = &pset->pcp;
30661 -       if (pcp->count) {
30662 -               free_pcppages_bulk(zone, pcp->count, pcp);
30663 +       count = pcp->count;
30664 +       if (count) {
30665 +               isolate_pcp_pages(count, pcp, &dst);
30666                 pcp->count = 0;
30667         }
30668 -       local_irq_restore(flags);
30669 +       cpu_unlock_irqrestore(cpu, flags);
30670 +       if (count)
30671 +               free_pcppages_bulk(zone, count, &dst);
30674  /*
30675 @@ -2045,8 +2090,17 @@ void drain_all_pages(struct zone *zone)
30676                 else
30677                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
30678         }
30679 +#ifndef CONFIG_PREEMPT_RT_BASE
30680         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
30681                                                                 zone, 1);
30682 +#else
30683 +       for_each_cpu(cpu, &cpus_with_pcps) {
30684 +               if (zone)
30685 +                       drain_pages_zone(cpu, zone);
30686 +               else
30687 +                       drain_pages(cpu);
30688 +       }
30689 +#endif
30692  #ifdef CONFIG_HIBERNATION
30693 @@ -2102,7 +2156,7 @@ void free_hot_cold_page(struct page *page, bool cold)
30695         migratetype = get_pfnblock_migratetype(page, pfn);
30696         set_pcppage_migratetype(page, migratetype);
30697 -       local_irq_save(flags);
30698 +       local_lock_irqsave(pa_lock, flags);
30699         __count_vm_event(PGFREE);
30701         /*
30702 @@ -2128,12 +2182,17 @@ void free_hot_cold_page(struct page *page, bool cold)
30703         pcp->count++;
30704         if (pcp->count >= pcp->high) {
30705                 unsigned long batch = READ_ONCE(pcp->batch);
30706 -               free_pcppages_bulk(zone, batch, pcp);
30707 +               LIST_HEAD(dst);
30709 +               isolate_pcp_pages(batch, pcp, &dst);
30710                 pcp->count -= batch;
30711 +               local_unlock_irqrestore(pa_lock, flags);
30712 +               free_pcppages_bulk(zone, batch, &dst);
30713 +               return;
30714         }
30716  out:
30717 -       local_irq_restore(flags);
30718 +       local_unlock_irqrestore(pa_lock, flags);
30721  /*
30722 @@ -2268,7 +2327,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
30723                 struct per_cpu_pages *pcp;
30724                 struct list_head *list;
30726 -               local_irq_save(flags);
30727 +               local_lock_irqsave(pa_lock, flags);
30728                 pcp = &this_cpu_ptr(zone->pageset)->pcp;
30729                 list = &pcp->lists[migratetype];
30730                 if (list_empty(list)) {
30731 @@ -2300,7 +2359,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
30732                          */
30733                         WARN_ON_ONCE(order > 1);
30734                 }
30735 -               spin_lock_irqsave(&zone->lock, flags);
30736 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
30738                 page = NULL;
30739                 if (alloc_flags & ALLOC_HARDER) {
30740 @@ -2310,11 +2369,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
30741                 }
30742                 if (!page)
30743                         page = __rmqueue(zone, order, migratetype, gfp_flags);
30744 -               spin_unlock(&zone->lock);
30745 -               if (!page)
30746 +               if (!page) {
30747 +                       spin_unlock(&zone->lock);
30748                         goto failed;
30749 +               }
30750                 __mod_zone_freepage_state(zone, -(1 << order),
30751                                           get_pcppage_migratetype(page));
30752 +               spin_unlock(&zone->lock);
30753         }
30755         __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
30756 @@ -2324,13 +2385,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
30758         __count_zone_vm_events(PGALLOC, zone, 1 << order);
30759         zone_statistics(preferred_zone, zone, gfp_flags);
30760 -       local_irq_restore(flags);
30761 +       local_unlock_irqrestore(pa_lock, flags);
30763         VM_BUG_ON_PAGE(bad_range(zone, page), page);
30764         return page;
30766  failed:
30767 -       local_irq_restore(flags);
30768 +       local_unlock_irqrestore(pa_lock, flags);
30769         return NULL;
30772 @@ -5999,6 +6060,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
30773  void __init page_alloc_init(void)
30775         hotcpu_notifier(page_alloc_cpu_notify, 0);
30776 +       local_irq_lock_init(pa_lock);
30779  /*
30780 @@ -6893,7 +6955,7 @@ void zone_pcp_reset(struct zone *zone)
30781         struct per_cpu_pageset *pset;
30783         /* avoid races with drain_pages()  */
30784 -       local_irq_save(flags);
30785 +       local_lock_irqsave(pa_lock, flags);
30786         if (zone->pageset != &boot_pageset) {
30787                 for_each_online_cpu(cpu) {
30788                         pset = per_cpu_ptr(zone->pageset, cpu);
30789 @@ -6902,7 +6964,7 @@ void zone_pcp_reset(struct zone *zone)
30790                 free_percpu(zone->pageset);
30791                 zone->pageset = &boot_pageset;
30792         }
30793 -       local_irq_restore(flags);
30794 +       local_unlock_irqrestore(pa_lock, flags);
30797  #ifdef CONFIG_MEMORY_HOTREMOVE
30798 diff --git a/mm/percpu.c b/mm/percpu.c
30799 index ef6353f0adbd..33ccbac7cdb8 100644
30800 --- a/mm/percpu.c
30801 +++ b/mm/percpu.c
30802 @@ -1285,18 +1285,7 @@ void free_percpu(void __percpu *ptr)
30804  EXPORT_SYMBOL_GPL(free_percpu);
30806 -/**
30807 - * is_kernel_percpu_address - test whether address is from static percpu area
30808 - * @addr: address to test
30809 - *
30810 - * Test whether @addr belongs to in-kernel static percpu area.  Module
30811 - * static percpu areas are not considered.  For those, use
30812 - * is_module_percpu_address().
30813 - *
30814 - * RETURNS:
30815 - * %true if @addr is from in-kernel static percpu area, %false otherwise.
30816 - */
30817 -bool is_kernel_percpu_address(unsigned long addr)
30818 +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
30820  #ifdef CONFIG_SMP
30821         const size_t static_size = __per_cpu_end - __per_cpu_start;
30822 @@ -1305,15 +1294,35 @@ bool is_kernel_percpu_address(unsigned long addr)
30824         for_each_possible_cpu(cpu) {
30825                 void *start = per_cpu_ptr(base, cpu);
30826 +               void *va = (void *)addr;
30828 -               if ((void *)addr >= start && (void *)addr < start + static_size)
30829 +               if (va >= start && va < start + static_size) {
30830 +                       if (can_addr)
30831 +                               *can_addr = (unsigned long) (va - start);
30832                         return true;
30833 -        }
30834 +               }
30835 +       }
30836  #endif
30837         /* on UP, can't distinguish from other static vars, always false */
30838         return false;
30841 +/**
30842 + * is_kernel_percpu_address - test whether address is from static percpu area
30843 + * @addr: address to test
30844 + *
30845 + * Test whether @addr belongs to in-kernel static percpu area.  Module
30846 + * static percpu areas are not considered.  For those, use
30847 + * is_module_percpu_address().
30848 + *
30849 + * RETURNS:
30850 + * %true if @addr is from in-kernel static percpu area, %false otherwise.
30851 + */
30852 +bool is_kernel_percpu_address(unsigned long addr)
30854 +       return __is_kernel_percpu_address(addr, NULL);
30857  /**
30858   * per_cpu_ptr_to_phys - convert translated percpu address to physical address
30859   * @addr: the address to be converted to physical address
30860 diff --git a/mm/slab.h b/mm/slab.h
30861 index 7b6087197997..afdc57941179 100644
30862 --- a/mm/slab.h
30863 +++ b/mm/slab.h
30864 @@ -324,7 +324,11 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
30865   * The slab lists for all objects.
30866   */
30867  struct kmem_cache_node {
30868 +#ifdef CONFIG_SLUB
30869 +       raw_spinlock_t list_lock;
30870 +#else
30871         spinlock_t list_lock;
30872 +#endif
30874  #ifdef CONFIG_SLAB
30875         struct list_head slabs_partial; /* partial list first, better asm code */
30876 diff --git a/mm/slub.c b/mm/slub.c
30877 index 4cf3a9c768b1..b183c5271607 100644
30878 --- a/mm/slub.c
30879 +++ b/mm/slub.c
30880 @@ -1075,7 +1075,7 @@ static noinline struct kmem_cache_node *free_debug_processing(
30881         void *object = head;
30882         int cnt = 0;
30884 -       spin_lock_irqsave(&n->list_lock, *flags);
30885 +       raw_spin_lock_irqsave(&n->list_lock, *flags);
30886         slab_lock(page);
30888         if (!check_slab(s, page))
30889 @@ -1136,7 +1136,7 @@ static noinline struct kmem_cache_node *free_debug_processing(
30891  fail:
30892         slab_unlock(page);
30893 -       spin_unlock_irqrestore(&n->list_lock, *flags);
30894 +       raw_spin_unlock_irqrestore(&n->list_lock, *flags);
30895         slab_fix(s, "Object at 0x%p not freed", object);
30896         return NULL;
30898 @@ -1263,6 +1263,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
30900  #endif /* CONFIG_SLUB_DEBUG */
30902 +struct slub_free_list {
30903 +       raw_spinlock_t          lock;
30904 +       struct list_head        list;
30906 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
30908  /*
30909   * Hooks for other subsystems that check memory allocations. In a typical
30910   * production configuration these hooks all should produce no code at all.
30911 @@ -1399,10 +1405,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
30912         gfp_t alloc_gfp;
30913         void *start, *p;
30914         int idx, order;
30915 +       bool enableirqs = false;
30917         flags &= gfp_allowed_mask;
30919         if (gfpflags_allow_blocking(flags))
30920 +               enableirqs = true;
30921 +#ifdef CONFIG_PREEMPT_RT_FULL
30922 +       if (system_state == SYSTEM_RUNNING)
30923 +               enableirqs = true;
30924 +#endif
30925 +       if (enableirqs)
30926                 local_irq_enable();
30928         flags |= s->allocflags;
30929 @@ -1473,7 +1486,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
30930         page->frozen = 1;
30932  out:
30933 -       if (gfpflags_allow_blocking(flags))
30934 +       if (enableirqs)
30935                 local_irq_disable();
30936         if (!page)
30937                 return NULL;
30938 @@ -1529,6 +1542,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
30939         __free_kmem_pages(page, order);
30942 +static void free_delayed(struct list_head *h)
30944 +       while(!list_empty(h)) {
30945 +               struct page *page = list_first_entry(h, struct page, lru);
30947 +               list_del(&page->lru);
30948 +               __free_slab(page->slab_cache, page);
30949 +       }
30952  #define need_reserve_slab_rcu                                          \
30953         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
30955 @@ -1560,6 +1583,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
30956                 }
30958                 call_rcu(head, rcu_free_slab);
30959 +       } else if (irqs_disabled()) {
30960 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
30962 +               raw_spin_lock(&f->lock);
30963 +               list_add(&page->lru, &f->list);
30964 +               raw_spin_unlock(&f->lock);
30965         } else
30966                 __free_slab(s, page);
30968 @@ -1673,7 +1702,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
30969         if (!n || !n->nr_partial)
30970                 return NULL;
30972 -       spin_lock(&n->list_lock);
30973 +       raw_spin_lock(&n->list_lock);
30974         list_for_each_entry_safe(page, page2, &n->partial, lru) {
30975                 void *t;
30977 @@ -1698,7 +1727,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
30978                         break;
30980         }
30981 -       spin_unlock(&n->list_lock);
30982 +       raw_spin_unlock(&n->list_lock);
30983         return object;
30986 @@ -1944,7 +1973,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
30987                          * that acquire_slab() will see a slab page that
30988                          * is frozen
30989                          */
30990 -                       spin_lock(&n->list_lock);
30991 +                       raw_spin_lock(&n->list_lock);
30992                 }
30993         } else {
30994                 m = M_FULL;
30995 @@ -1955,7 +1984,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
30996                          * slabs from diagnostic functions will not see
30997                          * any frozen slabs.
30998                          */
30999 -                       spin_lock(&n->list_lock);
31000 +                       raw_spin_lock(&n->list_lock);
31001                 }
31002         }
31004 @@ -1990,7 +2019,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
31005                 goto redo;
31007         if (lock)
31008 -               spin_unlock(&n->list_lock);
31009 +               raw_spin_unlock(&n->list_lock);
31011         if (m == M_FREE) {
31012                 stat(s, DEACTIVATE_EMPTY);
31013 @@ -2022,10 +2051,10 @@ static void unfreeze_partials(struct kmem_cache *s,
31014                 n2 = get_node(s, page_to_nid(page));
31015                 if (n != n2) {
31016                         if (n)
31017 -                               spin_unlock(&n->list_lock);
31018 +                               raw_spin_unlock(&n->list_lock);
31020                         n = n2;
31021 -                       spin_lock(&n->list_lock);
31022 +                       raw_spin_lock(&n->list_lock);
31023                 }
31025                 do {
31026 @@ -2054,7 +2083,7 @@ static void unfreeze_partials(struct kmem_cache *s,
31027         }
31029         if (n)
31030 -               spin_unlock(&n->list_lock);
31031 +               raw_spin_unlock(&n->list_lock);
31033         while (discard_page) {
31034                 page = discard_page;
31035 @@ -2093,14 +2122,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
31036                         pobjects = oldpage->pobjects;
31037                         pages = oldpage->pages;
31038                         if (drain && pobjects > s->cpu_partial) {
31039 +                               struct slub_free_list *f;
31040                                 unsigned long flags;
31041 +                               LIST_HEAD(tofree);
31042                                 /*
31043                                  * partial array is full. Move the existing
31044                                  * set to the per node partial list.
31045                                  */
31046                                 local_irq_save(flags);
31047                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
31048 +                               f = this_cpu_ptr(&slub_free_list);
31049 +                               raw_spin_lock(&f->lock);
31050 +                               list_splice_init(&f->list, &tofree);
31051 +                               raw_spin_unlock(&f->lock);
31052                                 local_irq_restore(flags);
31053 +                               free_delayed(&tofree);
31054                                 oldpage = NULL;
31055                                 pobjects = 0;
31056                                 pages = 0;
31057 @@ -2172,7 +2208,22 @@ static bool has_cpu_slab(int cpu, void *info)
31059  static void flush_all(struct kmem_cache *s)
31061 +       LIST_HEAD(tofree);
31062 +       int cpu;
31064         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
31065 +       for_each_online_cpu(cpu) {
31066 +               struct slub_free_list *f;
31068 +               if (!has_cpu_slab(cpu, s))
31069 +                       continue;
31071 +               f = &per_cpu(slub_free_list, cpu);
31072 +               raw_spin_lock_irq(&f->lock);
31073 +               list_splice_init(&f->list, &tofree);
31074 +               raw_spin_unlock_irq(&f->lock);
31075 +               free_delayed(&tofree);
31076 +       }
31079  /*
31080 @@ -2208,10 +2259,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
31081         unsigned long x = 0;
31082         struct page *page;
31084 -       spin_lock_irqsave(&n->list_lock, flags);
31085 +       raw_spin_lock_irqsave(&n->list_lock, flags);
31086         list_for_each_entry(page, &n->partial, lru)
31087                 x += get_count(page);
31088 -       spin_unlock_irqrestore(&n->list_lock, flags);
31089 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
31090         return x;
31092  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
31093 @@ -2349,8 +2400,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
31094   * already disabled (which is the case for bulk allocation).
31095   */
31096  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
31097 -                         unsigned long addr, struct kmem_cache_cpu *c)
31098 +                         unsigned long addr, struct kmem_cache_cpu *c,
31099 +                         struct list_head *to_free)
31101 +       struct slub_free_list *f;
31102         void *freelist;
31103         struct page *page;
31105 @@ -2410,6 +2463,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
31106         VM_BUG_ON(!c->page->frozen);
31107         c->freelist = get_freepointer(s, freelist);
31108         c->tid = next_tid(c->tid);
31110 +out:
31111 +       f = this_cpu_ptr(&slub_free_list);
31112 +       raw_spin_lock(&f->lock);
31113 +       list_splice_init(&f->list, to_free);
31114 +       raw_spin_unlock(&f->lock);
31116         return freelist;
31118  new_slab:
31119 @@ -2441,7 +2501,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
31120         deactivate_slab(s, page, get_freepointer(s, freelist));
31121         c->page = NULL;
31122         c->freelist = NULL;
31123 -       return freelist;
31124 +       goto out;
31127  /*
31128 @@ -2453,6 +2513,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
31130         void *p;
31131         unsigned long flags;
31132 +       LIST_HEAD(tofree);
31134         local_irq_save(flags);
31135  #ifdef CONFIG_PREEMPT
31136 @@ -2464,8 +2525,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
31137         c = this_cpu_ptr(s->cpu_slab);
31138  #endif
31140 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
31141 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
31142         local_irq_restore(flags);
31143 +       free_delayed(&tofree);
31144         return p;
31147 @@ -2652,7 +2714,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
31149         do {
31150                 if (unlikely(n)) {
31151 -                       spin_unlock_irqrestore(&n->list_lock, flags);
31152 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
31153                         n = NULL;
31154                 }
31155                 prior = page->freelist;
31156 @@ -2684,7 +2746,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
31157                                  * Otherwise the list_lock will synchronize with
31158                                  * other processors updating the list of slabs.
31159                                  */
31160 -                               spin_lock_irqsave(&n->list_lock, flags);
31161 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
31163                         }
31164                 }
31165 @@ -2726,7 +2788,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
31166                 add_partial(n, page, DEACTIVATE_TO_TAIL);
31167                 stat(s, FREE_ADD_PARTIAL);
31168         }
31169 -       spin_unlock_irqrestore(&n->list_lock, flags);
31170 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
31171         return;
31173  slab_empty:
31174 @@ -2741,7 +2803,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
31175                 remove_full(s, n, page);
31176         }
31178 -       spin_unlock_irqrestore(&n->list_lock, flags);
31179 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
31180         stat(s, FREE_SLAB);
31181         discard_slab(s, page);
31183 @@ -2913,6 +2975,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
31184                           void **p)
31186         struct kmem_cache_cpu *c;
31187 +       LIST_HEAD(to_free);
31188         int i;
31190         /* memcg and kmem_cache debug support */
31191 @@ -2936,7 +2999,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
31192                          * of re-populating per CPU c->freelist
31193                          */
31194                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
31195 -                                           _RET_IP_, c);
31196 +                                           _RET_IP_, c, &to_free);
31197                         if (unlikely(!p[i]))
31198                                 goto error;
31200 @@ -2948,6 +3011,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
31201         }
31202         c->tid = next_tid(c->tid);
31203         local_irq_enable();
31204 +       free_delayed(&to_free);
31206         /* Clear memory outside IRQ disabled fastpath loop */
31207         if (unlikely(flags & __GFP_ZERO)) {
31208 @@ -3095,7 +3159,7 @@ static void
31209  init_kmem_cache_node(struct kmem_cache_node *n)
31211         n->nr_partial = 0;
31212 -       spin_lock_init(&n->list_lock);
31213 +       raw_spin_lock_init(&n->list_lock);
31214         INIT_LIST_HEAD(&n->partial);
31215  #ifdef CONFIG_SLUB_DEBUG
31216         atomic_long_set(&n->nr_slabs, 0);
31217 @@ -3677,7 +3741,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
31218                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
31219                         INIT_LIST_HEAD(promote + i);
31221 -               spin_lock_irqsave(&n->list_lock, flags);
31222 +               raw_spin_lock_irqsave(&n->list_lock, flags);
31224                 /*
31225                  * Build lists of slabs to discard or promote.
31226 @@ -3708,7 +3772,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
31227                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
31228                         list_splice(promote + i, &n->partial);
31230 -               spin_unlock_irqrestore(&n->list_lock, flags);
31231 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
31233                 /* Release empty slabs */
31234                 list_for_each_entry_safe(page, t, &discard, lru)
31235 @@ -3884,6 +3948,12 @@ void __init kmem_cache_init(void)
31237         static __initdata struct kmem_cache boot_kmem_cache,
31238                 boot_kmem_cache_node;
31239 +       int cpu;
31241 +       for_each_possible_cpu(cpu) {
31242 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
31243 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
31244 +       }
31246         if (debug_guardpage_minorder())
31247                 slub_max_order = 0;
31248 @@ -4127,7 +4197,7 @@ static int validate_slab_node(struct kmem_cache *s,
31249         struct page *page;
31250         unsigned long flags;
31252 -       spin_lock_irqsave(&n->list_lock, flags);
31253 +       raw_spin_lock_irqsave(&n->list_lock, flags);
31255         list_for_each_entry(page, &n->partial, lru) {
31256                 validate_slab_slab(s, page, map);
31257 @@ -4149,7 +4219,7 @@ static int validate_slab_node(struct kmem_cache *s,
31258                        s->name, count, atomic_long_read(&n->nr_slabs));
31260  out:
31261 -       spin_unlock_irqrestore(&n->list_lock, flags);
31262 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
31263         return count;
31266 @@ -4337,12 +4407,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
31267                 if (!atomic_long_read(&n->nr_slabs))
31268                         continue;
31270 -               spin_lock_irqsave(&n->list_lock, flags);
31271 +               raw_spin_lock_irqsave(&n->list_lock, flags);
31272                 list_for_each_entry(page, &n->partial, lru)
31273                         process_slab(&t, s, page, alloc, map);
31274                 list_for_each_entry(page, &n->full, lru)
31275                         process_slab(&t, s, page, alloc, map);
31276 -               spin_unlock_irqrestore(&n->list_lock, flags);
31277 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
31278         }
31280         for (i = 0; i < t.count; i++) {
31281 diff --git a/mm/swap.c b/mm/swap.c
31282 index 39395fb549c0..ad16649221d7 100644
31283 --- a/mm/swap.c
31284 +++ b/mm/swap.c
31285 @@ -31,6 +31,7 @@
31286  #include <linux/memcontrol.h>
31287  #include <linux/gfp.h>
31288  #include <linux/uio.h>
31289 +#include <linux/locallock.h>
31290  #include <linux/hugetlb.h>
31291  #include <linux/page_idle.h>
31293 @@ -46,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
31294  static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
31295  static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
31297 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
31298 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
31300  /*
31301   * This path almost never happens for VM activity - pages are normally
31302   * freed via pagevecs.  But it gets used by networking.
31303 @@ -481,11 +485,11 @@ void rotate_reclaimable_page(struct page *page)
31304                 unsigned long flags;
31306                 page_cache_get(page);
31307 -               local_irq_save(flags);
31308 +               local_lock_irqsave(rotate_lock, flags);
31309                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
31310                 if (!pagevec_add(pvec, page))
31311                         pagevec_move_tail(pvec);
31312 -               local_irq_restore(flags);
31313 +               local_unlock_irqrestore(rotate_lock, flags);
31314         }
31317 @@ -536,12 +540,13 @@ static bool need_activate_page_drain(int cpu)
31318  void activate_page(struct page *page)
31320         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
31321 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
31322 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
31323 +                                                      activate_page_pvecs);
31325                 page_cache_get(page);
31326                 if (!pagevec_add(pvec, page))
31327                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
31328 -               put_cpu_var(activate_page_pvecs);
31329 +               put_locked_var(swapvec_lock, activate_page_pvecs);
31330         }
31333 @@ -567,7 +572,7 @@ void activate_page(struct page *page)
31335  static void __lru_cache_activate_page(struct page *page)
31337 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
31338 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
31339         int i;
31341         /*
31342 @@ -589,7 +594,7 @@ static void __lru_cache_activate_page(struct page *page)
31343                 }
31344         }
31346 -       put_cpu_var(lru_add_pvec);
31347 +       put_locked_var(swapvec_lock, lru_add_pvec);
31350  /*
31351 @@ -630,13 +635,13 @@ EXPORT_SYMBOL(mark_page_accessed);
31353  static void __lru_cache_add(struct page *page)
31355 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
31356 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
31358         page_cache_get(page);
31359         if (!pagevec_space(pvec))
31360                 __pagevec_lru_add(pvec);
31361         pagevec_add(pvec, page);
31362 -       put_cpu_var(lru_add_pvec);
31363 +       put_locked_var(swapvec_lock, lru_add_pvec);
31366  /**
31367 @@ -816,9 +821,15 @@ void lru_add_drain_cpu(int cpu)
31368                 unsigned long flags;
31370                 /* No harm done if a racing interrupt already did this */
31371 -               local_irq_save(flags);
31372 +#ifdef CONFIG_PREEMPT_RT_BASE
31373 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
31374 +               pagevec_move_tail(pvec);
31375 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
31376 +#else
31377 +               local_lock_irqsave(rotate_lock, flags);
31378                 pagevec_move_tail(pvec);
31379 -               local_irq_restore(flags);
31380 +               local_unlock_irqrestore(rotate_lock, flags);
31381 +#endif
31382         }
31384         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
31385 @@ -846,26 +857,47 @@ void deactivate_file_page(struct page *page)
31386                 return;
31388         if (likely(get_page_unless_zero(page))) {
31389 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
31390 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
31391 +                                                      lru_deactivate_file_pvecs);
31393                 if (!pagevec_add(pvec, page))
31394                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
31395 -               put_cpu_var(lru_deactivate_file_pvecs);
31396 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
31397         }
31400  void lru_add_drain(void)
31402 -       lru_add_drain_cpu(get_cpu());
31403 -       put_cpu();
31404 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
31405 +       local_unlock_cpu(swapvec_lock);
31409 +#ifdef CONFIG_PREEMPT_RT_BASE
31410 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
31412 +       local_lock_on(swapvec_lock, cpu);
31413 +       lru_add_drain_cpu(cpu);
31414 +       local_unlock_on(swapvec_lock, cpu);
31417 +#else
31419  static void lru_add_drain_per_cpu(struct work_struct *dummy)
31421         lru_add_drain();
31424  static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
31425 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
31427 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
31429 +       INIT_WORK(work, lru_add_drain_per_cpu);
31430 +       schedule_work_on(cpu, work);
31431 +       cpumask_set_cpu(cpu, has_work);
31433 +#endif
31435  void lru_add_drain_all(void)
31437 @@ -878,20 +910,17 @@ void lru_add_drain_all(void)
31438         cpumask_clear(&has_work);
31440         for_each_online_cpu(cpu) {
31441 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
31443                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
31444                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
31445                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
31446 -                   need_activate_page_drain(cpu)) {
31447 -                       INIT_WORK(work, lru_add_drain_per_cpu);
31448 -                       schedule_work_on(cpu, work);
31449 -                       cpumask_set_cpu(cpu, &has_work);
31450 -               }
31451 +                   need_activate_page_drain(cpu))
31452 +                       remote_lru_add_drain(cpu, &has_work);
31453         }
31455 +#ifndef CONFIG_PREEMPT_RT_BASE
31456         for_each_cpu(cpu, &has_work)
31457                 flush_work(&per_cpu(lru_add_drain_work, cpu));
31458 +#endif
31460         put_online_cpus();
31461         mutex_unlock(&lock);
31462 diff --git a/mm/truncate.c b/mm/truncate.c
31463 index f4c8270f7b84..ff2d614eb91d 100644
31464 --- a/mm/truncate.c
31465 +++ b/mm/truncate.c
31466 @@ -56,8 +56,11 @@ static void clear_exceptional_entry(struct address_space *mapping,
31467          * protected by mapping->tree_lock.
31468          */
31469         if (!workingset_node_shadows(node) &&
31470 -           !list_empty(&node->private_list))
31471 -               list_lru_del(&workingset_shadow_nodes, &node->private_list);
31472 +           !list_empty(&node->private_list)) {
31473 +               local_lock(workingset_shadow_lock);
31474 +               list_lru_del(&__workingset_shadow_nodes, &node->private_list);
31475 +               local_unlock(workingset_shadow_lock);
31476 +       }
31477         __radix_tree_delete_node(&mapping->page_tree, node);
31478  unlock:
31479         spin_unlock_irq(&mapping->tree_lock);
31480 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
31481 index 8e3c9c5a3042..68740314ad54 100644
31482 --- a/mm/vmalloc.c
31483 +++ b/mm/vmalloc.c
31484 @@ -821,7 +821,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
31485         struct vmap_block *vb;
31486         struct vmap_area *va;
31487         unsigned long vb_idx;
31488 -       int node, err;
31489 +       int node, err, cpu;
31490         void *vaddr;
31492         node = numa_node_id();
31493 @@ -864,11 +864,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
31494         BUG_ON(err);
31495         radix_tree_preload_end();
31497 -       vbq = &get_cpu_var(vmap_block_queue);
31498 +       cpu = get_cpu_light();
31499 +       vbq = this_cpu_ptr(&vmap_block_queue);
31500         spin_lock(&vbq->lock);
31501         list_add_tail_rcu(&vb->free_list, &vbq->free);
31502         spin_unlock(&vbq->lock);
31503 -       put_cpu_var(vmap_block_queue);
31504 +       put_cpu_light();
31506         return vaddr;
31508 @@ -937,6 +938,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
31509         struct vmap_block *vb;
31510         void *vaddr = NULL;
31511         unsigned int order;
31512 +       int cpu;
31514         BUG_ON(offset_in_page(size));
31515         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
31516 @@ -951,7 +953,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
31517         order = get_order(size);
31519         rcu_read_lock();
31520 -       vbq = &get_cpu_var(vmap_block_queue);
31521 +       cpu = get_cpu_light();
31522 +       vbq = this_cpu_ptr(&vmap_block_queue);
31523         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
31524                 unsigned long pages_off;
31526 @@ -974,7 +977,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
31527                 break;
31528         }
31530 -       put_cpu_var(vmap_block_queue);
31531 +       put_cpu_light();
31532         rcu_read_unlock();
31534         /* Allocate new block if nothing was found */
31535 diff --git a/mm/vmstat.c b/mm/vmstat.c
31536 index 5712cdaae964..71e04bc3fe66 100644
31537 --- a/mm/vmstat.c
31538 +++ b/mm/vmstat.c
31539 @@ -226,6 +226,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
31540         long x;
31541         long t;
31543 +       preempt_disable_rt();
31544         x = delta + __this_cpu_read(*p);
31546         t = __this_cpu_read(pcp->stat_threshold);
31547 @@ -235,6 +236,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
31548                 x = 0;
31549         }
31550         __this_cpu_write(*p, x);
31551 +       preempt_enable_rt();
31553  EXPORT_SYMBOL(__mod_zone_page_state);
31555 @@ -267,6 +269,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
31556         s8 __percpu *p = pcp->vm_stat_diff + item;
31557         s8 v, t;
31559 +       preempt_disable_rt();
31560         v = __this_cpu_inc_return(*p);
31561         t = __this_cpu_read(pcp->stat_threshold);
31562         if (unlikely(v > t)) {
31563 @@ -275,6 +278,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
31564                 zone_page_state_add(v + overstep, zone, item);
31565                 __this_cpu_write(*p, -overstep);
31566         }
31567 +       preempt_enable_rt();
31570  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
31571 @@ -289,6 +293,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
31572         s8 __percpu *p = pcp->vm_stat_diff + item;
31573         s8 v, t;
31575 +       preempt_disable_rt();
31576         v = __this_cpu_dec_return(*p);
31577         t = __this_cpu_read(pcp->stat_threshold);
31578         if (unlikely(v < - t)) {
31579 @@ -297,6 +302,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
31580                 zone_page_state_add(v - overstep, zone, item);
31581                 __this_cpu_write(*p, overstep);
31582         }
31583 +       preempt_enable_rt();
31586  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
31587 diff --git a/mm/workingset.c b/mm/workingset.c
31588 index df66f426fdcf..6db7b243fa0d 100644
31589 --- a/mm/workingset.c
31590 +++ b/mm/workingset.c
31591 @@ -264,7 +264,8 @@ void workingset_activation(struct page *page)
31592   * point where they would still be useful.
31593   */
31595 -struct list_lru workingset_shadow_nodes;
31596 +struct list_lru __workingset_shadow_nodes;
31597 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
31599  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
31600                                         struct shrink_control *sc)
31601 @@ -274,9 +275,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
31602         unsigned long pages;
31604         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
31605 -       local_irq_disable();
31606 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
31607 -       local_irq_enable();
31608 +       local_lock_irq(workingset_shadow_lock);
31609 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
31610 +       local_unlock_irq(workingset_shadow_lock);
31612         pages = node_present_pages(sc->nid);
31613         /*
31614 @@ -361,9 +362,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
31615         spin_unlock(&mapping->tree_lock);
31616         ret = LRU_REMOVED_RETRY;
31617  out:
31618 -       local_irq_enable();
31619 +       local_unlock_irq(workingset_shadow_lock);
31620         cond_resched();
31621 -       local_irq_disable();
31622 +       local_lock_irq(workingset_shadow_lock);
31623         spin_lock(lru_lock);
31624         return ret;
31626 @@ -374,10 +375,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
31627         unsigned long ret;
31629         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
31630 -       local_irq_disable();
31631 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
31632 +       local_lock_irq(workingset_shadow_lock);
31633 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
31634                                     shadow_lru_isolate, NULL);
31635 -       local_irq_enable();
31636 +       local_unlock_irq(workingset_shadow_lock);
31637         return ret;
31640 @@ -398,7 +399,7 @@ static int __init workingset_init(void)
31642         int ret;
31644 -       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
31645 +       ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
31646         if (ret)
31647                 goto err;
31648         ret = register_shrinker(&workingset_shadow_shrinker);
31649 @@ -406,7 +407,7 @@ static int __init workingset_init(void)
31650                 goto err_list_lru;
31651         return 0;
31652  err_list_lru:
31653 -       list_lru_destroy(&workingset_shadow_nodes);
31654 +       list_lru_destroy(&__workingset_shadow_nodes);
31655  err:
31656         return ret;
31658 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
31659 index c1ea19478119..529552c3716d 100644
31660 --- a/mm/zsmalloc.c
31661 +++ b/mm/zsmalloc.c
31662 @@ -64,6 +64,7 @@
31663  #include <linux/debugfs.h>
31664  #include <linux/zsmalloc.h>
31665  #include <linux/zpool.h>
31666 +#include <linux/locallock.h>
31668  /*
31669   * This must be power of 2 and greater than of equal to sizeof(link_free).
31670 @@ -403,6 +404,7 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
31672  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
31673  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
31674 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
31676  static int is_first_page(struct page *page)
31678 @@ -1289,7 +1291,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
31679         class = pool->size_class[class_idx];
31680         off = obj_idx_to_offset(page, obj_idx, class->size);
31682 -       area = &get_cpu_var(zs_map_area);
31683 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
31684         area->vm_mm = mm;
31685         if (off + class->size <= PAGE_SIZE) {
31686                 /* this object is contained entirely within a page */
31687 @@ -1342,7 +1344,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
31689                 __zs_unmap_object(area, pages, off, class->size);
31690         }
31691 -       put_cpu_var(zs_map_area);
31692 +       put_locked_var(zs_map_area_lock, zs_map_area);
31693         unpin_tag(handle);
31695  EXPORT_SYMBOL_GPL(zs_unmap_object);
31696 diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
31697 index c842f40c1173..035a5f6e3de9 100644
31698 --- a/net/bluetooth/hci_sock.c
31699 +++ b/net/bluetooth/hci_sock.c
31700 @@ -213,15 +213,13 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
31703  /* Send frame to sockets with specific channel */
31704 -void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
31705 -                        int flag, struct sock *skip_sk)
31706 +static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
31707 +                                 int flag, struct sock *skip_sk)
31709         struct sock *sk;
31711         BT_DBG("channel %u len %d", channel, skb->len);
31713 -       read_lock(&hci_sk_list.lock);
31715         sk_for_each(sk, &hci_sk_list.head) {
31716                 struct sk_buff *nskb;
31718 @@ -247,6 +245,13 @@ void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
31719                         kfree_skb(nskb);
31720         }
31724 +void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
31725 +                        int flag, struct sock *skip_sk)
31727 +       read_lock(&hci_sk_list.lock);
31728 +       __hci_send_to_channel(channel, skb, flag, skip_sk);
31729         read_unlock(&hci_sk_list.lock);
31732 @@ -299,8 +304,8 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
31733         hdr->index = cpu_to_le16(hdev->id);
31734         hdr->len = cpu_to_le16(skb->len);
31736 -       hci_send_to_channel(HCI_CHANNEL_MONITOR, skb_copy,
31737 -                           HCI_SOCK_TRUSTED, NULL);
31738 +       __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb_copy,
31739 +                             HCI_SOCK_TRUSTED, NULL);
31740         kfree_skb(skb_copy);
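The hci_sock change is a lock/worker split: the socket walk moves into __hci_send_to_channel(), the exported hci_send_to_channel() keeps the original read_lock()/read_unlock() around it, and hci_send_to_monitor() now calls the __-variant directly. The generic shape of that refactor, with invented names and the walk elided:

    #include <linux/list.h>
    #include <linux/skbuff.h>
    #include <linux/spinlock.h>

    struct demo_sock_list {
            struct hlist_head head;
            rwlock_t          lock;
    };

    static struct demo_sock_list demo_list = {
            .lock = __RW_LOCK_UNLOCKED(demo_list.lock),
    };

    /* Does the walk; the caller decides how demo_list.lock is handled. */
    static void __demo_broadcast(struct sk_buff *skb)
    {
            /* ... walk demo_list.head and queue copies of skb ... */
    }

    /* Public entry point keeps the old locking behaviour. */
    static void demo_broadcast(struct sk_buff *skb)
    {
            read_lock(&demo_list.lock);
            __demo_broadcast(skb);
            read_unlock(&demo_list.lock);
    }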
31743 diff --git a/net/core/dev.c b/net/core/dev.c
31744 index 3b67c1e5756f..63614e930907 100644
31745 --- a/net/core/dev.c
31746 +++ b/net/core/dev.c
31747 @@ -186,6 +186,7 @@ static unsigned int napi_gen_id = NR_CPUS;
31748  static DEFINE_HASHTABLE(napi_hash, 8);
31750  static seqcount_t devnet_rename_seq;
31751 +static DEFINE_MUTEX(devnet_rename_mutex);
31753  static inline void dev_base_seq_inc(struct net *net)
31755 @@ -207,14 +208,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
31756  static inline void rps_lock(struct softnet_data *sd)
31758  #ifdef CONFIG_RPS
31759 -       spin_lock(&sd->input_pkt_queue.lock);
31760 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
31761  #endif
31764  static inline void rps_unlock(struct softnet_data *sd)
31766  #ifdef CONFIG_RPS
31767 -       spin_unlock(&sd->input_pkt_queue.lock);
31768 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
31769  #endif
31772 @@ -884,7 +885,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
31773         strcpy(name, dev->name);
31774         rcu_read_unlock();
31775         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
31776 -               cond_resched();
31777 +               mutex_lock(&devnet_rename_mutex);
31778 +               mutex_unlock(&devnet_rename_mutex);
31779                 goto retry;
31780         }
31782 @@ -1153,20 +1155,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
31783         if (dev->flags & IFF_UP)
31784                 return -EBUSY;
31786 -       write_seqcount_begin(&devnet_rename_seq);
31787 +       mutex_lock(&devnet_rename_mutex);
31788 +       __raw_write_seqcount_begin(&devnet_rename_seq);
31790 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
31791 -               write_seqcount_end(&devnet_rename_seq);
31792 -               return 0;
31793 -       }
31794 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
31795 +               goto outunlock;
31797         memcpy(oldname, dev->name, IFNAMSIZ);
31799         err = dev_get_valid_name(net, dev, newname);
31800 -       if (err < 0) {
31801 -               write_seqcount_end(&devnet_rename_seq);
31802 -               return err;
31803 -       }
31804 +       if (err < 0)
31805 +               goto outunlock;
31807         if (oldname[0] && !strchr(oldname, '%'))
31808                 netdev_info(dev, "renamed from %s\n", oldname);
31809 @@ -1179,11 +1178,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
31810         if (ret) {
31811                 memcpy(dev->name, oldname, IFNAMSIZ);
31812                 dev->name_assign_type = old_assign_type;
31813 -               write_seqcount_end(&devnet_rename_seq);
31814 -               return ret;
31815 +               err = ret;
31816 +               goto outunlock;
31817         }
31819 -       write_seqcount_end(&devnet_rename_seq);
31820 +       __raw_write_seqcount_end(&devnet_rename_seq);
31821 +       mutex_unlock(&devnet_rename_mutex);
31823         netdev_adjacent_rename_links(dev, oldname);
31825 @@ -1204,7 +1204,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
31826                 /* err >= 0 after dev_alloc_name() or stores the first errno */
31827                 if (err >= 0) {
31828                         err = ret;
31829 -                       write_seqcount_begin(&devnet_rename_seq);
31830 +                       mutex_lock(&devnet_rename_mutex);
31831 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
31832                         memcpy(dev->name, oldname, IFNAMSIZ);
31833                         memcpy(oldname, newname, IFNAMSIZ);
31834                         dev->name_assign_type = old_assign_type;
31835 @@ -1217,6 +1218,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
31836         }
31838         return err;
31840 +outunlock:
31841 +       __raw_write_seqcount_end(&devnet_rename_seq);
31842 +       mutex_unlock(&devnet_rename_mutex);
31843 +       return err;
31846  /**
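For the rename path, the write side of devnet_rename_seq is now serialized by devnet_rename_mutex, and the counter is bumped with __raw_write_seqcount_begin()/__raw_write_seqcount_end(); readers that hit a retry in netdev_get_name() take and drop the mutex instead of calling cond_resched(), so they block until the in-flight rename finishes rather than spinning against a possibly preempted writer. A compact sketch of that mutex-plus-raw-seqcount pattern; demo_* names are placeholders and the helpers are assumed to be available as they are in this tree:

    #include <linux/seqlock.h>
    #include <linux/mutex.h>

    static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);
    static DEFINE_MUTEX(demo_mutex);
    static int demo_value;

    static void demo_write(int v)
    {
            mutex_lock(&demo_mutex);
            __raw_write_seqcount_begin(&demo_seq);
            demo_value = v;
            __raw_write_seqcount_end(&demo_seq);
            mutex_unlock(&demo_mutex);
    }

    static int demo_read(void)
    {
            unsigned int seq;
            int v;

            do {
                    seq = read_seqcount_begin(&demo_seq);
                    v = demo_value;
                    if (!read_seqcount_retry(&demo_seq, seq))
                            break;
                    /* A writer is in flight: wait for it to finish
                     * instead of spinning or rescheduling blindly. */
                    mutex_lock(&demo_mutex);
                    mutex_unlock(&demo_mutex);
            } while (1);

            return v;
    }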
31847 @@ -2270,6 +2276,7 @@ static inline void __netif_reschedule(struct Qdisc *q)
31848         sd->output_queue_tailp = &q->next_sched;
31849         raise_softirq_irqoff(NET_TX_SOFTIRQ);
31850         local_irq_restore(flags);
31851 +       preempt_check_resched_rt();
31854  void __netif_schedule(struct Qdisc *q)
31855 @@ -2354,6 +2361,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
31856         __this_cpu_write(softnet_data.completion_queue, skb);
31857         raise_softirq_irqoff(NET_TX_SOFTIRQ);
31858         local_irq_restore(flags);
31859 +       preempt_check_resched_rt();
31861  EXPORT_SYMBOL(__dev_kfree_skb_irq);
31863 @@ -2918,7 +2926,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
31864          * This permits __QDISC___STATE_RUNNING owner to get the lock more
31865          * often and dequeue packets faster.
31866          */
31867 +#ifdef CONFIG_PREEMPT_RT_FULL
31868 +       contended = true;
31869 +#else
31870         contended = qdisc_is_running(q);
31871 +#endif
31872         if (unlikely(contended))
31873                 spin_lock(&q->busylock);
31875 @@ -2978,9 +2990,44 @@ static void skb_update_prio(struct sk_buff *skb)
31876  #define skb_update_prio(skb)
31877  #endif
31879 +#ifdef CONFIG_PREEMPT_RT_FULL
31881 +static inline int xmit_rec_read(void)
31883 +       return current->xmit_recursion;
31886 +static inline void xmit_rec_inc(void)
31888 +       current->xmit_recursion++;
31891 +static inline void xmit_rec_dec(void)
31893 +       current->xmit_recursion--;
31896 +#else
31898  DEFINE_PER_CPU(int, xmit_recursion);
31899  EXPORT_SYMBOL(xmit_recursion);
31901 +static inline int xmit_rec_read(void)
31903 +       return __this_cpu_read(xmit_recursion);
31906 +static inline void xmit_rec_inc(void)
31908 +       __this_cpu_inc(xmit_recursion);
31911 +static inline void xmit_rec_dec(void)
31913 +       __this_cpu_dec(xmit_recursion);
31915 +#endif
31917  #define RECURSION_LIMIT 10
31919  /**
31920 @@ -3175,7 +3222,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
31922                 if (txq->xmit_lock_owner != cpu) {
31924 -                       if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
31925 +                       if (xmit_rec_read() > RECURSION_LIMIT)
31926                                 goto recursion_alert;
31928                         skb = validate_xmit_skb(skb, dev);
31929 @@ -3185,9 +3232,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
31930                         HARD_TX_LOCK(dev, txq, cpu);
31932                         if (!netif_xmit_stopped(txq)) {
31933 -                               __this_cpu_inc(xmit_recursion);
31934 +                               xmit_rec_inc();
31935                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
31936 -                               __this_cpu_dec(xmit_recursion);
31937 +                               xmit_rec_dec();
31938                                 if (dev_xmit_complete(rc)) {
31939                                         HARD_TX_UNLOCK(dev, txq);
31940                                         goto out;
31941 @@ -3561,6 +3608,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
31942         rps_unlock(sd);
31944         local_irq_restore(flags);
31945 +       preempt_check_resched_rt();
31947         atomic_long_inc(&skb->dev->rx_dropped);
31948         kfree_skb(skb);
31949 @@ -3579,7 +3627,7 @@ static int netif_rx_internal(struct sk_buff *skb)
31950                 struct rps_dev_flow voidflow, *rflow = &voidflow;
31951                 int cpu;
31953 -               preempt_disable();
31954 +               migrate_disable();
31955                 rcu_read_lock();
31957                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
31958 @@ -3589,13 +3637,13 @@ static int netif_rx_internal(struct sk_buff *skb)
31959                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
31961                 rcu_read_unlock();
31962 -               preempt_enable();
31963 +               migrate_enable();
31964         } else
31965  #endif
31966         {
31967                 unsigned int qtail;
31968 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
31969 -               put_cpu();
31970 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
31971 +               put_cpu_light();
31972         }
31973         return ret;
31975 @@ -3629,16 +3677,44 @@ int netif_rx_ni(struct sk_buff *skb)
31977         trace_netif_rx_ni_entry(skb);
31979 -       preempt_disable();
31980 +       local_bh_disable();
31981         err = netif_rx_internal(skb);
31982 -       if (local_softirq_pending())
31983 -               do_softirq();
31984 -       preempt_enable();
31985 +       local_bh_enable();
31987         return err;
31989  EXPORT_SYMBOL(netif_rx_ni);
31991 +#ifdef CONFIG_PREEMPT_RT_FULL
31993 + * RT runs ksoftirqd as a real time thread and the root_lock is a
31994 + * "sleeping spinlock". If the trylock fails then we can go into an
31995 + * infinite loop when ksoftirqd preempted the task which actually
31996 + * holds the lock, because we requeue q and raise NET_TX softirq
31997 + * causing ksoftirqd to loop forever.
31998 + *
31999 + * It's safe to use spin_lock on RT here as softirqs run in thread
32000 + * context and cannot deadlock against the thread which is holding
32001 + * root_lock.
32002 + *
32003 + * On !RT the trylock might fail, but there we bail out from the
32004 + * softirq loop after 10 attempts which we can't do on RT. And the
32005 + * task holding root_lock cannot be preempted, so the only downside of
32006 + * that trylock is that we need 10 loops to decide that we should have
32007 + * given up in the first one :)
32008 + */
32009 +static inline int take_root_lock(spinlock_t *lock)
32011 +       spin_lock(lock);
32012 +       return 1;
32014 +#else
32015 +static inline int take_root_lock(spinlock_t *lock)
32017 +       return spin_trylock(lock);
32019 +#endif
32021  static void net_tx_action(struct softirq_action *h)
32023         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
32024 @@ -3680,7 +3756,7 @@ static void net_tx_action(struct softirq_action *h)
32025                         head = head->next_sched;
32027                         root_lock = qdisc_lock(q);
32028 -                       if (spin_trylock(root_lock)) {
32029 +                       if (take_root_lock(root_lock)) {
32030                                 smp_mb__before_atomic();
32031                                 clear_bit(__QDISC_STATE_SCHED,
32032                                           &q->state);
32033 @@ -4102,7 +4178,7 @@ static void flush_backlog(void *arg)
32034         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
32035                 if (skb->dev == dev) {
32036                         __skb_unlink(skb, &sd->input_pkt_queue);
32037 -                       kfree_skb(skb);
32038 +                       __skb_queue_tail(&sd->tofree_queue, skb);
32039                         input_queue_head_incr(sd);
32040                 }
32041         }
32042 @@ -4111,10 +4187,13 @@ static void flush_backlog(void *arg)
32043         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
32044                 if (skb->dev == dev) {
32045                         __skb_unlink(skb, &sd->process_queue);
32046 -                       kfree_skb(skb);
32047 +                       __skb_queue_tail(&sd->tofree_queue, skb);
32048                         input_queue_head_incr(sd);
32049                 }
32050         }
32052 +       if (!skb_queue_empty(&sd->tofree_queue))
32053 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
32056  static int napi_gro_complete(struct sk_buff *skb)
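flush_backlog() runs under locking that the rest of this file's diff turns raw and IRQ-off (see the rps_lock() change above and the skb_queue_head_init_raw() conversions at the end), so the unlinked skbs are no longer freed on the spot, presumably because kfree_skb() is not safe in that context on RT. They are parked on the new per-CPU sd->tofree_queue and NET_RX_SOFTIRQ is raised; net_rx_action(), in a hunk further below, splices the queue out and frees them in softirq context. A sketch of that defer-the-free idiom with invented names:

    #include <linux/skbuff.h>
    #include <linux/spinlock.h>

    struct demo_backlog {
            raw_spinlock_t      lock;
            struct sk_buff_head pending;
            struct sk_buff_head tofree;
    };

    /* Runs with interrupts off and ctx->lock held raw: only move the
     * skbs, never free them here. */
    static void demo_flush(struct demo_backlog *ctx)
    {
            struct sk_buff *skb, *tmp;

            raw_spin_lock_irq(&ctx->lock);
            skb_queue_walk_safe(&ctx->pending, skb, tmp) {
                    __skb_unlink(skb, &ctx->pending);
                    __skb_queue_tail(&ctx->tofree, skb);
            }
            raw_spin_unlock_irq(&ctx->lock);
    }

    /* Called later from a context where freeing is fine. */
    static void demo_drain(struct demo_backlog *ctx)
    {
            struct sk_buff_head tofree_q;
            struct sk_buff *skb;

            __skb_queue_head_init(&tofree_q);

            raw_spin_lock_irq(&ctx->lock);
            skb_queue_splice_init(&ctx->tofree, &tofree_q);
            raw_spin_unlock_irq(&ctx->lock);

            while ((skb = __skb_dequeue(&tofree_q)))
                    kfree_skb(skb);
    }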
32057 @@ -4581,6 +4660,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
32058                 sd->rps_ipi_list = NULL;
32060                 local_irq_enable();
32061 +               preempt_check_resched_rt();
32063                 /* Send pending IPI's to kick RPS processing on remote cpus. */
32064                 while (remsd) {
32065 @@ -4594,6 +4674,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
32066         } else
32067  #endif
32068                 local_irq_enable();
32069 +       preempt_check_resched_rt();
32072  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
32073 @@ -4675,9 +4756,11 @@ void __napi_schedule(struct napi_struct *n)
32074         local_irq_save(flags);
32075         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
32076         local_irq_restore(flags);
32077 +       preempt_check_resched_rt();
32079  EXPORT_SYMBOL(__napi_schedule);
32081 +#ifndef CONFIG_PREEMPT_RT_FULL
32082  /**
32083   * __napi_schedule_irqoff - schedule for receive
32084   * @n: entry to schedule
32085 @@ -4689,6 +4772,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
32086         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
32088  EXPORT_SYMBOL(__napi_schedule_irqoff);
32089 +#endif
32091  void __napi_complete(struct napi_struct *n)
32093 @@ -4912,13 +4996,21 @@ static void net_rx_action(struct softirq_action *h)
32094         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
32095         unsigned long time_limit = jiffies + 2;
32096         int budget = netdev_budget;
32097 +       struct sk_buff_head tofree_q;
32098 +       struct sk_buff *skb;
32099         LIST_HEAD(list);
32100         LIST_HEAD(repoll);
32102 +       __skb_queue_head_init(&tofree_q);
32104         local_irq_disable();
32105 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
32106         list_splice_init(&sd->poll_list, &list);
32107         local_irq_enable();
32109 +       while ((skb = __skb_dequeue(&tofree_q)))
32110 +               kfree_skb(skb);
32112         for (;;) {
32113                 struct napi_struct *n;
32115 @@ -4948,7 +5040,7 @@ static void net_rx_action(struct softirq_action *h)
32116         list_splice_tail(&repoll, &list);
32117         list_splice(&list, &sd->poll_list);
32118         if (!list_empty(&sd->poll_list))
32119 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
32120 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
32122         net_rps_action_and_irq_enable(sd);
32124 @@ -7287,7 +7379,7 @@ EXPORT_SYMBOL(free_netdev);
32125  void synchronize_net(void)
32127         might_sleep();
32128 -       if (rtnl_is_locked())
32129 +       if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
32130                 synchronize_rcu_expedited();
32131         else
32132                 synchronize_rcu();
32133 @@ -7528,16 +7620,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
32135         raise_softirq_irqoff(NET_TX_SOFTIRQ);
32136         local_irq_enable();
32137 +       preempt_check_resched_rt();
32139         /* Process offline CPU's input_pkt_queue */
32140         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
32141                 netif_rx_ni(skb);
32142                 input_queue_head_incr(oldsd);
32143         }
32144 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
32145 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
32146                 netif_rx_ni(skb);
32147                 input_queue_head_incr(oldsd);
32148         }
32149 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
32150 +               kfree_skb(skb);
32151 +       }
32153         return NOTIFY_OK;
32155 @@ -7839,8 +7935,9 @@ static int __init net_dev_init(void)
32156         for_each_possible_cpu(i) {
32157                 struct softnet_data *sd = &per_cpu(softnet_data, i);
32159 -               skb_queue_head_init(&sd->input_pkt_queue);
32160 -               skb_queue_head_init(&sd->process_queue);
32161 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
32162 +               skb_queue_head_init_raw(&sd->process_queue);
32163 +               skb_queue_head_init_raw(&sd->tofree_queue);
32164                 INIT_LIST_HEAD(&sd->poll_list);
32165                 sd->output_queue_tailp = &sd->output_queue;
32166  #ifdef CONFIG_RPS
32167 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
32168 index 86b619501350..f5c4897b52a0 100644
32169 --- a/net/core/skbuff.c
32170 +++ b/net/core/skbuff.c
32171 @@ -63,6 +63,7 @@
32172  #include <linux/errqueue.h>
32173  #include <linux/prefetch.h>
32174  #include <linux/if_vlan.h>
32175 +#include <linux/locallock.h>
32177  #include <net/protocol.h>
32178  #include <net/dst.h>
32179 @@ -351,6 +352,8 @@ EXPORT_SYMBOL(build_skb);
32181  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
32182  static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
32183 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
32184 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
32186  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
32188 @@ -358,10 +361,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
32189         unsigned long flags;
32190         void *data;
32192 -       local_irq_save(flags);
32193 +       local_lock_irqsave(netdev_alloc_lock, flags);
32194         nc = this_cpu_ptr(&netdev_alloc_cache);
32195         data = __alloc_page_frag(nc, fragsz, gfp_mask);
32196 -       local_irq_restore(flags);
32197 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
32198         return data;
32201 @@ -380,9 +383,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
32203  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
32205 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
32206 +       struct page_frag_cache *nc;
32207 +       void *data;
32209 -       return __alloc_page_frag(nc, fragsz, gfp_mask);
32210 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
32211 +       data = __alloc_page_frag(nc, fragsz, gfp_mask);
32212 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
32213 +       return data;
32216  void *napi_alloc_frag(unsigned int fragsz)
32217 @@ -429,13 +436,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
32218         if (sk_memalloc_socks())
32219                 gfp_mask |= __GFP_MEMALLOC;
32221 -       local_irq_save(flags);
32222 +       local_lock_irqsave(netdev_alloc_lock, flags);
32224         nc = this_cpu_ptr(&netdev_alloc_cache);
32225         data = __alloc_page_frag(nc, len, gfp_mask);
32226         pfmemalloc = nc->pfmemalloc;
32228 -       local_irq_restore(flags);
32229 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
32231         if (unlikely(!data))
32232                 return NULL;
32233 @@ -476,9 +483,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
32234  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
32235                                  gfp_t gfp_mask)
32237 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
32238 +       struct page_frag_cache *nc;
32239         struct sk_buff *skb;
32240         void *data;
32241 +       bool pfmemalloc;
32243         len += NET_SKB_PAD + NET_IP_ALIGN;
32245 @@ -496,7 +504,11 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
32246         if (sk_memalloc_socks())
32247                 gfp_mask |= __GFP_MEMALLOC;
32249 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
32250         data = __alloc_page_frag(nc, len, gfp_mask);
32251 +       pfmemalloc = nc->pfmemalloc;
32252 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
32254         if (unlikely(!data))
32255                 return NULL;
32257 @@ -507,7 +519,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
32258         }
32260         /* use OR instead of assignment to avoid clearing of bits in mask */
32261 -       if (nc->pfmemalloc)
32262 +       if (pfmemalloc)
32263                 skb->pfmemalloc = 1;
32264         skb->head_frag = 1;
32266 diff --git a/net/core/sock.c b/net/core/sock.c
32267 index cd12cb6fe366..982a06dab369 100644
32268 --- a/net/core/sock.c
32269 +++ b/net/core/sock.c
32270 @@ -2449,12 +2449,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
32271         if (sk->sk_lock.owned)
32272                 __lock_sock(sk);
32273         sk->sk_lock.owned = 1;
32274 -       spin_unlock(&sk->sk_lock.slock);
32275 +       spin_unlock_bh(&sk->sk_lock.slock);
32276         /*
32277          * The sk_lock has mutex_lock() semantics here:
32278          */
32279         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
32280 -       local_bh_enable();
32282  EXPORT_SYMBOL(lock_sock_nested);
32284 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
32285 index 36e26977c908..2c1ce3e80ee4 100644
32286 --- a/net/ipv4/icmp.c
32287 +++ b/net/ipv4/icmp.c
32288 @@ -69,6 +69,7 @@
32289  #include <linux/jiffies.h>
32290  #include <linux/kernel.h>
32291  #include <linux/fcntl.h>
32292 +#include <linux/sysrq.h>
32293  #include <linux/socket.h>
32294  #include <linux/in.h>
32295  #include <linux/inet.h>
32296 @@ -77,6 +78,7 @@
32297  #include <linux/string.h>
32298  #include <linux/netfilter_ipv4.h>
32299  #include <linux/slab.h>
32300 +#include <linux/locallock.h>
32301  #include <net/snmp.h>
32302  #include <net/ip.h>
32303  #include <net/route.h>
32304 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
32305   *
32306   *     On SMP we have one ICMP socket per-cpu.
32307   */
32308 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
32310  static struct sock *icmp_sk(struct net *net)
32312         return *this_cpu_ptr(net->ipv4.icmp_sk);
32313 @@ -215,12 +219,18 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
32315         local_bh_disable();
32317 +       if (!local_trylock(icmp_sk_lock)) {
32318 +               local_bh_enable();
32319 +               return NULL;
32320 +       }
32322         sk = icmp_sk(net);
32324         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
32325                 /* This can happen if the output path signals a
32326                  * dst_link_failure() for an outgoing ICMP packet.
32327                  */
32328 +               local_unlock(icmp_sk_lock);
32329                 local_bh_enable();
32330                 return NULL;
32331         }
32332 @@ -230,6 +240,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
32333  static inline void icmp_xmit_unlock(struct sock *sk)
32335         spin_unlock_bh(&sk->sk_lock.slock);
32336 +       local_unlock(icmp_sk_lock);
32339  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
32340 @@ -358,6 +369,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
32341         struct sock *sk;
32342         struct sk_buff *skb;
32344 +       local_lock(icmp_sk_lock);
32345         sk = icmp_sk(dev_net((*rt)->dst.dev));
32346         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
32347                            icmp_param->data_len+icmp_param->head_len,
32348 @@ -380,6 +392,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
32349                 skb->ip_summed = CHECKSUM_NONE;
32350                 ip_push_pending_frames(sk, fl4);
32351         }
32352 +       local_unlock(icmp_sk_lock);
32355  /*
32356 @@ -890,6 +903,30 @@ static bool icmp_redirect(struct sk_buff *skb)
32357         return true;
32361 + * 32bit and 64bit have different timestamp length, so we check for
32362 + * the cookie at offset 20 and verify it is repeated at offset 50
32363 + */
32364 +#define CO_POS0                20
32365 +#define CO_POS1                50
32366 +#define CO_SIZE                sizeof(int)
32367 +#define ICMP_SYSRQ_SIZE        57
32370 + * We got an ICMP_SYSRQ_SIZE-sized ping request. Check for the cookie
32371 + * pattern and if it matches send the next byte as a trigger to sysrq.
32372 + */
32373 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
32375 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
32376 +       char *p = skb->data;
32378 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
32379 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
32380 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
32381 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
32384  /*
32385   *     Handle ICMP_ECHO ("ping") requests.
32386   *
32387 @@ -917,6 +954,11 @@ static bool icmp_echo(struct sk_buff *skb)
32388                 icmp_param.data_len        = skb->len;
32389                 icmp_param.head_len        = sizeof(struct icmphdr);
32390                 icmp_reply(&icmp_param, skb);
32392 +               if (skb->len == ICMP_SYSRQ_SIZE &&
32393 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
32394 +                       icmp_check_sysrq(net, skb);
32395 +               }
32396         }
32397         /* should there be an ICMP stat for ignored echos? */
32398         return true;
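The sysrq-over-ICMP hook fires when an echo request whose remaining length equals ICMP_SYSRQ_SIZE (57 bytes) carries the configured cookie, in network byte order, at offsets 20 and 50 of skb->data, with the byte after each copy selecting the sysrq command; the cookie value comes from the net.ipv4.icmp_echo_sysrq sysctl added in the sysctl_net_ipv4.c hunk below. A userspace-side sketch of a payload that layout would accept, purely illustrative: the function and macro names are invented, and how the 57 bytes line up inside the on-wire packet depends on where skb->data points in this path.

    #include <arpa/inet.h>
    #include <string.h>

    #define DEMO_CO_POS0    20
    #define DEMO_CO_POS1    50
    #define DEMO_CO_SIZE    sizeof(int)
    #define DEMO_PKT_SIZE   57      /* ICMP_SYSRQ_SIZE in the hunk above */

    /* Big-endian cookie at offsets 20 and 50, sysrq command byte right
     * after each copy, everything else zeroed. */
    static void demo_build_sysrq_payload(unsigned char *buf,
                                         unsigned int cookie, char cmd)
    {
            unsigned int be_cookie = htonl(cookie);

            memset(buf, 0, DEMO_PKT_SIZE);
            memcpy(buf + DEMO_CO_POS0, &be_cookie, DEMO_CO_SIZE);
            memcpy(buf + DEMO_CO_POS1, &be_cookie, DEMO_CO_SIZE);
            buf[DEMO_CO_POS0 + DEMO_CO_SIZE] = cmd;
            buf[DEMO_CO_POS1 + DEMO_CO_SIZE] = cmd;
    }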
32399 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
32400 index 70fb352e317f..1bcd436709a4 100644
32401 --- a/net/ipv4/sysctl_net_ipv4.c
32402 +++ b/net/ipv4/sysctl_net_ipv4.c
32403 @@ -817,6 +817,13 @@ static struct ctl_table ipv4_net_table[] = {
32404                 .mode           = 0644,
32405                 .proc_handler   = proc_dointvec
32406         },
32407 +       {
32408 +               .procname       = "icmp_echo_sysrq",
32409 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
32410 +               .maxlen         = sizeof(int),
32411 +               .mode           = 0644,
32412 +               .proc_handler   = proc_dointvec
32413 +       },
32414         {
32415                 .procname       = "icmp_ignore_bogus_error_responses",
32416                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
32417 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
32418 index 61c93a93f228..4df48ae72abc 100644
32419 --- a/net/ipv4/tcp_ipv4.c
32420 +++ b/net/ipv4/tcp_ipv4.c
32421 @@ -62,6 +62,7 @@
32422  #include <linux/init.h>
32423  #include <linux/times.h>
32424  #include <linux/slab.h>
32425 +#include <linux/locallock.h>
32427  #include <net/net_namespace.h>
32428  #include <net/icmp.h>
32429 @@ -570,6 +571,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
32431  EXPORT_SYMBOL(tcp_v4_send_check);
32433 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
32434  /*
32435   *     This routine will send an RST to the other tcp.
32436   *
32437 @@ -691,10 +693,13 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
32438                 arg.bound_dev_if = sk->sk_bound_dev_if;
32440         arg.tos = ip_hdr(skb)->tos;
32442 +       local_lock(tcp_sk_lock);
32443         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
32444                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
32445                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
32446                               &arg, arg.iov[0].iov_len);
32447 +       local_unlock(tcp_sk_lock);
32449         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
32450         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
32451 @@ -776,10 +781,12 @@ static void tcp_v4_send_ack(struct net *net,
32452         if (oif)
32453                 arg.bound_dev_if = oif;
32454         arg.tos = tos;
32455 +       local_lock(tcp_sk_lock);
32456         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
32457                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
32458                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
32459                               &arg, arg.iov[0].iov_len);
32460 +       local_unlock(tcp_sk_lock);
32462         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
32464 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
32465 index 3bcabc2ba4a6..c3c798388ab8 100644
32466 --- a/net/mac80211/rx.c
32467 +++ b/net/mac80211/rx.c
32468 @@ -3605,7 +3605,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
32469         struct ieee80211_supported_band *sband;
32470         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
32472 -       WARN_ON_ONCE(softirq_count() == 0);
32473 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
32475         if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
32476                 goto drop;
32477 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
32478 index f39276d1c2d7..10880c89d62f 100644
32479 --- a/net/netfilter/core.c
32480 +++ b/net/netfilter/core.c
32481 @@ -22,11 +22,17 @@
32482  #include <linux/proc_fs.h>
32483  #include <linux/mutex.h>
32484  #include <linux/slab.h>
32485 +#include <linux/locallock.h>
32486  #include <net/net_namespace.h>
32487  #include <net/sock.h>
32489  #include "nf_internals.h"
32491 +#ifdef CONFIG_PREEMPT_RT_BASE
32492 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
32493 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
32494 +#endif
32496  static DEFINE_MUTEX(afinfo_mutex);
32498  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
32499 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
32500 index 92ca3e106c2b..ddb4970c3e27 100644
32501 --- a/net/packet/af_packet.c
32502 +++ b/net/packet/af_packet.c
32503 @@ -63,6 +63,7 @@
32504  #include <linux/if_packet.h>
32505  #include <linux/wireless.h>
32506  #include <linux/kernel.h>
32507 +#include <linux/delay.h>
32508  #include <linux/kmod.h>
32509  #include <linux/slab.h>
32510  #include <linux/vmalloc.h>
32511 @@ -694,7 +695,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
32512         if (BLOCK_NUM_PKTS(pbd)) {
32513                 while (atomic_read(&pkc->blk_fill_in_prog)) {
32514                         /* Waiting for skb_copy_bits to finish... */
32515 -                       cpu_relax();
32516 +                       cpu_chill();
32517                 }
32518         }
32520 @@ -956,7 +957,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
32521                 if (!(status & TP_STATUS_BLK_TMO)) {
32522                         while (atomic_read(&pkc->blk_fill_in_prog)) {
32523                                 /* Waiting for skb_copy_bits to finish... */
32524 -                               cpu_relax();
32525 +                               cpu_chill();
32526                         }
32527                 }
32528                 prb_close_block(pkc, pbd, po, status);
32529 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
32530 index a2340748ec86..19123a97b354 100644
32531 --- a/net/rds/ib_rdma.c
32532 +++ b/net/rds/ib_rdma.c
32533 @@ -34,6 +34,7 @@
32534  #include <linux/slab.h>
32535  #include <linux/rculist.h>
32536  #include <linux/llist.h>
32537 +#include <linux/delay.h>
32539  #include "rds.h"
32540  #include "ib.h"
32541 @@ -313,7 +314,7 @@ static inline void wait_clean_list_grace(void)
32542         for_each_online_cpu(cpu) {
32543                 flag = &per_cpu(clean_list_grace, cpu);
32544                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
32545 -                       cpu_relax();
32546 +                       cpu_chill();
32547         }
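The two busy-wait conversions above (af_packet and RDS) swap cpu_relax() for cpu_chill(). On RT the flag being polled can belong to a task that has been preempted, so pure spinning may starve the owner; cpu_chill() sleeps briefly instead, which is also why both hunks add <linux/delay.h>, where the RT tree declares it. A minimal sketch of the idiom (the demo function and its argument are invented):

    #include <linux/atomic.h>
    #include <linux/delay.h>        /* cpu_chill() in the RT tree */

    static void demo_wait_until_idle(atomic_t *in_progress)
    {
            while (atomic_read(in_progress))
                    cpu_chill();    /* sleep briefly instead of spinning */
    }

The dev_deactivate_many() hunk just below makes the equivalent change for a yield() loop, sleeping via msleep(1).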
32550 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
32551 index aa4725038f94..00b81cab28f3 100644
32552 --- a/net/sched/sch_generic.c
32553 +++ b/net/sched/sch_generic.c
32554 @@ -893,7 +893,7 @@ void dev_deactivate_many(struct list_head *head)
32555         /* Wait for outstanding qdisc_run calls. */
32556         list_for_each_entry(dev, head, close_list)
32557                 while (some_qdisc_is_busy(dev))
32558 -                       yield();
32559 +                       msleep(1);
32562  void dev_deactivate(struct net_device *dev)
32563 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
32564 index a6cbb2104667..5b69bb580617 100644
32565 --- a/net/sunrpc/svc_xprt.c
32566 +++ b/net/sunrpc/svc_xprt.c
32567 @@ -340,7 +340,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
32568                 goto out;
32569         }
32571 -       cpu = get_cpu();
32572 +       cpu = get_cpu_light();
32573         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
32575         atomic_long_inc(&pool->sp_stats.packets);
32576 @@ -376,7 +376,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
32578                 atomic_long_inc(&pool->sp_stats.threads_woken);
32579                 wake_up_process(rqstp->rq_task);
32580 -               put_cpu();
32581 +               put_cpu_light();
32582                 goto out;
32583         }
32584         rcu_read_unlock();
32585 @@ -397,7 +397,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
32586                 goto redo_search;
32587         }
32588         rqstp = NULL;
32589 -       put_cpu();
32590 +       put_cpu_light();
32591  out:
32592         trace_svc_xprt_do_enqueue(xprt, rqstp);
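svc_xprt_do_enqueue() switches from get_cpu()/put_cpu() to get_cpu_light()/put_cpu_light(). The CPU number is only used to pick a pool, so on RT it is enough to keep the task from migrating without disabling preemption across the wakeup; on !RT the _light variants behave like the plain ones. A small sketch of the idiom, with an invented dispatch function:

    #include <linux/smp.h>
    #include <linux/workqueue.h>

    static void demo_dispatch(struct workqueue_struct *wq,
                              struct work_struct *work)
    {
            int cpu;

            /* RT: migration is disabled but the task stays preemptible;
             * !RT: equivalent to get_cpu(). */
            cpu = get_cpu_light();
            queue_work_on(cpu, wq, work);
            put_cpu_light();
    }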
32594 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
32595 index 6fdc97ef6023..523e0420d7f0 100755
32596 --- a/scripts/mkcompile_h
32597 +++ b/scripts/mkcompile_h
32598 @@ -4,7 +4,8 @@ TARGET=$1
32599  ARCH=$2
32600  SMP=$3
32601  PREEMPT=$4
32602 -CC=$5
32603 +RT=$5
32604 +CC=$6
32606  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
32608 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
32609  CONFIG_FLAGS=""
32610  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
32611  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
32612 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
32613  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
32615  # Truncate to maximum length
32616 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
32617 index 4ba64fd49759..34e50186885d 100644
32618 --- a/sound/core/pcm_native.c
32619 +++ b/sound/core/pcm_native.c
32620 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
32621  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
32623         if (!substream->pcm->nonatomic)
32624 -               local_irq_disable();
32625 +               local_irq_disable_nort();
32626         snd_pcm_stream_lock(substream);
32628  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
32629 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
32631         snd_pcm_stream_unlock(substream);
32632         if (!substream->pcm->nonatomic)
32633 -               local_irq_enable();
32634 +               local_irq_enable_nort();
32636  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
32638 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
32640         unsigned long flags = 0;
32641         if (!substream->pcm->nonatomic)
32642 -               local_irq_save(flags);
32643 +               local_irq_save_nort(flags);
32644         snd_pcm_stream_lock(substream);
32645         return flags;
32647 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
32649         snd_pcm_stream_unlock(substream);
32650         if (!substream->pcm->nonatomic)
32651 -               local_irq_restore(flags);
32652 +               local_irq_restore_nort(flags);
32654  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
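The PCM stream lock helpers move to the _nort() interrupt primitives: on !RT kernels local_irq_disable_nort() and friends are the ordinary IRQ operations, while on RT they compile away, since the stream lock taken right afterwards is a sleeping lock there and must not be acquired with interrupts hard-disabled. A small sketch of the shape, assuming the _nort() helpers are visible as they are in this tree and using an invented lock:

    #include <linux/interrupt.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(demo_lock);

    static void demo_enter(void)
    {
            local_irq_disable_nort();       /* no-op on RT */
            spin_lock(&demo_lock);
    }

    static void demo_leave(void)
    {
            spin_unlock(&demo_lock);
            local_irq_enable_nort();        /* no-op on RT */
    }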
32656 diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
32657 index 4f70d12e392d..9378d0919ed8 100644
32658 --- a/virt/kvm/async_pf.c
32659 +++ b/virt/kvm/async_pf.c
32660 @@ -98,8 +98,8 @@ static void async_pf_execute(struct work_struct *work)
32661          * This memory barrier pairs with prepare_to_wait's set_current_state()
32662          */
32663         smp_mb();
32664 -       if (waitqueue_active(&vcpu->wq))
32665 -               wake_up_interruptible(&vcpu->wq);
32666 +       if (swait_active(&vcpu->wq))
32667 +               swake_up(&vcpu->wq);
32669         mmput(mm);
32670         kvm_put_kvm(vcpu->kvm);
32671 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
32672 index d080f06fd8d9..a8d8150fd398 100644
32673 --- a/virt/kvm/kvm_main.c
32674 +++ b/virt/kvm/kvm_main.c
32675 @@ -228,8 +228,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
32676         vcpu->kvm = kvm;
32677         vcpu->vcpu_id = id;
32678         vcpu->pid = NULL;
32679 -       vcpu->halt_poll_ns = 0;
32680 -       init_waitqueue_head(&vcpu->wq);
32681 +       init_swait_queue_head(&vcpu->wq);
32682         kvm_async_pf_vcpu_init(vcpu);
32684         vcpu->pre_pcpu = -1;
32685 @@ -2008,7 +2007,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
32686  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
32688         ktime_t start, cur;
32689 -       DEFINE_WAIT(wait);
32690 +       DECLARE_SWAITQUEUE(wait);
32691         bool waited = false;
32692         u64 block_ns;
32694 @@ -2033,7 +2032,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
32695         kvm_arch_vcpu_blocking(vcpu);
32697         for (;;) {
32698 -               prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
32699 +               prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
32701                 if (kvm_vcpu_check_block(vcpu) < 0)
32702                         break;
32703 @@ -2042,7 +2041,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
32704                 schedule();
32705         }
32707 -       finish_wait(&vcpu->wq, &wait);
32708 +       finish_swait(&vcpu->wq, &wait);
32709         cur = ktime_get();
32711         kvm_arch_vcpu_unblocking(vcpu);
32712 @@ -2074,11 +2073,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
32714         int me;
32715         int cpu = vcpu->cpu;
32716 -       wait_queue_head_t *wqp;
32717 +       struct swait_queue_head *wqp;
32719         wqp = kvm_arch_vcpu_wq(vcpu);
32720 -       if (waitqueue_active(wqp)) {
32721 -               wake_up_interruptible(wqp);
32722 +       if (swait_active(wqp)) {
32723 +               swake_up(wqp);
32724                 ++vcpu->stat.halt_wakeup;
32725         }
32727 @@ -2179,7 +2178,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
32728                                 continue;
32729                         if (vcpu == me)
32730                                 continue;
32731 -                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
32732 +                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
32733                                 continue;
32734                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
32735                                 continue;
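The KVM hunks convert the vCPU wait queue from wait_queue_head_t to the simple-wait (swait) API. The usual motivation on RT is that swake_up() only takes a raw lock and wakes a single waiter, so it remains usable from the IRQ-off and preempt-off paths KVM wakes vCPUs from, where the regular waitqueue wake path is not. A sketch of the wait/wake pairing the hunks switch to, with invented names and a trivial condition (a real waiter would also check signal_pending()):

    #include <linux/swait.h>
    #include <linux/sched.h>
    #include <linux/compiler.h>

    static struct swait_queue_head demo_wq;
    static bool demo_ready;

    static void demo_init(void)
    {
            init_swait_queue_head(&demo_wq);
    }

    static void demo_wait(void)
    {
            DECLARE_SWAITQUEUE(wait);

            for (;;) {
                    prepare_to_swait(&demo_wq, &wait, TASK_INTERRUPTIBLE);
                    if (READ_ONCE(demo_ready))
                            break;
                    schedule();
            }
            finish_swait(&demo_wq, &wait);
    }

    static void demo_wake(void)
    {
            WRITE_ONCE(demo_ready, true);
            if (swait_active(&demo_wq))
                    swake_up(&demo_wq);
    }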