#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

/*
 * TLB flushing, formerly SMP-only
 *
 * These mean you can really definitely utterly forget about
 * writing to user space from interrupts. (It's not allowed anyway.)
 *
 * Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 * More scalable flush, from Andi Kleen
 *
 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

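/*
 * Switch this CPU away from its current user mm and onto init_mm.
 * Called from lazy-TLB paths (e.g. the idle code); it must not be
 * called while the CPU is actively using a user mm (TLBSTATE_OK).
 */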
void leave_mm(int cpu)
{
        struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

        /*
         * It's plausible that we're in lazy TLB mode while our mm is init_mm.
         * If so, our callers still expect us to flush the TLB, but there
         * aren't any user TLB entries in init_mm to worry about.
         *
         * This needs to happen before any other sanity checks due to
         * intel_idle's shenanigans.
         */
        if (loaded_mm == &init_mm)
                return;

        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
                BUG();

        switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);

void switch_mm(struct mm_struct *prev, struct mm_struct *next,
               struct task_struct *tsk)
{
        unsigned long flags;

        local_irq_save(flags);
        switch_mm_irqs_off(prev, next, tsk);
        local_irq_restore(flags);
}

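/*
 * The body of the context switch.  Callers must have interrupts disabled;
 * switch_mm() above wraps this with local_irq_save()/local_irq_restore().
 */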
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        struct task_struct *tsk)
{
        unsigned cpu = smp_processor_id();
        struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);

        /*
         * NB: The scheduler will call us with prev == next when
         * switching from lazy TLB mode to normal mode if active_mm
         * isn't changing.  When this happens, there is no guarantee
         * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
         *
         * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
         */

        this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);

        if (real_prev == next) {
                /*
                 * There's nothing to do: we always keep the per-mm control
                 * regs in sync with cpu_tlbstate.loaded_mm.  Just
                 * sanity-check mm_cpumask.
                 */
                if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
                        cpumask_set_cpu(cpu, mm_cpumask(next));
                return;
        }

        if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                /*
                 * If our current stack is in vmalloc space and isn't
                 * mapped in the new pgd, we'll double-fault.  Forcibly
                 * map it.
                 */
                unsigned int stack_pgd_index = pgd_index(current_stack_pointer());

                pgd_t *pgd = next->pgd + stack_pgd_index;

                if (unlikely(pgd_none(*pgd)))
                        set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
        }

        this_cpu_write(cpu_tlbstate.loaded_mm, next);

        WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
        cpumask_set_cpu(cpu, mm_cpumask(next));

        /*
         * Re-load page tables.
         *
         * This logic has an ordering constraint:
         *
         *  CPU 0: Write to a PTE for 'next'
         *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
         *  CPU 1: set bit 1 in next's mm_cpumask
         *  CPU 1: load from the PTE that CPU 0 writes (implicit)
         *
         * We need to prevent an outcome in which CPU 1 observes
         * the new PTE value and CPU 0 observes bit 1 clear in
         * mm_cpumask.  (If that occurs, then the IPI will never
         * be sent, and CPU 0's TLB will contain a stale entry.)
         *
         * The bad outcome can occur if either CPU's load is
         * reordered before that CPU's store, so both CPUs must
         * execute full barriers to prevent this from happening.
         *
         * Thus, switch_mm needs a full barrier between the
         * store to mm_cpumask and any operation that could load
         * from next->pgd.  TLB fills are special and can happen
         * due to instruction fetches or for no reason at all,
         * and neither LOCK nor MFENCE orders them.
         * Fortunately, load_cr3() is serializing and gives the
         * ordering guarantee we need.
         */
        load_cr3(next->pgd);

        /*
         * This gets called via leave_mm() in the idle path where RCU
         * functions differently.  Tracing normally uses RCU, so we have to
         * call the tracepoint specially here.
         */
        trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

        /* Stop flush ipis for the previous mm */
        WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
                     real_prev != &init_mm);
        cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

        /* Load per-mm CR4 and LDTR state */
        load_mm_cr4(next);
        switch_ldt(real_prev, next);
}

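/*
 * Common worker for local and remote TLB flushes: flushes either the
 * whole TLB or just the requested range, or punts to leave_mm() if this
 * CPU is in lazy TLB mode.
 */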
static void flush_tlb_func_common(const struct flush_tlb_info *f,
                                  bool local, enum tlb_flush_reason reason)
{
        /* This code cannot presently handle being reentered. */
        VM_WARN_ON(!irqs_disabled());

        if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
                leave_mm(smp_processor_id());
                return;
        }

        if (f->end == TLB_FLUSH_ALL) {
                local_flush_tlb();
                if (local)
                        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                trace_tlb_flush(reason, TLB_FLUSH_ALL);
        } else {
                unsigned long addr;
                unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;

                addr = f->start;
                while (addr < f->end) {
                        __flush_tlb_single(addr);
                        addr += PAGE_SIZE;
                }
                if (local)
                        count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
                trace_tlb_flush(reason, nr_pages);
        }
}

static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
        const struct flush_tlb_info *f = info;

        flush_tlb_func_common(f, true, reason);
}

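/*
 * IPI handler side: runs on remote CPUs via smp_call_function_many()
 * from native_flush_tlb_others() below.
 */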
static void flush_tlb_func_remote(void *info)
{
        const struct flush_tlb_info *f = info;

        inc_irq_stat(irq_tlb_count);

        if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
                return;

        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}

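/*
 * Send TLB-flush IPIs to the CPUs in @cpumask.  On UV systems the mask is
 * first handed to uv_flush_tlb_others(), which returns the subset of CPUs
 * that still need an IPI.
 */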
void native_flush_tlb_others(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
        if (info->end == TLB_FLUSH_ALL)
                trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
        else
                trace_tlb_flush(TLB_REMOTE_SEND_IPI,
                                (info->end - info->start) >> PAGE_SHIFT);

        if (is_uv_system()) {
                unsigned int cpu;

                cpu = smp_processor_id();
                cpumask = uv_flush_tlb_others(cpumask, info);
                if (cpumask)
                        smp_call_function_many(cpumask, flush_tlb_func_remote,
                                               (void *)info, 1);
                return;
        }
        smp_call_function_many(cpumask, flush_tlb_func_remote,
                               (void *)info, 1);
}

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;

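/*
 * Flush a user address range of @mm on every CPU that may have it cached.
 * Small ranges (at most tlb_single_page_flush_ceiling pages, and not
 * hugetlb) are flushed page by page; anything larger becomes a full flush.
 */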
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                        unsigned long end, unsigned long vmflag)
{
        int cpu;

        struct flush_tlb_info info = {
                .mm = mm,
        };

        cpu = get_cpu();

        /* Synchronize with switch_mm. */
        smp_mb();

        /* Should we flush just the requested range? */
        if ((end != TLB_FLUSH_ALL) &&
            !(vmflag & VM_HUGETLB) &&
            ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
                info.start = start;
                info.end = end;
        } else {
                info.start = 0UL;
                info.end = TLB_FLUSH_ALL;
        }

        if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
                VM_WARN_ON(irqs_disabled());
                local_irq_disable();
                flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
                local_irq_enable();
        }

        if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
                flush_tlb_others(mm_cpumask(mm), &info);

        put_cpu();
}

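/*
 * on_each_cpu() callback for flush_tlb_all() below: flushes this CPU's
 * entire TLB and drops a lazy mm if one is loaded.
 */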
static void do_flush_tlb_all(void *info)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        __flush_tlb_all();
        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
                leave_mm(smp_processor_id());
}

void flush_tlb_all(void)
{
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
        on_each_cpu(do_flush_tlb_all, NULL, 1);
}

static void do_kernel_range_flush(void *info)
{
        struct flush_tlb_info *f = info;
        unsigned long addr;

        /* Flush the range one page at a time with 'invlpg'. */
        for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
                __flush_tlb_single(addr);
}

void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
        /* Balance against a user space task's flush; stay a bit conservative. */
        if (end == TLB_FLUSH_ALL ||
            (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
                on_each_cpu(do_flush_tlb_all, NULL, 1);
        } else {
                struct flush_tlb_info info;

                info.start = start;
                info.end = end;
                on_each_cpu(do_kernel_range_flush, &info, 1);
        }
}

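/*
 * Flush everything recorded in a deferred (batched) unmap: a full flush
 * on every CPU in batch->cpumask, after which the mask is cleared.
 */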
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
        struct flush_tlb_info info = {
                .mm = NULL,
                .start = 0UL,
                .end = TLB_FLUSH_ALL,
        };

        int cpu = get_cpu();

        if (cpumask_test_cpu(cpu, &batch->cpumask)) {
                VM_WARN_ON(irqs_disabled());
                local_irq_disable();
                flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
                local_irq_enable();
        }

        if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
                flush_tlb_others(&batch->cpumask, &info);
        cpumask_clear(&batch->cpumask);

        put_cpu();
}

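/*
 * debugfs interface for reading and tuning tlb_single_page_flush_ceiling
 * at runtime; the file is created under arch_debugfs_dir below.
 */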
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
                                  size_t count, loff_t *ppos)
{
        char buf[32];
        unsigned int len;

        len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

static ssize_t tlbflush_write_file(struct file *file,
                                   const char __user *user_buf,
                                   size_t count, loff_t *ppos)
{
        char buf[32];
        ssize_t len;
        int ceiling;

        len = min(count, sizeof(buf) - 1);
        if (copy_from_user(buf, user_buf, len))
                return -EFAULT;

        buf[len] = '\0';
        if (kstrtoint(buf, 0, &ceiling))
                return -EINVAL;

        if (ceiling < 0)
                return -EINVAL;

        tlb_single_page_flush_ceiling = ceiling;
        return count;
}

static const struct file_operations fops_tlbflush = {
        .read = tlbflush_read_file,
        .write = tlbflush_write_file,
        .llseek = default_llseek,
};

static int __init create_tlb_single_page_flush_ceiling(void)
{
        debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
                            arch_debugfs_dir, NULL, &fops_tlbflush);
        return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);