1 #define pr_fmt(fmt) "SMP alternatives: " fmt
3 #include <linux/module.h>
4 #include <linux/sched.h>
5 #include <linux/mutex.h>
6 #include <linux/list.h>
7 #include <linux/stringify.h>
8 #include <linux/kprobes.h>
10 #include <linux/vmalloc.h>
11 #include <linux/memory.h>
12 #include <linux/stop_machine.h>
13 #include <linux/slab.h>
14 #include <asm/alternative.h>
15 #include <asm/sections.h>
16 #include <asm/pgtable.h>
19 #include <asm/cacheflush.h>
20 #include <asm/tlbflush.h>
22 #include <asm/fixmap.h>
24 #define MAX_PATCH_LEN (255-1)
26 #ifdef CONFIG_HOTPLUG_CPU
27 static int smp_alt_once
;
29 static int __init
bootonly(char *str
)
34 __setup("smp-alt-boot", bootonly
);
36 #define smp_alt_once 1
39 static int __initdata_or_module debug_alternative
;
41 static int __init
debug_alt(char *str
)
43 debug_alternative
= 1;
46 __setup("debug-alternative", debug_alt
);
48 static int noreplace_smp
;
50 static int __init
setup_noreplace_smp(char *str
)
55 __setup("noreplace-smp", setup_noreplace_smp
);
57 #ifdef CONFIG_PARAVIRT
58 static int __initdata_or_module noreplace_paravirt
= 0;
60 static int __init
setup_noreplace_paravirt(char *str
)
62 noreplace_paravirt
= 1;
65 __setup("noreplace-paravirt", setup_noreplace_paravirt
);
68 #define DPRINTK(fmt, ...) \
70 if (debug_alternative) \
71 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
75 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
76 * that correspond to that nop. Getting from one nop to the next, we
77 * add to the array the offset that is equal to the sum of all sizes of
78 * nops preceding the one we are after.
80 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
81 * nice symmetry of sizes of the previous nops.
83 #if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
84 static const unsigned char intelnops
[] =
96 static const unsigned char * const intel_nops
[ASM_NOP_MAX
+2] =
102 intelnops
+ 1 + 2 + 3,
103 intelnops
+ 1 + 2 + 3 + 4,
104 intelnops
+ 1 + 2 + 3 + 4 + 5,
105 intelnops
+ 1 + 2 + 3 + 4 + 5 + 6,
106 intelnops
+ 1 + 2 + 3 + 4 + 5 + 6 + 7,
107 intelnops
+ 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
112 static const unsigned char k8nops
[] =
124 static const unsigned char * const k8_nops
[ASM_NOP_MAX
+2] =
131 k8nops
+ 1 + 2 + 3 + 4,
132 k8nops
+ 1 + 2 + 3 + 4 + 5,
133 k8nops
+ 1 + 2 + 3 + 4 + 5 + 6,
134 k8nops
+ 1 + 2 + 3 + 4 + 5 + 6 + 7,
135 k8nops
+ 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
139 #if defined(K7_NOP1) && !defined(CONFIG_X86_64)
140 static const unsigned char k7nops
[] =
152 static const unsigned char * const k7_nops
[ASM_NOP_MAX
+2] =
159 k7nops
+ 1 + 2 + 3 + 4,
160 k7nops
+ 1 + 2 + 3 + 4 + 5,
161 k7nops
+ 1 + 2 + 3 + 4 + 5 + 6,
162 k7nops
+ 1 + 2 + 3 + 4 + 5 + 6 + 7,
163 k7nops
+ 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
168 static const unsigned char __initconst_or_module p6nops
[] =
180 static const unsigned char * const p6_nops
[ASM_NOP_MAX
+2] =
187 p6nops
+ 1 + 2 + 3 + 4,
188 p6nops
+ 1 + 2 + 3 + 4 + 5,
189 p6nops
+ 1 + 2 + 3 + 4 + 5 + 6,
190 p6nops
+ 1 + 2 + 3 + 4 + 5 + 6 + 7,
191 p6nops
+ 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
195 /* Initialize these to a safe default */
197 const unsigned char * const *ideal_nops
= p6_nops
;
199 const unsigned char * const *ideal_nops
= intel_nops
;
202 void __init
arch_init_ideal_nops(void)
204 switch (boot_cpu_data
.x86_vendor
) {
205 case X86_VENDOR_INTEL
:
207 * Due to a decoder implementation quirk, some
208 * specific Intel CPUs actually perform better with
209 * the "k8_nops" than with the SDM-recommended NOPs.
211 if (boot_cpu_data
.x86
== 6 &&
212 boot_cpu_data
.x86_model
>= 0x0f &&
213 boot_cpu_data
.x86_model
!= 0x1c &&
214 boot_cpu_data
.x86_model
!= 0x26 &&
215 boot_cpu_data
.x86_model
!= 0x27 &&
216 boot_cpu_data
.x86_model
< 0x30) {
217 ideal_nops
= k8_nops
;
218 } else if (boot_cpu_has(X86_FEATURE_NOPL
)) {
219 ideal_nops
= p6_nops
;
222 ideal_nops
= k8_nops
;
224 ideal_nops
= intel_nops
;
230 ideal_nops
= k8_nops
;
232 if (boot_cpu_has(X86_FEATURE_K8
))
233 ideal_nops
= k8_nops
;
234 else if (boot_cpu_has(X86_FEATURE_K7
))
235 ideal_nops
= k7_nops
;
237 ideal_nops
= intel_nops
;
242 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
243 static void __init_or_module
add_nops(void *insns
, unsigned int len
)
246 unsigned int noplen
= len
;
247 if (noplen
> ASM_NOP_MAX
)
248 noplen
= ASM_NOP_MAX
;
249 memcpy(insns
, ideal_nops
[noplen
], noplen
);
255 extern struct alt_instr __alt_instructions
[], __alt_instructions_end
[];
256 extern s32 __smp_locks
[], __smp_locks_end
[];
257 void *text_poke_early(void *addr
, const void *opcode
, size_t len
);
259 /* Replace instructions with better alternatives for this CPU type.
260 This runs before SMP is initialized to avoid SMP problems with
261 self modifying code. This implies that asymmetric systems where
262 APs have fewer capabilities than the boot processor are not handled.
263 Tough. Make sure you disable such features by hand. */
265 void __init_or_module
apply_alternatives(struct alt_instr
*start
,
266 struct alt_instr
*end
)
269 u8
*instr
, *replacement
;
270 u8 insnbuf
[MAX_PATCH_LEN
];
272 DPRINTK("%s: alt table %p -> %p\n", __func__
, start
, end
);
274 * The scan order should be from start to end. A later scanned
275 * alternative code can overwrite a previous scanned alternative code.
276 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
279 * So be careful if you want to change the scan order to any other
282 for (a
= start
; a
< end
; a
++) {
283 instr
= (u8
*)&a
->instr_offset
+ a
->instr_offset
;
284 replacement
= (u8
*)&a
->repl_offset
+ a
->repl_offset
;
285 BUG_ON(a
->replacementlen
> a
->instrlen
);
286 BUG_ON(a
->instrlen
> sizeof(insnbuf
));
287 BUG_ON(a
->cpuid
>= NCAPINTS
*32);
288 if (!boot_cpu_has(a
->cpuid
))
291 memcpy(insnbuf
, replacement
, a
->replacementlen
);
293 /* 0xe8 is a relative jump; fix the offset. */
294 if (*insnbuf
== 0xe8 && a
->replacementlen
== 5)
295 *(s32
*)(insnbuf
+ 1) += replacement
- instr
;
297 add_nops(insnbuf
+ a
->replacementlen
,
298 a
->instrlen
- a
->replacementlen
);
300 text_poke_early(instr
, insnbuf
, a
->instrlen
);
306 static void alternatives_smp_lock(const s32
*start
, const s32
*end
,
307 u8
*text
, u8
*text_end
)
311 mutex_lock(&text_mutex
);
312 for (poff
= start
; poff
< end
; poff
++) {
313 u8
*ptr
= (u8
*)poff
+ *poff
;
315 if (!*poff
|| ptr
< text
|| ptr
>= text_end
)
317 /* turn DS segment override prefix into lock prefix */
319 text_poke(ptr
, ((unsigned char []){0xf0}), 1);
321 mutex_unlock(&text_mutex
);
324 static void alternatives_smp_unlock(const s32
*start
, const s32
*end
,
325 u8
*text
, u8
*text_end
)
332 mutex_lock(&text_mutex
);
333 for (poff
= start
; poff
< end
; poff
++) {
334 u8
*ptr
= (u8
*)poff
+ *poff
;
336 if (!*poff
|| ptr
< text
|| ptr
>= text_end
)
338 /* turn lock prefix into DS segment override prefix */
340 text_poke(ptr
, ((unsigned char []){0x3E}), 1);
342 mutex_unlock(&text_mutex
);
345 struct smp_alt_module
{
346 /* what is this ??? */
350 /* ptrs to lock prefixes */
352 const s32
*locks_end
;
354 /* .text segment, needed to avoid patching init code ;) */
358 struct list_head next
;
360 static LIST_HEAD(smp_alt_modules
);
361 static DEFINE_MUTEX(smp_alt
);
362 static int smp_mode
= 1; /* protected by smp_alt */
364 void __init_or_module
alternatives_smp_module_add(struct module
*mod
,
366 void *locks
, void *locks_end
,
367 void *text
, void *text_end
)
369 struct smp_alt_module
*smp
;
375 if (boot_cpu_has(X86_FEATURE_UP
))
376 alternatives_smp_unlock(locks
, locks_end
,
381 smp
= kzalloc(sizeof(*smp
), GFP_KERNEL
);
383 return; /* we'll run the (safe but slow) SMP code then ... */
388 smp
->locks_end
= locks_end
;
390 smp
->text_end
= text_end
;
391 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
392 __func__
, smp
->locks
, smp
->locks_end
,
393 smp
->text
, smp
->text_end
, smp
->name
);
395 mutex_lock(&smp_alt
);
396 list_add_tail(&smp
->next
, &smp_alt_modules
);
397 if (boot_cpu_has(X86_FEATURE_UP
))
398 alternatives_smp_unlock(smp
->locks
, smp
->locks_end
,
399 smp
->text
, smp
->text_end
);
400 mutex_unlock(&smp_alt
);
403 void __init_or_module
alternatives_smp_module_del(struct module
*mod
)
405 struct smp_alt_module
*item
;
407 if (smp_alt_once
|| noreplace_smp
)
410 mutex_lock(&smp_alt
);
411 list_for_each_entry(item
, &smp_alt_modules
, next
) {
412 if (mod
!= item
->mod
)
414 list_del(&item
->next
);
415 mutex_unlock(&smp_alt
);
416 DPRINTK("%s: %s\n", __func__
, item
->name
);
420 mutex_unlock(&smp_alt
);
423 bool skip_smp_alternatives
;
424 void alternatives_smp_switch(int smp
)
426 struct smp_alt_module
*mod
;
428 #ifdef CONFIG_LOCKDEP
430 * Older binutils section handling bug prevented
431 * alternatives-replacement from working reliably.
433 * If this still occurs then you should see a hang
434 * or crash shortly after this line:
436 pr_info("lockdep: fixing up alternatives\n");
439 if (noreplace_smp
|| smp_alt_once
|| skip_smp_alternatives
)
441 BUG_ON(!smp
&& (num_online_cpus() > 1));
443 mutex_lock(&smp_alt
);
446 * Avoid unnecessary switches because it forces JIT based VMs to
447 * throw away all cached translations, which can be quite costly.
449 if (smp
== smp_mode
) {
452 pr_info("switching to SMP code\n");
453 clear_cpu_cap(&boot_cpu_data
, X86_FEATURE_UP
);
454 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP
);
455 list_for_each_entry(mod
, &smp_alt_modules
, next
)
456 alternatives_smp_lock(mod
->locks
, mod
->locks_end
,
457 mod
->text
, mod
->text_end
);
459 pr_info("switching to UP code\n");
460 set_cpu_cap(&boot_cpu_data
, X86_FEATURE_UP
);
461 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP
);
462 list_for_each_entry(mod
, &smp_alt_modules
, next
)
463 alternatives_smp_unlock(mod
->locks
, mod
->locks_end
,
464 mod
->text
, mod
->text_end
);
467 mutex_unlock(&smp_alt
);
470 /* Return 1 if the address range is reserved for smp-alternatives */
471 int alternatives_text_reserved(void *start
, void *end
)
473 struct smp_alt_module
*mod
;
475 u8
*text_start
= start
;
478 list_for_each_entry(mod
, &smp_alt_modules
, next
) {
479 if (mod
->text
> text_end
|| mod
->text_end
< text_start
)
481 for (poff
= mod
->locks
; poff
< mod
->locks_end
; poff
++) {
482 const u8
*ptr
= (const u8
*)poff
+ *poff
;
484 if (text_start
<= ptr
&& text_end
> ptr
)
493 #ifdef CONFIG_PARAVIRT
494 void __init_or_module
apply_paravirt(struct paravirt_patch_site
*start
,
495 struct paravirt_patch_site
*end
)
497 struct paravirt_patch_site
*p
;
498 char insnbuf
[MAX_PATCH_LEN
];
500 if (noreplace_paravirt
)
503 for (p
= start
; p
< end
; p
++) {
506 BUG_ON(p
->len
> MAX_PATCH_LEN
);
507 /* prep the buffer with the original instructions */
508 memcpy(insnbuf
, p
->instr
, p
->len
);
509 used
= pv_init_ops
.patch(p
->instrtype
, p
->clobbers
, insnbuf
,
510 (unsigned long)p
->instr
, p
->len
);
512 BUG_ON(used
> p
->len
);
514 /* Pad the rest with nops */
515 add_nops(insnbuf
+ used
, p
->len
- used
);
516 text_poke_early(p
->instr
, insnbuf
, p
->len
);
519 extern struct paravirt_patch_site __start_parainstructions
[],
520 __stop_parainstructions
[];
521 #endif /* CONFIG_PARAVIRT */
523 void __init
alternative_instructions(void)
525 /* The patching is not fully atomic, so try to avoid local interruptions
526 that might execute the to be patched code.
527 Other CPUs are not running. */
531 * Don't stop machine check exceptions while patching.
532 * MCEs only happen when something got corrupted and in this
533 * case we must do something about the corruption.
534 * Ignoring it is worse than an unlikely patching race.
535 * Also machine checks tend to be broadcast and if one CPU
536 * goes into machine check the others follow quickly, so we don't
537 * expect a machine check to cause undue problems during code
541 apply_alternatives(__alt_instructions
, __alt_instructions_end
);
543 /* switch to patch-once-at-boottime-only mode and free the
544 * tables in case we know the number of CPUs will never ever
546 #ifdef CONFIG_HOTPLUG_CPU
547 if (num_possible_cpus() < 2)
553 if (1 == num_possible_cpus()) {
554 pr_info("switching to UP code\n");
555 set_cpu_cap(&boot_cpu_data
, X86_FEATURE_UP
);
556 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP
);
558 alternatives_smp_unlock(__smp_locks
, __smp_locks_end
,
562 alternatives_smp_module_add(NULL
, "core kernel",
563 __smp_locks
, __smp_locks_end
,
566 /* Only switch to UP mode if we don't immediately boot others */
567 if (num_present_cpus() == 1 || setup_max_cpus
<= 1)
568 alternatives_smp_switch(0);
571 apply_paravirt(__parainstructions
, __parainstructions_end
);
574 free_init_pages("SMP alternatives",
575 (unsigned long)__smp_locks
,
576 (unsigned long)__smp_locks_end
);
582 * text_poke_early - Update instructions on a live kernel at boot time
583 * @addr: address to modify
584 * @opcode: source of the copy
585 * @len: length to copy
587 * When you use this code to patch more than one byte of an instruction
588 * you need to make sure that other CPUs cannot execute this code in parallel.
589 * Also no thread must be currently preempted in the middle of these
590 * instructions. And on the local CPU you need to be protected against NMI or MCE
591 * handlers seeing an inconsistent instruction while you patch.
593 void *__init_or_module
text_poke_early(void *addr
, const void *opcode
,
597 local_irq_save(flags
);
598 memcpy(addr
, opcode
, len
);
600 local_irq_restore(flags
);
601 /* Could also do a CLFLUSH here to speed up CPU recovery; but
602 that causes hangs on some VIA CPUs. */
607 * text_poke - Update instructions on a live kernel
608 * @addr: address to modify
609 * @opcode: source of the copy
610 * @len: length to copy
612 * Only atomic text poke/set should be allowed when not doing early patching.
613 * It means the size must be writable atomically and the address must be aligned
614 * in a way that permits an atomic write. It also makes sure we fit on a single
617 * Note: Must be called under text_mutex.
619 void *__kprobes
text_poke(void *addr
, const void *opcode
, size_t len
)
623 struct page
*pages
[2];
626 if (!core_kernel_text((unsigned long)addr
)) {
627 pages
[0] = vmalloc_to_page(addr
);
628 pages
[1] = vmalloc_to_page(addr
+ PAGE_SIZE
);
630 pages
[0] = virt_to_page(addr
);
631 WARN_ON(!PageReserved(pages
[0]));
632 pages
[1] = virt_to_page(addr
+ PAGE_SIZE
);
635 local_irq_save(flags
);
636 set_fixmap(FIX_TEXT_POKE0
, page_to_phys(pages
[0]));
638 set_fixmap(FIX_TEXT_POKE1
, page_to_phys(pages
[1]));
639 vaddr
= (char *)fix_to_virt(FIX_TEXT_POKE0
);
640 memcpy(&vaddr
[(unsigned long)addr
& ~PAGE_MASK
], opcode
, len
);
641 clear_fixmap(FIX_TEXT_POKE0
);
643 clear_fixmap(FIX_TEXT_POKE1
);
646 /* Could also do a CLFLUSH here to speed up CPU recovery; but
647 that causes hangs on some VIA CPUs. */
648 for (i
= 0; i
< len
; i
++)
649 BUG_ON(((char *)addr
)[i
] != ((char *)opcode
)[i
]);
650 local_irq_restore(flags
);
655 * Cross-modifying kernel text with stop_machine().
656 * This code originally comes from immediate value.
658 static atomic_t stop_machine_first
;
659 static int wrote_text
;
661 struct text_poke_params
{
662 struct text_poke_param
*params
;
666 static int __kprobes
stop_machine_text_poke(void *data
)
668 struct text_poke_params
*tpp
= data
;
669 struct text_poke_param
*p
;
672 if (atomic_xchg(&stop_machine_first
, 0)) {
673 for (i
= 0; i
< tpp
->nparams
; i
++) {
675 text_poke(p
->addr
, p
->opcode
, p
->len
);
677 smp_wmb(); /* Make sure other cpus see that this has run */
682 smp_mb(); /* Load wrote_text before following execution */
685 for (i
= 0; i
< tpp
->nparams
; i
++) {
687 flush_icache_range((unsigned long)p
->addr
,
688 (unsigned long)p
->addr
+ p
->len
);
691 * Intel Architecture Software Developer's Manual section 7.1.3 specifies
692 * that a core serializing instruction such as "cpuid" should be
693 * executed on _each_ core before the new instruction is made visible.
700 * text_poke_smp - Update instructions on a live kernel on SMP
701 * @addr: address to modify
702 * @opcode: source of the copy
703 * @len: length to copy
705 * Modify multi-byte instruction by using stop_machine() on SMP. This allows
706 * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
707 * should be allowed, since stop_machine() does _not_ protect code against
710 * Note: Must be called under get_online_cpus() and text_mutex.
712 void *__kprobes
text_poke_smp(void *addr
, const void *opcode
, size_t len
)
714 struct text_poke_params tpp
;
715 struct text_poke_param p
;
722 atomic_set(&stop_machine_first
, 1);
724 /* Use __stop_machine() because the caller already got online_cpus. */
725 __stop_machine(stop_machine_text_poke
, (void *)&tpp
, cpu_online_mask
);
730 * text_poke_smp_batch - Update instructions on a live kernel on SMP
731 * @params: an array of text_poke parameters
732 * @n: the number of elements in params.
734 * Modify multi-byte instruction by using stop_machine() on SMP. Since the
735 * stop_machine() is a heavy task, it is better to aggregate text_poke requests
736 * and do it once if possible.
738 * Note: Must be called under get_online_cpus() and text_mutex.
740 void __kprobes
text_poke_smp_batch(struct text_poke_param
*params
, int n
)
742 struct text_poke_params tpp
= {.params
= params
, .nparams
= n
};
744 atomic_set(&stop_machine_first
, 1);
746 __stop_machine(stop_machine_text_poke
, (void *)&tpp
, cpu_online_mask
);