2 * Copyright (C) 1995 Linus Torvalds
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
10 * CPU hotplug support - ashok.raj@intel.com
14 * This file handles the architecture-dependent parts of process handling.
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
23 #include <linux/kernel.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/utsname.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/random.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39 #include <linux/prctl.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
45 #include <asm/processor.h>
47 #include <asm/mmu_context.h>
49 #include <asm/prctl.h>
51 #include <asm/proto.h>
55 asmlinkage
extern void ret_from_fork(void);
57 unsigned long kernel_thread_flags
= CLONE_VM
| CLONE_UNTRACED
;
59 unsigned long boot_option_idle_override
= 0;
60 EXPORT_SYMBOL(boot_option_idle_override
);
63 * Power management idle function, if any.
65 void (*pm_idle
)(void);
66 EXPORT_SYMBOL(pm_idle
);
68 static ATOMIC_NOTIFIER_HEAD(idle_notifier
);
70 void idle_notifier_register(struct notifier_block
*n
)
72 atomic_notifier_chain_register(&idle_notifier
, n
);
78 atomic_notifier_call_chain(&idle_notifier
, IDLE_START
, NULL
);
81 static void __exit_idle(void)
83 if (test_and_clear_bit_pda(0, isidle
) == 0)
85 atomic_notifier_call_chain(&idle_notifier
, IDLE_END
, NULL
);
88 /* Called from interrupts to signify idle end */
91 /* idle loop has pid 0 */
98 * We use this if we don't have any better
101 void default_idle(void)
103 current_thread_info()->status
&= ~TS_POLLING
;
105 * TS_POLLING-cleared state must be visible before we
110 safe_halt(); /* enables interrupts racelessly */
113 current_thread_info()->status
|= TS_POLLING
;
116 #ifdef CONFIG_HOTPLUG_CPU
117 DECLARE_PER_CPU(int, cpu_state
);
120 /* We halt the CPU with physical CPU hotplug */
121 static inline void play_dead(void)
127 __get_cpu_var(cpu_state
) = CPU_DEAD
;
134 static inline void play_dead(void)
138 #endif /* CONFIG_HOTPLUG_CPU */
141 * The idle thread. There's no useful work to be
142 * done, so just try to conserve power and have a
143 * low exit latency (ie sit in a loop waiting for
144 * somebody to say that they'd like to reschedule)
148 current_thread_info()->status
|= TS_POLLING
;
149 /* endless idle loop with no priority at all */
151 tick_nohz_stop_sched_tick();
152 while (!need_resched()) {
159 if (cpu_is_offline(smp_processor_id()))
162 * Idle routines should keep interrupts disabled
163 * from here on, until they go to idle.
164 * Otherwise, idle callbacks can misfire.
169 /* In many cases the interrupt that ended idle
170 has already called exit_idle. But some idle
171 loops can be woken up without interrupt. */
175 tick_nohz_restart_sched_tick();
176 preempt_enable_no_resched();
182 /* Prints also some state that isn't saved in the pt_regs */
183 void __show_regs(struct pt_regs
* regs
)
185 unsigned long cr0
= 0L, cr2
= 0L, cr3
= 0L, cr4
= 0L, fs
, gs
, shadowgs
;
186 unsigned long d0
, d1
, d2
, d3
, d6
, d7
;
187 unsigned int fsindex
, gsindex
;
188 unsigned int ds
, cs
, es
;
192 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
193 current
->pid
, current
->comm
, print_tainted(),
194 init_utsname()->release
,
195 (int)strcspn(init_utsname()->version
, " "),
196 init_utsname()->version
);
197 printk("RIP: %04lx:[<%016lx>] ", regs
->cs
& 0xffff, regs
->ip
);
198 printk_address(regs
->ip
, 1);
199 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs
->ss
, regs
->sp
,
201 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
202 regs
->ax
, regs
->bx
, regs
->cx
);
203 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
204 regs
->dx
, regs
->si
, regs
->di
);
205 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
206 regs
->bp
, regs
->r8
, regs
->r9
);
207 printk("R10: %016lx R11: %016lx R12: %016lx\n",
208 regs
->r10
, regs
->r11
, regs
->r12
);
209 printk("R13: %016lx R14: %016lx R15: %016lx\n",
210 regs
->r13
, regs
->r14
, regs
->r15
);
212 asm("movl %%ds,%0" : "=r" (ds
));
213 asm("movl %%cs,%0" : "=r" (cs
));
214 asm("movl %%es,%0" : "=r" (es
));
215 asm("movl %%fs,%0" : "=r" (fsindex
));
216 asm("movl %%gs,%0" : "=r" (gsindex
));
218 rdmsrl(MSR_FS_BASE
, fs
);
219 rdmsrl(MSR_GS_BASE
, gs
);
220 rdmsrl(MSR_KERNEL_GS_BASE
, shadowgs
);
227 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
228 fs
,fsindex
,gs
,gsindex
,shadowgs
);
229 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs
, ds
, es
, cr0
);
230 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2
, cr3
, cr4
);
235 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0
, d1
, d2
);
239 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3
, d6
, d7
);
242 void show_regs(struct pt_regs
*regs
)
244 printk("CPU %d:", smp_processor_id());
246 show_trace(NULL
, regs
, (void *)(regs
+ 1), regs
->bp
);
250 * Free current thread data structures etc..
252 void exit_thread(void)
254 struct task_struct
*me
= current
;
255 struct thread_struct
*t
= &me
->thread
;
257 if (me
->thread
.io_bitmap_ptr
) {
258 struct tss_struct
*tss
= &per_cpu(init_tss
, get_cpu());
260 kfree(t
->io_bitmap_ptr
);
261 t
->io_bitmap_ptr
= NULL
;
262 clear_thread_flag(TIF_IO_BITMAP
);
264 * Careful, clear this in the TSS too:
266 memset(tss
->io_bitmap
, 0xff, t
->io_bitmap_max
);
267 t
->io_bitmap_max
= 0;
272 void flush_thread(void)
274 struct task_struct
*tsk
= current
;
276 if (test_tsk_thread_flag(tsk
, TIF_ABI_PENDING
)) {
277 clear_tsk_thread_flag(tsk
, TIF_ABI_PENDING
);
278 if (test_tsk_thread_flag(tsk
, TIF_IA32
)) {
279 clear_tsk_thread_flag(tsk
, TIF_IA32
);
281 set_tsk_thread_flag(tsk
, TIF_IA32
);
282 current_thread_info()->status
|= TS_COMPAT
;
285 clear_tsk_thread_flag(tsk
, TIF_DEBUG
);
287 tsk
->thread
.debugreg0
= 0;
288 tsk
->thread
.debugreg1
= 0;
289 tsk
->thread
.debugreg2
= 0;
290 tsk
->thread
.debugreg3
= 0;
291 tsk
->thread
.debugreg6
= 0;
292 tsk
->thread
.debugreg7
= 0;
293 memset(tsk
->thread
.tls_array
, 0, sizeof(tsk
->thread
.tls_array
));
295 * Forget coprocessor state..
297 tsk
->fpu_counter
= 0;
302 void release_thread(struct task_struct
*dead_task
)
305 if (dead_task
->mm
->context
.size
) {
306 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
308 dead_task
->mm
->context
.ldt
,
309 dead_task
->mm
->context
.size
);
315 static inline void set_32bit_tls(struct task_struct
*t
, int tls
, u32 addr
)
317 struct user_desc ud
= {
324 struct desc_struct
*desc
= t
->thread
.tls_array
;
329 static inline u32
read_32bit_tls(struct task_struct
*t
, int tls
)
331 return get_desc_base(&t
->thread
.tls_array
[tls
]);
335 * This gets called before we allocate a new thread and copy
336 * the current task into it.
338 void prepare_to_copy(struct task_struct
*tsk
)
343 int copy_thread(int nr
, unsigned long clone_flags
, unsigned long sp
,
344 unsigned long unused
,
345 struct task_struct
* p
, struct pt_regs
* regs
)
348 struct pt_regs
* childregs
;
349 struct task_struct
*me
= current
;
351 childregs
= ((struct pt_regs
*)
352 (THREAD_SIZE
+ task_stack_page(p
))) - 1;
358 childregs
->sp
= (unsigned long)childregs
;
360 p
->thread
.sp
= (unsigned long) childregs
;
361 p
->thread
.sp0
= (unsigned long) (childregs
+1);
362 p
->thread
.usersp
= me
->thread
.usersp
;
364 set_tsk_thread_flag(p
, TIF_FORK
);
366 p
->thread
.fs
= me
->thread
.fs
;
367 p
->thread
.gs
= me
->thread
.gs
;
369 asm("mov %%gs,%0" : "=m" (p
->thread
.gsindex
));
370 asm("mov %%fs,%0" : "=m" (p
->thread
.fsindex
));
371 asm("mov %%es,%0" : "=m" (p
->thread
.es
));
372 asm("mov %%ds,%0" : "=m" (p
->thread
.ds
));
374 if (unlikely(test_tsk_thread_flag(me
, TIF_IO_BITMAP
))) {
375 p
->thread
.io_bitmap_ptr
= kmalloc(IO_BITMAP_BYTES
, GFP_KERNEL
);
376 if (!p
->thread
.io_bitmap_ptr
) {
377 p
->thread
.io_bitmap_max
= 0;
380 memcpy(p
->thread
.io_bitmap_ptr
, me
->thread
.io_bitmap_ptr
,
382 set_tsk_thread_flag(p
, TIF_IO_BITMAP
);
386 * Set a new TLS for the child thread?
388 if (clone_flags
& CLONE_SETTLS
) {
389 #ifdef CONFIG_IA32_EMULATION
390 if (test_thread_flag(TIF_IA32
))
391 err
= do_set_thread_area(p
, -1,
392 (struct user_desc __user
*)childregs
->si
, 0);
395 err
= do_arch_prctl(p
, ARCH_SET_FS
, childregs
->r8
);
401 if (err
&& p
->thread
.io_bitmap_ptr
) {
402 kfree(p
->thread
.io_bitmap_ptr
);
403 p
->thread
.io_bitmap_max
= 0;
409 start_thread(struct pt_regs
*regs
, unsigned long new_ip
, unsigned long new_sp
)
411 asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
415 write_pda(oldrsp
, new_sp
);
416 regs
->cs
= __USER_CS
;
417 regs
->ss
= __USER_DS
;
421 * Free the old FP and other extended state
423 free_thread_xstate(current
);
425 EXPORT_SYMBOL_GPL(start_thread
);
427 static void hard_disable_TSC(void)
429 write_cr4(read_cr4() | X86_CR4_TSD
);
432 void disable_TSC(void)
435 if (!test_and_set_thread_flag(TIF_NOTSC
))
437 * Must flip the CPU state synchronously with
438 * TIF_NOTSC in the current running context.
444 static void hard_enable_TSC(void)
446 write_cr4(read_cr4() & ~X86_CR4_TSD
);
449 static void enable_TSC(void)
452 if (test_and_clear_thread_flag(TIF_NOTSC
))
454 * Must flip the CPU state synchronously with
455 * TIF_NOTSC in the current running context.
461 int get_tsc_mode(unsigned long adr
)
465 if (test_thread_flag(TIF_NOTSC
))
466 val
= PR_TSC_SIGSEGV
;
470 return put_user(val
, (unsigned int __user
*)adr
);
473 int set_tsc_mode(unsigned int val
)
475 if (val
== PR_TSC_SIGSEGV
)
477 else if (val
== PR_TSC_ENABLE
)
/*
 * This special macro can be used to load a debugging register: it hands the
 * debugreg<r> field of @thread to set_debugreg() for register number r.
 * @thread is parenthesized so that any pointer expression (for example
 * "p + 1" or "&tsk->thread") can be passed safely.
 */
#define loaddebug(thread, r) set_debugreg((thread)->debugreg ## r, r)
490 static inline void __switch_to_xtra(struct task_struct
*prev_p
,
491 struct task_struct
*next_p
,
492 struct tss_struct
*tss
)
494 struct thread_struct
*prev
, *next
;
495 unsigned long debugctl
;
497 prev
= &prev_p
->thread
,
498 next
= &next_p
->thread
;
500 debugctl
= prev
->debugctlmsr
;
501 if (next
->ds_area_msr
!= prev
->ds_area_msr
) {
502 /* we clear debugctl to make sure DS
503 * is not in use when we change it */
505 update_debugctlmsr(0);
506 wrmsrl(MSR_IA32_DS_AREA
, next
->ds_area_msr
);
509 if (next
->debugctlmsr
!= debugctl
)
510 update_debugctlmsr(next
->debugctlmsr
);
512 if (test_tsk_thread_flag(next_p
, TIF_DEBUG
)) {
522 if (test_tsk_thread_flag(prev_p
, TIF_NOTSC
) ^
523 test_tsk_thread_flag(next_p
, TIF_NOTSC
)) {
524 /* prev and next are different */
525 if (test_tsk_thread_flag(next_p
, TIF_NOTSC
))
531 if (test_tsk_thread_flag(next_p
, TIF_IO_BITMAP
)) {
533 * Copy the relevant range of the IO bitmap.
534 * Normally this is 128 bytes or less:
536 memcpy(tss
->io_bitmap
, next
->io_bitmap_ptr
,
537 max(prev
->io_bitmap_max
, next
->io_bitmap_max
));
538 } else if (test_tsk_thread_flag(prev_p
, TIF_IO_BITMAP
)) {
540 * Clear any possible leftover bits:
542 memset(tss
->io_bitmap
, 0xff, prev
->io_bitmap_max
);
546 if (test_tsk_thread_flag(prev_p
, TIF_BTS_TRACE_TS
))
547 ptrace_bts_take_timestamp(prev_p
, BTS_TASK_DEPARTS
);
549 if (test_tsk_thread_flag(next_p
, TIF_BTS_TRACE_TS
))
550 ptrace_bts_take_timestamp(next_p
, BTS_TASK_ARRIVES
);
555 * switch_to(x,y) should switch tasks from x to y.
557 * This could still be optimized:
558 * - fold all the options into a flag word and test it with a single test.
559 * - could test fs/gs bitsliced
561 * Kprobes not supported here. Set the probe on schedule instead.
564 __switch_to(struct task_struct
*prev_p
, struct task_struct
*next_p
)
566 struct thread_struct
*prev
= &prev_p
->thread
,
567 *next
= &next_p
->thread
;
568 int cpu
= smp_processor_id();
569 struct tss_struct
*tss
= &per_cpu(init_tss
, cpu
);
571 /* we're going to use this soon, after a few expensive things */
572 if (next_p
->fpu_counter
>5)
573 prefetch(next
->xstate
);
576 * Reload esp0, LDT and the page table pointer:
582 * This won't pick up thread selector changes, but I guess that is ok.
584 asm volatile("mov %%es,%0" : "=m" (prev
->es
));
585 if (unlikely(next
->es
| prev
->es
))
586 loadsegment(es
, next
->es
);
588 asm volatile ("mov %%ds,%0" : "=m" (prev
->ds
));
589 if (unlikely(next
->ds
| prev
->ds
))
590 loadsegment(ds
, next
->ds
);
599 asm volatile("movl %%fs,%0" : "=r" (fsindex
));
600 /* segment register != 0 always requires a reload.
601 also reload when it has changed.
602 when prev process used 64bit base always reload
603 to avoid an information leak. */
604 if (unlikely(fsindex
| next
->fsindex
| prev
->fs
)) {
605 loadsegment(fs
, next
->fsindex
);
606 /* check if the user used a selector != 0
607 * if yes clear 64bit base, since overloaded base
608 * is always mapped to the Null selector
613 /* when next process has a 64bit base use it */
615 wrmsrl(MSR_FS_BASE
, next
->fs
);
616 prev
->fsindex
= fsindex
;
620 asm volatile("movl %%gs,%0" : "=r" (gsindex
));
621 if (unlikely(gsindex
| next
->gsindex
| prev
->gs
)) {
622 load_gs_index(next
->gsindex
);
627 wrmsrl(MSR_KERNEL_GS_BASE
, next
->gs
);
628 prev
->gsindex
= gsindex
;
631 /* Must be after DS reload */
635 * Switch the PDA and FPU contexts.
637 prev
->usersp
= read_pda(oldrsp
);
638 write_pda(oldrsp
, next
->usersp
);
639 write_pda(pcurrent
, next_p
);
641 write_pda(kernelstack
,
642 (unsigned long)task_stack_page(next_p
) + THREAD_SIZE
- PDA_STACKOFFSET
);
643 #ifdef CONFIG_CC_STACKPROTECTOR
644 write_pda(stack_canary
, next_p
->stack_canary
);
646 * Build time only check to make sure the stack_canary is at
647 * offset 40 in the pda; this is a gcc ABI requirement
649 BUILD_BUG_ON(offsetof(struct x8664_pda
, stack_canary
) != 40);
653 * Now maybe reload the debug registers and handle I/O bitmaps
655 if (unlikely(task_thread_info(next_p
)->flags
& _TIF_WORK_CTXSW_NEXT
||
656 task_thread_info(prev_p
)->flags
& _TIF_WORK_CTXSW_PREV
))
657 __switch_to_xtra(prev_p
, next_p
, tss
);
659 /* If the task has used fpu the last 5 timeslices, just do a full
660 * restore of the math state immediately to avoid the trap; the
661 * chances of needing FPU soon are obviously high now
663 * tsk_used_math() checks prevent calling math_state_restore(),
664 * which can sleep in the case of !tsk_used_math()
666 if (tsk_used_math(next_p
) && next_p
->fpu_counter
> 5)
667 math_state_restore();
672 * sys_execve() executes a new program.
675 long sys_execve(char __user
*name
, char __user
* __user
*argv
,
676 char __user
* __user
*envp
, struct pt_regs
*regs
)
681 filename
= getname(name
);
682 error
= PTR_ERR(filename
);
683 if (IS_ERR(filename
))
685 error
= do_execve(filename
, argv
, envp
, regs
);
690 void set_personality_64bit(void)
692 /* inherit personality from parent */
694 /* Make sure to be in 64bit mode */
695 clear_thread_flag(TIF_IA32
);
697 /* TBD: overwrites user setup. Should have two bits.
698 But 64bit processes have always behaved this way,
699 so it's not too bad. The main problem is just that
700 32bit childs are affected again. */
701 current
->personality
&= ~READ_IMPLIES_EXEC
;
704 asmlinkage
long sys_fork(struct pt_regs
*regs
)
706 return do_fork(SIGCHLD
, regs
->sp
, regs
, 0, NULL
, NULL
);
710 sys_clone(unsigned long clone_flags
, unsigned long newsp
,
711 void __user
*parent_tid
, void __user
*child_tid
, struct pt_regs
*regs
)
715 return do_fork(clone_flags
, newsp
, regs
, 0, parent_tid
, child_tid
);
719 * This is trivial, and on the face of it looks like it
720 * could equally well be done in user mode.
722 * Not so, for quite unobvious reasons - register pressure.
723 * In user mode vfork() cannot have a stack frame, and if
724 * done by calling the "clone()" system call directly, you
725 * do not have enough call-clobbered registers to hold all
726 * the information you need.
728 asmlinkage
long sys_vfork(struct pt_regs
*regs
)
730 return do_fork(CLONE_VFORK
| CLONE_VM
| SIGCHLD
, regs
->sp
, regs
, 0,
734 unsigned long get_wchan(struct task_struct
*p
)
740 if (!p
|| p
== current
|| p
->state
==TASK_RUNNING
)
742 stack
= (unsigned long)task_stack_page(p
);
743 if (p
->thread
.sp
< stack
|| p
->thread
.sp
> stack
+THREAD_SIZE
)
745 fp
= *(u64
*)(p
->thread
.sp
);
747 if (fp
< (unsigned long)stack
||
748 fp
> (unsigned long)stack
+THREAD_SIZE
)
751 if (!in_sched_functions(ip
))
754 } while (count
++ < 16);
758 long do_arch_prctl(struct task_struct
*task
, int code
, unsigned long addr
)
761 int doit
= task
== current
;
766 if (addr
>= TASK_SIZE_OF(task
))
769 /* handle small bases via the GDT because that's faster to
771 if (addr
<= 0xffffffff) {
772 set_32bit_tls(task
, GS_TLS
, addr
);
774 load_TLS(&task
->thread
, cpu
);
775 load_gs_index(GS_TLS_SEL
);
777 task
->thread
.gsindex
= GS_TLS_SEL
;
780 task
->thread
.gsindex
= 0;
781 task
->thread
.gs
= addr
;
784 ret
= checking_wrmsrl(MSR_KERNEL_GS_BASE
, addr
);
790 /* Not strictly needed for fs, but do it for symmetry
792 if (addr
>= TASK_SIZE_OF(task
))
795 /* handle small bases via the GDT because that's faster to
797 if (addr
<= 0xffffffff) {
798 set_32bit_tls(task
, FS_TLS
, addr
);
800 load_TLS(&task
->thread
, cpu
);
801 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL
));
803 task
->thread
.fsindex
= FS_TLS_SEL
;
806 task
->thread
.fsindex
= 0;
807 task
->thread
.fs
= addr
;
809 /* set the selector to 0 to not confuse
811 asm volatile("movl %0,%%fs" :: "r" (0));
812 ret
= checking_wrmsrl(MSR_FS_BASE
, addr
);
819 if (task
->thread
.fsindex
== FS_TLS_SEL
)
820 base
= read_32bit_tls(task
, FS_TLS
);
822 rdmsrl(MSR_FS_BASE
, base
);
824 base
= task
->thread
.fs
;
825 ret
= put_user(base
, (unsigned long __user
*)addr
);
831 if (task
->thread
.gsindex
== GS_TLS_SEL
)
832 base
= read_32bit_tls(task
, GS_TLS
);
834 asm("movl %%gs,%0" : "=r" (gsindex
));
836 rdmsrl(MSR_KERNEL_GS_BASE
, base
);
838 base
= task
->thread
.gs
;
841 base
= task
->thread
.gs
;
842 ret
= put_user(base
, (unsigned long __user
*)addr
);
854 long sys_arch_prctl(int code
, unsigned long addr
)
856 return do_arch_prctl(current
, code
, addr
);
859 unsigned long arch_align_stack(unsigned long sp
)
861 if (!(current
->personality
& ADDR_NO_RANDOMIZE
) && randomize_va_space
)
862 sp
-= get_random_int() % 8192;
866 unsigned long arch_randomize_brk(struct mm_struct
*mm
)
868 unsigned long range_end
= mm
->brk
+ 0x02000000;
869 return randomize_range(mm
->brk
, range_end
, 0) ? : mm
->brk
;