Linux 2.6.12-rc2: arch/x86_64/kernel/process.c
/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static atomic_t hlt_counter = ATOMIC_INIT(0);

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

void disable_hlt(void)
{
        atomic_inc(&hlt_counter);
}

EXPORT_SYMBOL(disable_hlt);

void enable_hlt(void)
{
        atomic_dec(&hlt_counter);
}

EXPORT_SYMBOL(enable_hlt);
/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
        if (!atomic_read(&hlt_counter)) {
                local_irq_disable();
                if (!need_resched())
                        safe_halt();
                else
                        local_irq_enable();
        }
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
        int oldval;

        local_irq_enable();

        /*
         * Deal with another CPU just having chosen a thread to
         * run here:
         */
        oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

        if (!oldval) {
                set_thread_flag(TIF_POLLING_NRFLAG);
                asm volatile(
                        "2:"
                        "testl %0,%1;"
                        "rep; nop;"
                        "je 2b;"
                        : :
                        "i" (_TIF_NEED_RESCHED),
                        "m" (current_thread_info()->flags));
        } else {
                set_need_resched();
        }
}
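/*
 * Wait until every online CPU has left its current pass through the idle
 * loop.  Each CPU's cpu_idle_state flag is set here and cleared by
 * cpu_idle(); we poll (sleeping a second at a time) until all flags have
 * been cleared.  Typically used after pm_idle has been changed, so that
 * no CPU is still executing the old idle routine.
 */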
void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
        } while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        void (*idle)(void);

                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;

                        rmb();
                        idle = pm_idle;
                        if (!idle)
                                idle = default_idle;
                        idle();
                }

                schedule();
        }
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
        local_irq_enable();

        if (!need_resched()) {
                set_thread_flag(TIF_POLLING_NRFLAG);
                do {
                        __monitor((void *)&current_thread_info()->flags, 0, 0);
                        if (need_resched())
                                break;
                        __mwait(0, 0);
                } while (!need_resched());
                clear_thread_flag(TIF_POLLING_NRFLAG);
        }
}

void __init select_idle_routine(const struct cpuinfo_x86 *c)
{
        static int printed;
        if (cpu_has(c, X86_FEATURE_MWAIT)) {
                /*
                 * Skip, if setup has overridden idle.
                 * One CPU supports mwait => All CPUs supports mwait
                 */
                if (!pm_idle) {
                        if (!printed) {
                                printk("using mwait in idle threads.\n");
                                printed = 1;
                        }
                        pm_idle = mwait_idle;
                }
        }
}

static int __init idle_setup (char *str)
{
        if (!strncmp(str, "poll", 4)) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        }

        boot_option_idle_override = 1;
        return 1;
}

__setup("idle=", idle_setup);
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("\n");
        print_modules();
        printk("Pid: %d, comm: %.20s %s %s\n",
               current->pid, current->comm, print_tainted(), system_utsname.release);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        asm("movq %%cr0, %0": "=r" (cr0));
        asm("movq %%cr2, %0": "=r" (cr2));
        asm("movq %%cr3, %0": "=r" (cr3));
        asm("movq %%cr4, %0": "=r" (cr4));

        printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
        printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}

void show_regs(struct pt_regs *regs)
{
        __show_regs(regs);
        show_trace(&regs->rsp);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        if (me->thread.io_bitmap_ptr) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
        }
}
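/*
 * Called when a thread starts executing a new program: clear the state
 * that must not leak into it - a pending 32/64-bit ABI switch, the debug
 * registers, the TLS slots and the FPU state.
 */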
void flush_thread(void)
{
        struct task_struct *tsk = current;
        struct thread_info *t = current_thread_info();

        if (t->flags & _TIF_ABI_PENDING)
                t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
         */
        clear_fpu(tsk);
        clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                               dead_task->comm,
                               dead_task->mm->context.ldt,
                               dead_task->mm->context.size);
                        BUG();
                }
        }
}
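/*
 * Helpers for the per-thread TLS descriptors in thread.tls_array:
 * set_32bit_tls() installs a 32-bit segment descriptor for a given base
 * address, read_32bit_tls() extracts the base back out of a descriptor.
 * Used by do_arch_prctl() below to keep small FS/GS bases in the GDT
 * rather than in the base MSRs.
 */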
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        struct desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        return desc->base0 |
                (((u32)desc->base1) << 16) |
                (((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}
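/*
 * Set up the child of a fork/clone: build its kernel-stack pt_regs, copy
 * the parent's segment and io-bitmap state, and handle CLONE_SETTLS.
 */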
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct * p, struct pt_regs * regs)
{
        int err;
        struct pt_regs * childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;

        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL) {
                childregs->rsp = (unsigned long)childregs;
        }

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp;

        set_ti_thread_flag(p->thread_info, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
        asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
        asm("movl %%es,%0" : "=m" (p->thread.es));
        asm("movl %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = ia32_child_tls(p, childregs);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 */
struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                             *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);

        unlazy_fpu(prev_p);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        tss->rsp0 = next->rsp0;

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        asm volatile("movl %%es,%0" : "=m" (prev->es));
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        asm volatile ("movl %%ds,%0" : "=m" (prev->ds));
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        load_TLS(next, cpu);

        /*
         * Switch FS and GS.
         */
        {
                unsigned fsindex;
                asm volatile("movl %%fs,%0" : "=r" (fsindex));
                /* segment register != 0 always requires a reload.
                   also reload when it has changed.
                   when prev process used 64bit base always reload
                   to avoid an information leak. */
                if (unlikely(fsindex | next->fsindex | prev->fs)) {
                        loadsegment(fs, next->fsindex);
                        /* check if the user used a selector != 0
                         * if yes clear 64bit base, since overloaded base
                         * is always mapped to the Null selector
                         */
                        if (fsindex)
                                prev->fs = 0;
                }
                /* when next process has a 64bit base use it */
                if (next->fs)
                        wrmsrl(MSR_FS_BASE, next->fs);
                prev->fsindex = fsindex;
        }
        {
                unsigned gsindex;
                asm volatile("movl %%gs,%0" : "=r" (gsindex));
                if (unlikely(gsindex | next->gsindex | prev->gs)) {
                        load_gs_index(next->gsindex);
                        if (gsindex)
                                prev->gs = 0;
                }
                if (next->gs)
                        wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
                prev->gsindex = gsindex;
        }

        /*
         * Switch the PDA context.
         */
        prev->userrsp = read_pda(oldrsp);
        write_pda(oldrsp, next->userrsp);
        write_pda(pcurrent, next_p);
        write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);

        /*
         * Now maybe reload the debug registers
         */
        if (unlikely(next->debugreg7)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }

        /*
         * Handle the IO bitmap
         */
        if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
                if (next->io_bitmap_ptr)
                        /*
                         * Copy the relevant range of the IO bitmap.
                         * Normally this is 128 bytes or less:
                         */
                        memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                               max(prev->io_bitmap_max, next->io_bitmap_max));
                else {
                        /*
                         * Clear any possible leftover bits:
                         */
                        memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
                }
        }

        return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char * filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32bit childs are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp,
                          void __user *parent_tid, void __user *child_tid,
                          struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                       NULL, NULL);
}
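/*
 * Walk the sleeping task's saved frame pointers and return the first
 * return address that lies outside the scheduler, i.e. the place the
 * task is waiting in.
 */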
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)p->thread_info;
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp+8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}
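/*
 * Backend for arch_prctl(): set or read a task's FS/GS base.  Bases that
 * fit in 32 bits are held in a GDT TLS slot because that is faster to
 * switch; larger bases are written to the FS/GS base MSRs directly.
 */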
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE)
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE)
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit) {
                        rdmsrl(MSR_FS_BASE, base);
                } else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        rdmsrl(MSR_KERNEL_GS_BASE, base);
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = (struct pt_regs *)(tsk->thread.rsp0);
        --pp;

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}
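/*
 * Randomize the top of the stack by up to 8k when va-space randomization
 * is enabled, and keep it 16-byte aligned.
 */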
unsigned long arch_align_stack(unsigned long sp)
{
        if (randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}