2 * linux/arch/i386/kernel/process.c
4 * Copyright (C) 1995 Linus Torvalds
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
11 * This file handles the architecture-dependent parts of process handling..
16 #include <linux/errno.h>
17 #include <linux/sched.h>
19 #include <linux/kernel.h>
21 #include <linux/elfcore.h>
22 #include <linux/smp.h>
23 #include <linux/smp_lock.h>
24 #include <linux/stddef.h>
25 #include <linux/slab.h>
26 #include <linux/vmalloc.h>
27 #include <linux/user.h>
28 #include <linux/a.out.h>
29 #include <linux/interrupt.h>
30 #include <linux/config.h>
31 #include <linux/utsname.h>
32 #include <linux/delay.h>
33 #include <linux/reboot.h>
34 #include <linux/init.h>
35 #include <linux/mc146818rtc.h>
36 #include <linux/module.h>
37 #include <linux/kallsyms.h>
38 #include <linux/ptrace.h>
39 #include <linux/random.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
46 #include <asm/processor.h>
50 #ifdef CONFIG_MATH_EMULATION
51 #include <asm/math_emu.h>
54 #include <linux/irq.h>
55 #include <linux/err.h>
57 asmlinkage
void ret_from_fork(void) __asm__("ret_from_fork");
59 static int hlt_counter
;
61 unsigned long boot_option_idle_override
= 0;
62 EXPORT_SYMBOL(boot_option_idle_override
);
65 * Return saved PC of a blocked thread.
67 unsigned long thread_saved_pc(struct task_struct
*tsk
)
69 return ((unsigned long *)tsk
->thread
.esp
)[3];
73 * Powermanagement idle function, if any..
75 void (*pm_idle
)(void);
76 static DEFINE_PER_CPU(unsigned int, cpu_idle_state
);
78 void disable_hlt(void)
83 EXPORT_SYMBOL(disable_hlt
);
90 EXPORT_SYMBOL(enable_hlt
);
93 * We use this if we don't have any better
96 void default_idle(void)
98 if (!hlt_counter
&& boot_cpu_data
.hlt_works_ok
) {
110 * On SMP it's slightly faster (but much more power-consuming!)
111 * to poll the ->work.need_resched flag instead of waiting for the
112 * cross-CPU IPI to arrive. Use this option with caution.
114 static void poll_idle (void)
121 * Deal with another CPU just having chosen a thread to
124 oldval
= test_and_clear_thread_flag(TIF_NEED_RESCHED
);
127 set_thread_flag(TIF_POLLING_NRFLAG
);
133 : : "i"(_TIF_NEED_RESCHED
), "m" (current_thread_info()->flags
));
135 clear_thread_flag(TIF_POLLING_NRFLAG
);
142 * The idle thread. There's no useful work to be
143 * done, so just try to conserve power and have a
144 * low exit latency (ie sit in a loop waiting for
145 * somebody to say that they'd like to reschedule)
149 /* endless idle loop with no priority at all */
151 while (!need_resched()) {
154 if (__get_cpu_var(cpu_idle_state
))
155 __get_cpu_var(cpu_idle_state
) = 0;
163 __get_cpu_var(irq_stat
).idle_timestamp
= jiffies
;
170 void cpu_idle_wait(void)
172 unsigned int cpu
, this_cpu
= get_cpu();
175 set_cpus_allowed(current
, cpumask_of_cpu(this_cpu
));
179 for_each_online_cpu(cpu
) {
180 per_cpu(cpu_idle_state
, cpu
) = 1;
184 __get_cpu_var(cpu_idle_state
) = 0;
189 for_each_online_cpu(cpu
) {
190 if (cpu_isset(cpu
, map
) && !per_cpu(cpu_idle_state
, cpu
))
193 cpus_and(map
, map
, cpu_online_map
);
194 } while (!cpus_empty(map
));
196 EXPORT_SYMBOL_GPL(cpu_idle_wait
);
199 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
200 * which can obviate IPI to trigger checking of need_resched.
201 * We execute MONITOR against need_resched and enter optimized wait state
202 * through MWAIT. Whenever someone changes need_resched, we would be woken
203 * up from MWAIT (without an IPI).
205 static void mwait_idle(void)
209 if (!need_resched()) {
210 set_thread_flag(TIF_POLLING_NRFLAG
);
212 __monitor((void *)¤t_thread_info()->flags
, 0, 0);
216 } while (!need_resched());
217 clear_thread_flag(TIF_POLLING_NRFLAG
);
221 void __init
select_idle_routine(const struct cpuinfo_x86
*c
)
223 if (cpu_has(c
, X86_FEATURE_MWAIT
)) {
224 printk("monitor/mwait feature present.\n");
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => All CPUs support mwait
		 */
230 printk("using mwait in idle threads.\n");
231 pm_idle
= mwait_idle
;
236 static int __init
idle_setup (char *str
)
238 if (!strncmp(str
, "poll", 4)) {
239 printk("using polling idle threads.\n");
241 #ifdef CONFIG_X86_SMP
242 if (smp_num_siblings
> 1)
243 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
245 } else if (!strncmp(str
, "halt", 4)) {
246 printk("using halt in idle threads.\n");
247 pm_idle
= default_idle
;
250 boot_option_idle_override
= 1;
254 __setup("idle=", idle_setup
);
256 void show_regs(struct pt_regs
* regs
)
258 unsigned long cr0
= 0L, cr2
= 0L, cr3
= 0L, cr4
= 0L;
261 printk("Pid: %d, comm: %20s\n", current
->pid
, current
->comm
);
262 printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs
->xcs
,regs
->eip
, smp_processor_id());
263 print_symbol("EIP is at %s\n", regs
->eip
);
266 printk(" ESP: %04x:%08lx",0xffff & regs
->xss
,regs
->esp
);
267 printk(" EFLAGS: %08lx %s (%s)\n",
268 regs
->eflags
, print_tainted(), system_utsname
.release
);
269 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
270 regs
->eax
,regs
->ebx
,regs
->ecx
,regs
->edx
);
271 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
272 regs
->esi
, regs
->edi
, regs
->ebp
);
273 printk(" DS: %04x ES: %04x\n",
274 0xffff & regs
->xds
,0xffff & regs
->xes
);
276 __asm__("movl %%cr0, %0": "=r" (cr0
));
277 __asm__("movl %%cr2, %0": "=r" (cr2
));
278 __asm__("movl %%cr3, %0": "=r" (cr3
));
279 /* This could fault if %cr4 does not exist */
280 __asm__("1: movl %%cr4, %0 \n"
282 ".section __ex_table,\"a\" \n"
285 : "=r" (cr4
): "0" (0));
286 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0
, cr2
, cr3
, cr4
);
287 show_trace(NULL
, ®s
->esp
);
291 * This gets run with %ebx containing the
292 * function to call, and %edx containing
295 extern void kernel_thread_helper(void);
296 __asm__(".section .text\n"
298 "kernel_thread_helper:\n\t"
307 * Create a kernel thread
309 int kernel_thread(int (*fn
)(void *), void * arg
, unsigned long flags
)
313 memset(®s
, 0, sizeof(regs
));
315 regs
.ebx
= (unsigned long) fn
;
316 regs
.edx
= (unsigned long) arg
;
318 regs
.xds
= __USER_DS
;
319 regs
.xes
= __USER_DS
;
321 regs
.eip
= (unsigned long) kernel_thread_helper
;
322 regs
.xcs
= __KERNEL_CS
;
323 regs
.eflags
= X86_EFLAGS_IF
| X86_EFLAGS_SF
| X86_EFLAGS_PF
| 0x2;
325 /* Ok, create the new process.. */
326 return do_fork(flags
| CLONE_VM
| CLONE_UNTRACED
, 0, ®s
, 0, NULL
, NULL
);
330 * Free current thread data structures etc..
332 void exit_thread(void)
334 struct task_struct
*tsk
= current
;
335 struct thread_struct
*t
= &tsk
->thread
;
337 /* The process may have allocated an io port bitmap... nuke it. */
338 if (unlikely(NULL
!= t
->io_bitmap_ptr
)) {
340 struct tss_struct
*tss
= &per_cpu(init_tss
, cpu
);
342 kfree(t
->io_bitmap_ptr
);
343 t
->io_bitmap_ptr
= NULL
;
345 * Careful, clear this in the TSS too:
347 memset(tss
->io_bitmap
, 0xff, tss
->io_bitmap_max
);
348 t
->io_bitmap_max
= 0;
349 tss
->io_bitmap_owner
= NULL
;
350 tss
->io_bitmap_max
= 0;
351 tss
->io_bitmap_base
= INVALID_IO_BITMAP_OFFSET
;
356 void flush_thread(void)
358 struct task_struct
*tsk
= current
;
360 memset(tsk
->thread
.debugreg
, 0, sizeof(unsigned long)*8);
361 memset(tsk
->thread
.tls_array
, 0, sizeof(tsk
->thread
.tls_array
));
363 * Forget coprocessor state..
369 void release_thread(struct task_struct
*dead_task
)
372 // temporary debugging check
373 if (dead_task
->mm
->context
.size
) {
374 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
376 dead_task
->mm
->context
.ldt
,
377 dead_task
->mm
->context
.size
);
382 release_vm86_irqs(dead_task
);
386 * This gets called before we allocate a new thread and copy
387 * the current task into it.
389 void prepare_to_copy(struct task_struct
*tsk
)
394 int copy_thread(int nr
, unsigned long clone_flags
, unsigned long esp
,
395 unsigned long unused
,
396 struct task_struct
* p
, struct pt_regs
* regs
)
398 struct pt_regs
* childregs
;
399 struct task_struct
*tsk
;
402 childregs
= ((struct pt_regs
*) (THREAD_SIZE
+ (unsigned long) p
->thread_info
)) - 1;
405 childregs
->esp
= esp
;
407 p
->thread
.esp
= (unsigned long) childregs
;
408 p
->thread
.esp0
= (unsigned long) (childregs
+1);
410 p
->thread
.eip
= (unsigned long) ret_from_fork
;
412 savesegment(fs
,p
->thread
.fs
);
413 savesegment(gs
,p
->thread
.gs
);
416 if (unlikely(NULL
!= tsk
->thread
.io_bitmap_ptr
)) {
417 p
->thread
.io_bitmap_ptr
= kmalloc(IO_BITMAP_BYTES
, GFP_KERNEL
);
418 if (!p
->thread
.io_bitmap_ptr
) {
419 p
->thread
.io_bitmap_max
= 0;
422 memcpy(p
->thread
.io_bitmap_ptr
, tsk
->thread
.io_bitmap_ptr
,
427 * Set a new TLS for the child thread?
429 if (clone_flags
& CLONE_SETTLS
) {
430 struct desc_struct
*desc
;
431 struct user_desc info
;
435 if (copy_from_user(&info
, (void __user
*)childregs
->esi
, sizeof(info
)))
438 if (LDT_empty(&info
))
441 idx
= info
.entry_number
;
442 if (idx
< GDT_ENTRY_TLS_MIN
|| idx
> GDT_ENTRY_TLS_MAX
)
445 desc
= p
->thread
.tls_array
+ idx
- GDT_ENTRY_TLS_MIN
;
446 desc
->a
= LDT_entry_a(&info
);
447 desc
->b
= LDT_entry_b(&info
);
452 if (err
&& p
->thread
.io_bitmap_ptr
) {
453 kfree(p
->thread
.io_bitmap_ptr
);
454 p
->thread
.io_bitmap_max
= 0;
460 * fill in the user structure for a core dump..
462 void dump_thread(struct pt_regs
* regs
, struct user
* dump
)
466 /* changed the size calculations - should hopefully work better. lbt */
467 dump
->magic
= CMAGIC
;
468 dump
->start_code
= 0;
469 dump
->start_stack
= regs
->esp
& ~(PAGE_SIZE
- 1);
470 dump
->u_tsize
= ((unsigned long) current
->mm
->end_code
) >> PAGE_SHIFT
;
471 dump
->u_dsize
= ((unsigned long) (current
->mm
->brk
+ (PAGE_SIZE
-1))) >> PAGE_SHIFT
;
472 dump
->u_dsize
-= dump
->u_tsize
;
474 for (i
= 0; i
< 8; i
++)
475 dump
->u_debugreg
[i
] = current
->thread
.debugreg
[i
];
477 if (dump
->start_stack
< TASK_SIZE
)
478 dump
->u_ssize
= ((unsigned long) (TASK_SIZE
- dump
->start_stack
)) >> PAGE_SHIFT
;
480 dump
->regs
.ebx
= regs
->ebx
;
481 dump
->regs
.ecx
= regs
->ecx
;
482 dump
->regs
.edx
= regs
->edx
;
483 dump
->regs
.esi
= regs
->esi
;
484 dump
->regs
.edi
= regs
->edi
;
485 dump
->regs
.ebp
= regs
->ebp
;
486 dump
->regs
.eax
= regs
->eax
;
487 dump
->regs
.ds
= regs
->xds
;
488 dump
->regs
.es
= regs
->xes
;
489 savesegment(fs
,dump
->regs
.fs
);
490 savesegment(gs
,dump
->regs
.gs
);
491 dump
->regs
.orig_eax
= regs
->orig_eax
;
492 dump
->regs
.eip
= regs
->eip
;
493 dump
->regs
.cs
= regs
->xcs
;
494 dump
->regs
.eflags
= regs
->eflags
;
495 dump
->regs
.esp
= regs
->esp
;
496 dump
->regs
.ss
= regs
->xss
;
498 dump
->u_fpvalid
= dump_fpu (regs
, &dump
->i387
);
502 * Capture the user space registers if the task is not running (in user space)
504 int dump_task_regs(struct task_struct
*tsk
, elf_gregset_t
*regs
)
506 struct pt_regs ptregs
;
508 ptregs
= *(struct pt_regs
*)
509 ((unsigned long)tsk
->thread_info
+THREAD_SIZE
- sizeof(ptregs
));
510 ptregs
.xcs
&= 0xffff;
511 ptregs
.xds
&= 0xffff;
512 ptregs
.xes
&= 0xffff;
513 ptregs
.xss
&= 0xffff;
515 elf_core_copy_regs(regs
, &ptregs
);
521 handle_io_bitmap(struct thread_struct
*next
, struct tss_struct
*tss
)
523 if (!next
->io_bitmap_ptr
) {
525 * Disable the bitmap via an invalid offset. We still cache
526 * the previous bitmap owner and the IO bitmap contents:
528 tss
->io_bitmap_base
= INVALID_IO_BITMAP_OFFSET
;
531 if (likely(next
== tss
->io_bitmap_owner
)) {
		/*
		 * Previous owner of the bitmap (hence the bitmap content)
		 * matches the next task, we don't have to do anything but
		 * to set a valid offset in the TSS:
		 */
537 tss
->io_bitmap_base
= IO_BITMAP_OFFSET
;
	/*
	 * Lazy TSS's I/O bitmap copy. We set an invalid offset here
	 * and we let the task get a GPF in case an I/O instruction
	 * is performed. The handler of the GPF will verify that the
	 * faulting task has a valid I/O bitmap and, if true, does the
	 * real copy and restarts the instruction. This will save us
	 * redundant copies when the currently switched task does not
	 * perform any I/O during its timeslice.
	 */
549 tss
->io_bitmap_base
= INVALID_IO_BITMAP_OFFSET_LAZY
;
552 * This special macro can be used to load a debugging register
554 #define loaddebug(thread,register) \
555 __asm__("movl %0,%%db" #register \
557 :"r" (thread->debugreg[register]))
560 * switch_to(x,yn) should switch tasks from x to y.
562 * We fsave/fwait so that an exception goes off at the right time
563 * (as a call from the fsave or fwait in effect) rather than to
564 * the wrong process. Lazy FP saving no longer makes any sense
565 * with modern CPU's, and this simplifies a lot of things (SMP
566 * and UP become the same).
568 * NOTE! We used to use the x86 hardware context switching. The
569 * reason for not using it any more becomes apparent when you
570 * try to recover gracefully from saved state that is no longer
571 * valid (stale segment register values in particular). With the
572 * hardware task-switch, there is no way to fix up bad state in
573 * a reasonable manner.
575 * The fact that Intel documents the hardware task-switching to
576 * be slow is a fairly red herring - this code is not noticeably
577 * faster. However, there _is_ some room for improvement here,
578 * so the performance issues may eventually be a valid point.
579 * More important, however, is the fact that this allows us much
582 * The return value (in %eax) will be the "prev" task after
583 * the task-switch, and shows up in ret_from_fork in entry.S,
586 struct task_struct fastcall
* __switch_to(struct task_struct
*prev_p
, struct task_struct
*next_p
)
588 struct thread_struct
*prev
= &prev_p
->thread
,
589 *next
= &next_p
->thread
;
590 int cpu
= smp_processor_id();
591 struct tss_struct
*tss
= &per_cpu(init_tss
, cpu
);
593 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
595 __unlazy_fpu(prev_p
);
598 * Reload esp0, LDT and the page table pointer:
600 load_esp0(tss
, next
);
603 * Load the per-thread Thread-Local Storage descriptor.
608 * Save away %fs and %gs. No need to save %es and %ds, as
609 * those are always kernel segments while inside the kernel.
611 asm volatile("movl %%fs,%0":"=m" (*(int *)&prev
->fs
));
612 asm volatile("movl %%gs,%0":"=m" (*(int *)&prev
->gs
));
615 * Restore %fs and %gs if needed.
617 if (unlikely(prev
->fs
| prev
->gs
| next
->fs
| next
->gs
)) {
618 loadsegment(fs
, next
->fs
);
619 loadsegment(gs
, next
->gs
);
623 * Now maybe reload the debug registers
625 if (unlikely(next
->debugreg
[7])) {
635 if (unlikely(prev
->io_bitmap_ptr
|| next
->io_bitmap_ptr
))
636 handle_io_bitmap(next
, tss
);
641 asmlinkage
int sys_fork(struct pt_regs regs
)
643 return do_fork(SIGCHLD
, regs
.esp
, ®s
, 0, NULL
, NULL
);
646 asmlinkage
int sys_clone(struct pt_regs regs
)
648 unsigned long clone_flags
;
650 int __user
*parent_tidptr
, *child_tidptr
;
652 clone_flags
= regs
.ebx
;
654 parent_tidptr
= (int __user
*)regs
.edx
;
655 child_tidptr
= (int __user
*)regs
.edi
;
658 return do_fork(clone_flags
, newsp
, ®s
, 0, parent_tidptr
, child_tidptr
);
662 * This is trivial, and on the face of it looks like it
663 * could equally well be done in user mode.
665 * Not so, for quite unobvious reasons - register pressure.
666 * In user mode vfork() cannot have a stack frame, and if
667 * done by calling the "clone()" system call directly, you
668 * do not have enough call-clobbered registers to hold all
669 * the information you need.
671 asmlinkage
int sys_vfork(struct pt_regs regs
)
673 return do_fork(CLONE_VFORK
| CLONE_VM
| SIGCHLD
, regs
.esp
, ®s
, 0, NULL
, NULL
);
677 * sys_execve() executes a new program.
679 asmlinkage
int sys_execve(struct pt_regs regs
)
684 filename
= getname((char __user
*) regs
.ebx
);
685 error
= PTR_ERR(filename
);
686 if (IS_ERR(filename
))
688 error
= do_execve(filename
,
689 (char __user
* __user
*) regs
.ecx
,
690 (char __user
* __user
*) regs
.edx
,
694 current
->ptrace
&= ~PT_DTRACE
;
695 task_unlock(current
);
696 /* Make sure we don't return using sysenter.. */
697 set_thread_flag(TIF_IRET
);
/*
 * Highest byte offsets, relative to the base of a task's stack area,
 * that get_wchan() accepts for a saved esp and a saved ebp:
 * one and two unsigned-long slots below THREAD_SIZE respectively.
 */
#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
707 unsigned long get_wchan(struct task_struct
*p
)
709 unsigned long ebp
, esp
, eip
;
710 unsigned long stack_page
;
712 if (!p
|| p
== current
|| p
->state
== TASK_RUNNING
)
714 stack_page
= (unsigned long)p
->thread_info
;
716 if (!stack_page
|| esp
< stack_page
|| esp
> top_esp
+stack_page
)
718 /* include/asm-i386/system.h:switch_to() pushes ebp last. */
719 ebp
= *(unsigned long *) esp
;
721 if (ebp
< stack_page
|| ebp
> top_ebp
+stack_page
)
723 eip
= *(unsigned long *) (ebp
+4);
724 if (!in_sched_functions(eip
))
726 ebp
= *(unsigned long *) ebp
;
727 } while (count
++ < 16);
732 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
734 static int get_free_idx(void)
736 struct thread_struct
*t
= ¤t
->thread
;
739 for (idx
= 0; idx
< GDT_ENTRY_TLS_ENTRIES
; idx
++)
740 if (desc_empty(t
->tls_array
+ idx
))
741 return idx
+ GDT_ENTRY_TLS_MIN
;
746 * Set a given TLS descriptor:
748 asmlinkage
int sys_set_thread_area(struct user_desc __user
*u_info
)
750 struct thread_struct
*t
= ¤t
->thread
;
751 struct user_desc info
;
752 struct desc_struct
*desc
;
755 if (copy_from_user(&info
, u_info
, sizeof(info
)))
757 idx
= info
.entry_number
;
760 * index -1 means the kernel should try to find and
761 * allocate an empty descriptor:
764 idx
= get_free_idx();
767 if (put_user(idx
, &u_info
->entry_number
))
771 if (idx
< GDT_ENTRY_TLS_MIN
|| idx
> GDT_ENTRY_TLS_MAX
)
774 desc
= t
->tls_array
+ idx
- GDT_ENTRY_TLS_MIN
;
777 * We must not get preempted while modifying the TLS.
781 if (LDT_empty(&info
)) {
785 desc
->a
= LDT_entry_a(&info
);
786 desc
->b
= LDT_entry_b(&info
);
796 * Get the current Thread-Local Storage area:
/*
 * Field extractors for a descriptor stored as two words, ->a and ->b.
 * sys_get_thread_area() uses them to fill in a struct user_desc.
 *
 * GET_BASE:  bits 31:16 of ->a become bits 15:0, bits 7:0 of ->b become
 *            bits 23:16, and bits 31:24 of ->b are kept in place.
 * GET_LIMIT: bits 15:0 of ->a combined with bits 19:16 of ->b.
 * The remaining macros each pick a single flag (or, for GET_CONTENTS,
 * a two-bit field) out of ->b at the shift shown.
 */
#define GET_BASE(desc) ( \
	(((desc)->a >> 16) & 0x0000ffff) | \
	(((desc)->b << 16) & 0x00ff0000) | \
	( (desc)->b        & 0xff000000)   )

#define GET_LIMIT(desc) ( \
	((desc)->a & 0x0ffff) | \
	 ((desc)->b & 0xf0000) )

#define GET_32BIT(desc)		(((desc)->b >> 22) & 1)
#define GET_CONTENTS(desc)	(((desc)->b >> 10) & 3)
#define GET_WRITABLE(desc)	(((desc)->b >>  9) & 1)
#define GET_LIMIT_PAGES(desc)	(((desc)->b >> 23) & 1)
#define GET_PRESENT(desc)	(((desc)->b >> 15) & 1)
#define GET_USEABLE(desc)	(((desc)->b >> 20) & 1)
815 asmlinkage
int sys_get_thread_area(struct user_desc __user
*u_info
)
817 struct user_desc info
;
818 struct desc_struct
*desc
;
821 if (get_user(idx
, &u_info
->entry_number
))
823 if (idx
< GDT_ENTRY_TLS_MIN
|| idx
> GDT_ENTRY_TLS_MAX
)
826 desc
= current
->thread
.tls_array
+ idx
- GDT_ENTRY_TLS_MIN
;
828 info
.entry_number
= idx
;
829 info
.base_addr
= GET_BASE(desc
);
830 info
.limit
= GET_LIMIT(desc
);
831 info
.seg_32bit
= GET_32BIT(desc
);
832 info
.contents
= GET_CONTENTS(desc
);
833 info
.read_exec_only
= !GET_WRITABLE(desc
);
834 info
.limit_in_pages
= GET_LIMIT_PAGES(desc
);
835 info
.seg_not_present
= !GET_PRESENT(desc
);
836 info
.useable
= GET_USEABLE(desc
);
838 if (copy_to_user(u_info
, &info
, sizeof(info
)))
843 unsigned long arch_align_stack(unsigned long sp
)
845 if (randomize_va_space
)
846 sp
-= get_random_int() % 8192;