/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  'fork.c' contains the help-routines for the 'fork' system call
 *  (see also entry.S and others).
 *  Fork is rather simple, once you get the hang of it, but the memory
 *  management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/namespace.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/cpu.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/profile.h>
#include <linux/rmap.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

/*
 * The idle threads do not count..
 * Protected by write_lock_irq(&tasklist_lock)
 */
int nr_threads;

int max_threads;

unsigned long total_forks;	/* Handle normal Linux uptimes. */

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */

EXPORT_SYMBOL(tasklist_lock);

int nr_processes(void)
{
        int cpu;
        int total = 0;

        for_each_online_cpu(cpu)
                total += per_cpu(process_counts, cpu);

        return total;
}

#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
static kmem_cache_t *task_struct_cachep;
#endif

void free_task(struct task_struct *tsk)
{
        free_thread_info(tsk->thread_info);
        free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

void __put_task_struct(struct task_struct *tsk)
{
        WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE)));
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);

        if (unlikely(tsk->audit_context))
                audit_free(tsk);
        security_task_free(tsk);
        free_uid(tsk->user);
        put_group_info(tsk->group_info);

        if (!profile_handoff_task(tsk))
                free_task(tsk);
}

void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
        unsigned long flags;

        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&q->lock, flags);
        __add_wait_queue(q, wait);
        spin_unlock_irqrestore(&q->lock, flags);
}

EXPORT_SYMBOL(add_wait_queue);

void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
{
        unsigned long flags;

        wait->flags |= WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&q->lock, flags);
        __add_wait_queue_tail(q, wait);
        spin_unlock_irqrestore(&q->lock, flags);
}

EXPORT_SYMBOL(add_wait_queue_exclusive);

void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
        unsigned long flags;

        spin_lock_irqsave(&q->lock, flags);
        __remove_wait_queue(q, wait);
        spin_unlock_irqrestore(&q->lock, flags);
}

EXPORT_SYMBOL(remove_wait_queue);

/*
 * Note: we use "set_current_state()" _after_ the wait-queue add,
 * because we need a memory barrier there on SMP, so that any
 * wake-function that tests for the wait-queue being active
 * will be guaranteed to see waitqueue addition _or_ subsequent
 * tests in this thread will see the wakeup having taken place.
 *
 * The spin_unlock() itself is semi-permeable and only protects
 * one way (it only protects stuff inside the critical region and
 * stops them from bleeding out - it would still allow subsequent
 * loads to move into the critical region).
 */
void fastcall prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
        unsigned long flags;

        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&q->lock, flags);
        if (list_empty(&wait->task_list))
                __add_wait_queue(q, wait);
        /*
         * don't alter the task state if this is just going to
         * queue an async wait queue callback
         */
        if (is_sync_wait(wait))
                set_current_state(state);
        spin_unlock_irqrestore(&q->lock, flags);
}

EXPORT_SYMBOL(prepare_to_wait);

void fastcall
prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
        unsigned long flags;

        wait->flags |= WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&q->lock, flags);
        if (list_empty(&wait->task_list))
                __add_wait_queue_tail(q, wait);
        /*
         * don't alter the task state if this is just going to
         * queue an async wait queue callback
         */
        if (is_sync_wait(wait))
                set_current_state(state);
        spin_unlock_irqrestore(&q->lock, flags);
}

EXPORT_SYMBOL(prepare_to_wait_exclusive);

void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
        unsigned long flags;

        __set_current_state(TASK_RUNNING);
        /*
         * We can check for list emptiness outside the lock
         * IFF:
         *  - we use the "careful" check that verifies both
         *    the next and prev pointers, so that there cannot
         *    be any half-pending updates in progress on other
         *    CPU's that we haven't seen yet (and that might
         *    still change the stack area)
         * and
         *  - all other users take the lock (ie we can only
         *    have _one_ other CPU that looks at or modifies
         *    the list).
         */
        if (!list_empty_careful(&wait->task_list)) {
                spin_lock_irqsave(&q->lock, flags);
                list_del_init(&wait->task_list);
                spin_unlock_irqrestore(&q->lock, flags);
        }
}

EXPORT_SYMBOL(finish_wait);

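/*
 * Illustrative sketch (not in the original file): the canonical sleep loop
 * built from the primitives above.  The wait-queue head, the condition flag
 * and the function name are hypothetical stand-ins for whatever a real
 * caller provides.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_waitq);
static int example_condition;

static void example_wait_for_condition(void)
{
        DEFINE_WAIT(wait);

        for (;;) {
                /* prepare_to_wait() sets the task state under q->lock */
                prepare_to_wait(&example_waitq, &wait, TASK_UNINTERRUPTIBLE);
                if (example_condition)
                        break;
                schedule();
        }
        /* back to TASK_RUNNING, and the wait descriptor is unlinked */
        finish_wait(&example_waitq, &wait);
}
/*
 * The waking side would set example_condition and call
 * wake_up(&example_waitq).
 */
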
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
        int ret = default_wake_function(wait, mode, sync, key);

        if (ret)
                list_del_init(&wait->task_list);
        return ret;
}

EXPORT_SYMBOL(autoremove_wake_function);

void __init fork_init(unsigned long mempages)
{
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
#endif
        /* create a slab on which task_structs can be allocated */
        task_struct_cachep =
                kmem_cache_create("task_struct", sizeof(struct task_struct),
                        ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
#endif

        /*
         * The default maximum number of threads is set to a safe
         * value: the thread structures can take up at most half
         * of memory.
         */
        max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8;
        /*
         * we need to allow at least 20 threads to boot a system
         */
        if (max_threads < 20)
                max_threads = 20;

        init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}

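/*
 * Worked example of the sizing above (numbers are illustrative): with 128MB
 * of 4K pages, mempages = 32768; with 8K kernel stacks, THREAD_SIZE/PAGE_SIZE
 * = 2, so max_threads = 32768 / 2 / 8 = 2048.  Those 2048 stacks occupy
 * 2048 * 8K = 16MB, and RLIMIT_NPROC starts at half the thread count,
 * i.e. 1024 processes per user.
 */
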
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
        struct task_struct *tsk;
        struct thread_info *ti;

        prepare_to_copy(orig);

        tsk = alloc_task_struct();
        if (!tsk)
                return NULL;

        ti = alloc_thread_info(tsk);
        if (!ti) {
                free_task_struct(tsk);
                return NULL;
        }

        *ti = *orig->thread_info;
        *tsk = *orig;
        tsk->thread_info = ti;
        ti->task = tsk;

        /* One for us, one for whoever does the "release_task()" (usually parent) */
        atomic_set(&tsk->usage,2);
        return tsk;
}

#ifdef CONFIG_MMU
static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        struct rb_node **rb_link, *rb_parent;
        int retval;
        unsigned long charge;
        struct mempolicy *pol;

        down_write(&oldmm->mmap_sem);
        flush_cache_mm(current->mm);
        mm->mmap = NULL;
        mm->mmap_cache = NULL;
        mm->free_area_cache = oldmm->mmap_base;
        mm->map_count = 0;
        cpus_clear(mm->cpu_vm_mask);
        mm->mm_rb = RB_ROOT;
        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
        pprev = &mm->mmap;

        /*
         * Add it to the mmlist after the parent.
         * Doing it this way means that we can order the list,
         * and fork() won't mess up the ordering significantly.
         * Add it first so that swapoff can see any swap entries.
         */
        spin_lock(&mmlist_lock);
        list_add(&mm->mmlist, &current->mm->mmlist);
        spin_unlock(&mmlist_lock);

        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;

                if (mpnt->vm_flags & VM_DONTCOPY) {
                        __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
                                                        -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
                        if (security_vm_enough_memory(len))
                                goto fail_nomem;
                        charge = len;
                }
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                pol = mpol_copy(vma_policy(mpnt));
                retval = PTR_ERR(pol);
                if (IS_ERR(pol))
                        goto fail_nomem_policy;
                vma_set_policy(tmp, pol);
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file->f_dentry->d_inode;
                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&inode->i_writecount);

                        /* insert tmp into the share list, just after mpnt */
                        spin_lock(&file->f_mapping->i_mmap_lock);
                        flush_dcache_mmap_lock(file->f_mapping);
                        vma_prio_tree_add(tmp, mpnt);
                        flush_dcache_mmap_unlock(file->f_mapping);
                        spin_unlock(&file->f_mapping->i_mmap_lock);
                }

                /*
                 * Link in the new vma and copy the page table entries:
                 * link in first so that swapoff can see swap entries,
                 * and try_to_unmap_one's find_vma find the new vma.
                 */
                spin_lock(&mm->page_table_lock);
                *pprev = tmp;
                pprev = &tmp->vm_next;

                __vma_link_rb(mm, tmp, rb_link, rb_parent);
                rb_link = &tmp->vm_rb.rb_right;
                rb_parent = &tmp->vm_rb;

                mm->map_count++;
                retval = copy_page_range(mm, current->mm, tmp);
                spin_unlock(&mm->page_table_lock);

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                if (retval)
                        goto out;
        }
        retval = 0;

out:
        flush_tlb_mm(current->mm);
        up_write(&oldmm->mmap_sem);
        return retval;
fail_nomem_policy:
        kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
        retval = -ENOMEM;
        vm_unacct_memory(charge);
        goto out;
}

static inline int mm_alloc_pgd(struct mm_struct * mm)
{
        mm->pgd = pgd_alloc(mm);
        if (unlikely(!mm->pgd))
                return -ENOMEM;
        return 0;
}

static inline void mm_free_pgd(struct mm_struct * mm)
{
        pgd_free(mm->pgd);
}
#else
#define dup_mmap(mm, oldmm)	(0)
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;

#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

#include <linux/init_task.h>

static struct mm_struct * mm_init(struct mm_struct * mm)
{
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        init_rwsem(&mm->mmap_sem);
        mm->core_waiters = 0;
        mm->page_table_lock = SPIN_LOCK_UNLOCKED;
        mm->ioctx_list_lock = RW_LOCK_UNLOCKED;
        mm->ioctx_list = NULL;
        mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
        mm->free_area_cache = TASK_UNMAPPED_BASE;

        if (likely(!mm_alloc_pgd(mm))) {
                mm->def_flags = 0;
                return mm;
        }
        free_mm(mm);
        return NULL;
}

/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct * mm_alloc(void)
{
        struct mm_struct * mm;

        mm = allocate_mm();
        if (mm) {
                memset(mm, 0, sizeof(*mm));
                mm = mm_init(mm);
        }
        return mm;
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
void fastcall __mmdrop(struct mm_struct *mm)
{
        BUG_ON(mm == &init_mm);
        mm_free_pgd(mm);
        destroy_context(mm);
        free_mm(mm);
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
        if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
                list_del(&mm->mmlist);
                spin_unlock(&mmlist_lock);
                exit_aio(mm);
                exit_mmap(mm);
                mmdrop(mm);
        }
}
EXPORT_SYMBOL_GPL(mmput);

/**
 * get_task_mm - acquire a reference to the task's mm
 *
 * Returns %NULL if the task has no mm. Checks if the use count
 * of the mm is non-zero and if so returns a reference to it, after
 * bumping up the use count. User must release the mm via mmput()
 * after use. Typically used by /proc and ptrace.
 *
 * If the use count is zero, it means that this mm is going away,
 * so return %NULL. This only happens in the case of an AIO daemon
 * which has temporarily adopted an mm (see use_mm), in the course
 * of its final mmput, before exit_aio has completed.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
        struct mm_struct *mm;

        task_lock(task);
        mm = task->mm;
        if (mm) {
                spin_lock(&mmlist_lock);
                if (!atomic_read(&mm->mm_users))
                        mm = NULL;
                else
                        atomic_inc(&mm->mm_users);
                spin_unlock(&mmlist_lock);
        }
        task_unlock(task);
        return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);

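/*
 * Illustrative sketch (not in the original file): a /proc-style reader takes
 * a temporary reference with get_task_mm() and pairs it with mmput().  The
 * function name is hypothetical.
 */
static unsigned long example_task_rss(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);
        unsigned long rss = 0;

        if (mm) {
                /* safe to dereference: we hold a reference via mm_users */
                rss = mm->rss;
                mmput(mm);
        }
        return rss;
}
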
/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one. Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        struct completion *vfork_done = tsk->vfork_done;

        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);

        /* notify parent sleeping on vfork() */
        if (vfork_done) {
                tsk->vfork_done = NULL;
                complete(vfork_done);
        }
        if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
                u32 __user * tidptr = tsk->clear_child_tid;
                tsk->clear_child_tid = NULL;

                /*
                 * We don't check the error code - if userspace has
                 * not set up a proper pointer then tough luck.
                 */
                put_user(0, tidptr);
                sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
        }
}

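/*
 * Background for the clear_child_tid handling above (illustrative, not in
 * the original file): a thread library typically creates threads with
 * CLONE_CHILD_CLEARTID so that, when the thread's mm goes away, the kernel
 * zeroes the registered TID word and FUTEX_WAKEs it.  A joining thread can
 * then wait on that word from userspace, roughly:
 *
 *	pid_t tid = child_tid;			// hypothetical variable
 *	while (tid != 0) {
 *		syscall(SYS_futex, &child_tid, FUTEX_WAIT, tid, NULL, NULL, 0);
 *		tid = child_tid;
 *	}
 */
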
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
        struct mm_struct * mm, *oldmm;
        int retval;

        tsk->min_flt = tsk->maj_flt = 0;
        tsk->nvcsw = tsk->nivcsw = 0;

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * Are we cloning a kernel thread?
         *
         * We need to steal a active VM for that..
         */
        oldmm = current->mm;
        if (!oldmm)
                return 0;

        if (clone_flags & CLONE_VM) {
                atomic_inc(&oldmm->mm_users);
                mm = oldmm;
                /*
                 * There are cases where the PTL is held to ensure no
                 * new threads start up in user mode using an mm, which
                 * allows optimizing out ipis; the tlb_gather_mmu code
                 * is an example.
                 */
                spin_unlock_wait(&oldmm->page_table_lock);
                goto good_mm;
        }

        retval = -ENOMEM;
        mm = allocate_mm();
        if (!mm)
                goto fail_nomem;

        /* Copy the current MM stuff.. */
        memcpy(mm, oldmm, sizeof(*mm));
        if (!mm_init(mm))
                goto fail_nomem;

        if (init_new_context(tsk,mm))
                goto fail_nocontext;

        retval = dup_mmap(mm, oldmm);
        if (retval)
                goto free_pt;

good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
        return 0;

free_pt:
        mmput(mm);
fail_nomem:
        return retval;

fail_nocontext:
        /*
         * If init_new_context() failed, we cannot use mmput() to free the mm
         * because it calls destroy_context()
         */
        mm_free_pgd(mm);
        free_mm(mm);
        return retval;
}

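/*
 * Note on the two paths above (not in the original file): a plain fork()
 * passes no CLONE_VM, so the child gets its own mm via allocate_mm() and a
 * full dup_mmap() of the parent's vmas, while a CLONE_VM clone (the
 * pthread_create() case) merely bumps mm_users and shares the parent's mm.
 */
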
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
        /* We don't need to lock fs - think why ;-) */
        if (fs) {
                atomic_set(&fs->count, 1);
                fs->lock = RW_LOCK_UNLOCKED;
                fs->umask = old->umask;
                read_lock(&old->lock);
                fs->rootmnt = mntget(old->rootmnt);
                fs->root = dget(old->root);
                fs->pwdmnt = mntget(old->pwdmnt);
                fs->pwd = dget(old->pwd);
                if (old->altroot) {
                        fs->altrootmnt = mntget(old->altrootmnt);
                        fs->altroot = dget(old->altroot);
                } else {
                        fs->altrootmnt = NULL;
                        fs->altroot = NULL;
                }
                read_unlock(&old->lock);
        }
        return fs;
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
        return __copy_fs_struct(old);
}

EXPORT_SYMBOL_GPL(copy_fs_struct);

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_FS) {
                atomic_inc(&current->fs->count);
                return 0;
        }
        tsk->fs = __copy_fs_struct(current->fs);
        if (!tsk->fs)
                return -ENOMEM;
        return 0;
}

static int count_open_files(struct files_struct *files, int size)
{
        int i;

        /* Find the last open fd */
        for (i = size/(8*sizeof(long)); i > 0; ) {
                if (files->open_fds->fds_bits[--i])
                        break;
        }
        i = (i+1) * 8 * sizeof(long);
        return i;
}

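/*
 * Worked example of the scan above (illustrative, not in the original file):
 * with size = 1024 and 64-bit longs the loop starts at word 16 and walks
 * down; if the highest open fd is 3, every word but word 0 is zero, so the
 * loop breaks with i == 0 and the function returns (0+1)*64 = 64.  The
 * result is therefore the open-fd count rounded up to a whole bitmap word,
 * not the exact number of open files.
 */
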
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int open_files, nfds, size, i, error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        /*
         * Note: we may be using current for both targets (See exec.c)
         * This works because we cache current->files (old) as oldf. Don't
         * break this.
         */
        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        newf->file_lock	    = SPIN_LOCK_UNLOCKED;
        newf->max_fds	    = NR_OPEN_DEFAULT;
        newf->max_fdset	    = __FD_SETSIZE;
        newf->close_on_exec = &newf->close_on_exec_init;
        newf->open_fds	    = &newf->open_fds_init;
        newf->fd	    = &newf->fd_array[0];

        /* We don't yet have the oldf readlock, but even if the old
           fdset gets grown now, we'll only copy up to "size" fds */
        size = oldf->max_fdset;
        if (size > __FD_SETSIZE) {
                newf->max_fdset = 0;
                spin_lock(&newf->file_lock);
                error = expand_fdset(newf, size-1);
                spin_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
        }
        spin_lock(&oldf->file_lock);

        open_files = count_open_files(oldf, size);

        /*
         * Check whether we need to allocate a larger fd array.
         * Note: we're not a clone task, so the open count won't
         * change.
         */
        nfds = NR_OPEN_DEFAULT;
        if (open_files > nfds) {
                spin_unlock(&oldf->file_lock);
                newf->max_fds = 0;
                spin_lock(&newf->file_lock);
                error = expand_fd_array(newf, open_files-1);
                spin_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
                nfds = newf->max_fds;
                spin_lock(&oldf->file_lock);
        }

        old_fds = oldf->fd;
        new_fds = newf->fd;

        memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
        memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f)
                        get_file(f);
                *new_fds++ = f;
        }
        spin_unlock(&oldf->file_lock);

        /* compute the remainder to be cleared */
        size = (newf->max_fds - open_files) * sizeof(struct file *);

        /* This is long word aligned thus could use an optimized version */
        memset(new_fds, 0, size);

        if (newf->max_fdset > open_files) {
                int left = (newf->max_fdset-open_files)/8;
                int start = open_files / (8 * sizeof(unsigned long));

                memset(&newf->open_fds->fds_bits[start], 0, left);
                memset(&newf->close_on_exec->fds_bits[start], 0, left);
        }

        tsk->files = newf;
        error = 0;
out:
        return error;

out_release:
        free_fdset (newf->close_on_exec, newf->max_fdset);
        free_fdset (newf->open_fds, newf->max_fdset);
        kmem_cache_free(files_cachep, newf);
        goto out;
}

/*
 *	Helper to unshare the files of the current task.
 *	We don't want to expose copy_files internals to
 *	the exec layer of the kernel.
 */
int unshare_files(void)
{
        struct files_struct *files = current->files;
        int rc;

        if (!files)
                BUG();

        /* This can race but the race causes us to copy when we don't
           need to and drop the copy */
        if (atomic_read(&files->count) == 1) {
                atomic_inc(&files->count);
                return 0;
        }
        rc = copy_files(0, current);
        if (rc)
                current->files = files;
        return rc;
}

EXPORT_SYMBOL(unshare_files);

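/*
 * Illustrative sketch (not in the original file): the exec layer is the
 * intended caller and simply propagates the error code, roughly:
 *
 *	retval = unshare_files();
 *	if (retval)
 *		goto out;	// label is hypothetical
 */
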
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
        struct sighand_struct *sig;

        if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
                atomic_inc(&current->sighand->count);
                return 0;
        }
        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        tsk->sighand = sig;
        if (!sig)
                return -ENOMEM;
        spin_lock_init(&sig->siglock);
        atomic_set(&sig->count, 1);
        memcpy(sig->action, current->sighand->action, sizeof(sig->action));
        return 0;
}

static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_THREAD) {
                atomic_inc(&current->signal->count);
                return 0;
        }
        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;
        atomic_set(&sig->count, 1);
        sig->group_exit = 0;
        sig->group_exit_code = 0;
        sig->group_exit_task = NULL;
        sig->group_stop_count = 0;
        sig->curr_target = NULL;
        init_sigpending(&sig->shared_pending);
        INIT_LIST_HEAD(&sig->posix_timers);

        sig->tty = current->signal->tty;
        sig->pgrp = process_group(current);
        sig->session = current->signal->session;
        sig->leader = 0;	/* session leadership doesn't inherit */
        sig->tty_old_pgrp = 0;

        sig->utime = sig->stime = sig->cutime = sig->cstime = 0;
        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;

        return 0;
}

static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
        unsigned long new_flags = p->flags;

        new_flags &= ~PF_SUPERPRIV;
        new_flags |= PF_FORKNOEXEC;
        if (!(clone_flags & CLONE_PTRACE))
                p->ptrace = 0;
        p->flags = new_flags;
}

asmlinkage long sys_set_tid_address(int __user *tidptr)
{
        current->clear_child_tid = tidptr;

        return current->pid;
}

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static task_t *copy_process(unsigned long clone_flags,
                                 unsigned long stack_start,
                                 struct pt_regs *regs,
                                 unsigned long stack_size,
                                 int __user *parent_tidptr,
                                 int __user *child_tidptr,
                                 int pid)
{
        int retval;
        struct task_struct *p = NULL;

        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);

        /*
         * Thread groups must share signals as well, and detached threads
         * can only be started up within the thread group.
         */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
                return ERR_PTR(-EINVAL);

        /*
         * Shared signal handlers imply shared VM. By way of the above,
         * thread groups also imply shared VM. Blocking this case allows
         * for various simplifications in other code.
         */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                return ERR_PTR(-EINVAL);

        retval = security_task_create(clone_flags);
        if (retval)
                goto fork_out;

        retval = -ENOMEM;
        p = dup_task_struct(current);
        if (!p)
                goto fork_out;

        retval = -EAGAIN;
        if (atomic_read(&p->user->processes) >=
                        p->rlim[RLIMIT_NPROC].rlim_cur) {
                if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                                p->user != &root_user)
                        goto bad_fork_free;
        }

        atomic_inc(&p->user->__count);
        atomic_inc(&p->user->processes);
        get_group_info(p->group_info);

        /*
         * If multiple threads are within copy_process(), then this check
         * triggers too late. This doesn't hurt, the check is only there
         * to stop root fork bombs.
         */
        if (nr_threads >= max_threads)
                goto bad_fork_cleanup_count;

        if (!try_module_get(p->thread_info->exec_domain->module))
                goto bad_fork_cleanup_count;

        if (p->binfmt && !try_module_get(p->binfmt->module))
                goto bad_fork_cleanup_put_domain;

        copy_flags(clone_flags, p);
        if (clone_flags & CLONE_PARENT_SETTID)
                if (put_user(p->pid, parent_tidptr))
                        goto bad_fork_cleanup;

        p->proc_dentry = NULL;

        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        init_waitqueue_head(&p->wait_chldexit);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);
        spin_lock_init(&p->proc_lock);

        clear_tsk_thread_flag(p, TIF_SIGPENDING);
        init_sigpending(&p->pending);

        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;

        p->utime = p->stime = 0;
        p->lock_depth = -1;		/* -1 = no lock */
        do_posix_clock_monotonic_gettime(&p->start_time);
        p->io_context = NULL;
        p->audit_context = NULL;
        p->mempolicy = mpol_copy(p->mempolicy);
        if (IS_ERR(p->mempolicy)) {
                retval = PTR_ERR(p->mempolicy);
                p->mempolicy = NULL;
                goto bad_fork_cleanup;
        }

        if ((retval = security_task_alloc(p)))
                goto bad_fork_cleanup_policy;
        if ((retval = audit_alloc(p)))
                goto bad_fork_cleanup_security;
        /* copy all the process information */
        if ((retval = copy_semundo(clone_flags, p)))
                goto bad_fork_cleanup_audit;
        if ((retval = copy_files(clone_flags, p)))
                goto bad_fork_cleanup_semundo;
        if ((retval = copy_fs(clone_flags, p)))
                goto bad_fork_cleanup_files;
        if ((retval = copy_sighand(clone_flags, p)))
                goto bad_fork_cleanup_fs;
        if ((retval = copy_signal(clone_flags, p)))
                goto bad_fork_cleanup_sighand;
        if ((retval = copy_mm(clone_flags, p)))
                goto bad_fork_cleanup_signal;
        if ((retval = copy_namespace(clone_flags, p)))
                goto bad_fork_cleanup_mm;
        retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
        if (retval)
                goto bad_fork_cleanup_namespace;

        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;

        /*
         * Syscall tracing should be turned off in the child regardless
         * of CLONE_PTRACE.
         */
        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);

        /* Our parent execution domain becomes current domain
           These must match for thread signalling to apply */

        p->parent_exec_id = p->self_exec_id;

        /* ok, now we should be set up.. */
        p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
        p->pdeath_signal = 0;

        /* Perform scheduler related setup */
        sched_fork(p);

        /*
         * Ok, make it visible to the rest of the system.
         * We don't wake it up yet.
         */
        p->tgid = p->pid;
        p->group_leader = p;
        INIT_LIST_HEAD(&p->ptrace_children);
        INIT_LIST_HEAD(&p->ptrace_list);

        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);

        /*
         * The task hasn't been attached yet, so cpus_allowed mask cannot
         * have changed. The cpus_allowed mask of the parent may have
         * changed after it was copied first time, and it may then move to
         * another CPU - so we re-copy it here and set the child's CPU to
         * the parent's CPU. This avoids a lot of nasty races.
         */
        p->cpus_allowed = current->cpus_allowed;
        set_task_cpu(p, smp_processor_id());

        /*
         * Check for pending SIGKILL! The new thread should not be allowed
         * to slip out of an OOM kill. (or normal SIGKILL.)
         */
        if (sigismember(&current->pending.signal, SIGKILL)) {
                write_unlock_irq(&tasklist_lock);
                retval = -EINTR;
                goto bad_fork_cleanup_namespace;
        }

        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
                p->real_parent = current->real_parent;
        else
                p->real_parent = current;
        p->parent = p->real_parent;

        if (clone_flags & CLONE_THREAD) {
                spin_lock(&current->sighand->siglock);
                /*
                 * Important: if an exit-all has been started then
                 * do not create this new thread - the whole thread
                 * group is supposed to exit anyway.
                 */
                if (current->signal->group_exit) {
                        spin_unlock(&current->sighand->siglock);
                        write_unlock_irq(&tasklist_lock);
                        retval = -EAGAIN;
                        goto bad_fork_cleanup_namespace;
                }
                p->tgid = current->tgid;
                p->group_leader = current->group_leader;

                if (current->signal->group_stop_count > 0) {
                        /*
                         * There is an all-stop in progress for the group.
                         * We ourselves will stop as soon as we check signals.
                         * Make the new thread part of that group stop too.
                         */
                        current->signal->group_stop_count++;
                        set_tsk_thread_flag(p, TIF_SIGPENDING);
                }

                spin_unlock(&current->sighand->siglock);
        }

        SET_LINKS(p);
        if (unlikely(p->ptrace & PT_PTRACED))
                __ptrace_link(p, current->parent);

        attach_pid(p, PIDTYPE_PID, p->pid);
        attach_pid(p, PIDTYPE_TGID, p->tgid);
        if (thread_group_leader(p)) {
                attach_pid(p, PIDTYPE_PGID, process_group(p));
                attach_pid(p, PIDTYPE_SID, p->signal->session);
                if (p->pid)
                        __get_cpu_var(process_counts)++;
        }

        nr_threads++;
        total_forks++;
        write_unlock_irq(&tasklist_lock);
        retval = 0;

fork_out:
        if (retval)
                return ERR_PTR(retval);
        return p;

bad_fork_cleanup_namespace:
        exit_namespace(p);
bad_fork_cleanup_mm:
        if (p->mm)
                mmput(p->mm);
bad_fork_cleanup_signal:
        exit_signal(p);
bad_fork_cleanup_sighand:
        exit_sighand(p);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
        exit_sem(p);
bad_fork_cleanup_audit:
        audit_free(p);
bad_fork_cleanup_security:
        security_task_free(p);
bad_fork_cleanup_policy:
        mpol_free(p->mempolicy);
bad_fork_cleanup:
        if (p->binfmt)
                module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
        module_put(p->thread_info->exec_domain->module);
bad_fork_cleanup_count:
        put_group_info(p->group_info);
        atomic_dec(&p->user->processes);
        free_uid(p->user);
bad_fork_free:
        free_task(p);
        goto fork_out;
}

struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
{
        memset(regs, 0, sizeof(struct pt_regs));
        return regs;
}

task_t * __devinit fork_idle(int cpu)
{
        task_t *task;
        struct pt_regs regs;

        task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
        if (!task)
                return ERR_PTR(-ENOMEM);
        init_idle(task, cpu);
        unhash_process(task);
        return task;
}

static inline int fork_traceflag(unsigned clone_flags)
{
        if (clone_flags & CLONE_UNTRACED)
                return 0;
        else if (clone_flags & CLONE_VFORK) {
                if (current->ptrace & PT_TRACE_VFORK)
                        return PTRACE_EVENT_VFORK;
        } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
                if (current->ptrace & PT_TRACE_CLONE)
                        return PTRACE_EVENT_CLONE;
        } else if (current->ptrace & PT_TRACE_FORK)
                return PTRACE_EVENT_FORK;

        return 0;
}

/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,
              unsigned long stack_start,
              struct pt_regs *regs,
              unsigned long stack_size,
              int __user *parent_tidptr,
              int __user *child_tidptr)
{
        struct task_struct *p;
        int trace = 0;
        long pid = alloc_pidmap();

        if (pid < 0)
                return -EAGAIN;
        if (unlikely(current->ptrace)) {
                trace = fork_traceflag(clone_flags);
                if (trace)
                        clone_flags |= CLONE_PTRACE;
        }

        p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
        /*
         * Do this prior to waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
        if (!IS_ERR(p)) {
                struct completion vfork;

                if (clone_flags & CLONE_VFORK) {
                        p->vfork_done = &vfork;
                        init_completion(&vfork);
                }

                if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
                        /*
                         * We'll start up with an immediate SIGSTOP.
                         */
                        sigaddset(&p->pending.signal, SIGSTOP);
                        set_tsk_thread_flag(p, TIF_SIGPENDING);
                }

                if (!(clone_flags & CLONE_STOPPED))
                        wake_up_new_task(p, clone_flags);
                else
                        p->state = TASK_STOPPED;

                if (unlikely(trace)) {
                        current->ptrace_message = pid;
                        ptrace_notify((trace << 8) | SIGTRAP);
                }

                if (clone_flags & CLONE_VFORK) {
                        wait_for_completion(&vfork);
                        if (unlikely(current->ptrace & PT_TRACE_VFORK_DONE))
                                ptrace_notify((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
                }
        } else {
                free_pidmap(pid);
                pid = PTR_ERR(p);
        }
        return pid;
}

/* SLAB cache for signal_struct structures (tsk->signal) */
kmem_cache_t *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
kmem_cache_t *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
kmem_cache_t *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
kmem_cache_t *fs_cachep;

/* SLAB cache for vm_area_struct structures */
kmem_cache_t *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
kmem_cache_t *mm_cachep;

void __init proc_caches_init(void)
{
        sighand_cachep = kmem_cache_create("sighand_cache",
                        sizeof(struct sighand_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
        signal_cachep = kmem_cache_create("signal_cache",
                        sizeof(struct signal_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
        files_cachep = kmem_cache_create("files_cache",
                        sizeof(struct files_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
        vm_area_cachep = kmem_cache_create("vm_area_struct",
                        sizeof(struct vm_area_struct), 0,
                        SLAB_PANIC, NULL, NULL);
        mm_cachep = kmem_cache_create("mm_struct",
                        sizeof(struct mm_struct), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
}