]> git.saurik.com Git - apple/xnu.git/blame - bsd/kern/kern_fork.c
xnu-2422.100.13.tar.gz
[apple/xnu.git] / bsd / kern / kern_fork.c
CommitLineData
1c79356b 1/*
2d21ac55 2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
5d5c5d0d 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/* Copyright (c) 1995, 1997 Apple Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1989, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)kern_fork.c 8.8 (Berkeley) 2/14/95
67 */
2d21ac55
A
68/*
69 * NOTICE: This file was modified by McAfee Research in 2004 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74/*
75 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
76 * support for mandatory and extensible security protections. This notice
77 * is included in support of clause 2.2 (b) of the Apple Public License,
78 * Version 2.0.
79 */
1c79356b 80
55e303ae 81#include <kern/assert.h>
1c79356b
A
82#include <sys/param.h>
83#include <sys/systm.h>
84#include <sys/filedesc.h>
85#include <sys/kernel.h>
86#include <sys/malloc.h>
91447636
A
87#include <sys/proc_internal.h>
88#include <sys/kauth.h>
1c79356b
A
89#include <sys/user.h>
90#include <sys/resourcevar.h>
91447636
A
91#include <sys/vnode_internal.h>
92#include <sys/file_internal.h>
1c79356b 93#include <sys/acct.h>
2d21ac55
A
94#include <sys/codesign.h>
95#include <sys/sysproto.h>
96#if CONFIG_DTRACE
97/* Do not include dtrace.h, it redefines kmem_[alloc/free] */
98extern void dtrace_fasttrap_fork(proc_t, proc_t);
99extern void (*dtrace_helpers_fork)(proc_t, proc_t);
100extern void dtrace_lazy_dofs_duplicate(proc_t, proc_t);
101
102#include <sys/dtrace_ptss.h>
9bccf70c 103#endif
1c79356b 104
b0d623f7 105#include <security/audit/audit.h>
91447636 106
1c79356b 107#include <mach/mach_types.h>
91447636
A
108#include <kern/kern_types.h>
109#include <kern/kalloc.h>
1c79356b 110#include <kern/mach_param.h>
91447636 111#include <kern/task.h>
4b17d6b6 112#include <kern/thread.h>
2d21ac55 113#include <kern/thread_call.h>
91447636 114#include <kern/zalloc.h>
1c79356b
A
115
116#include <machine/spl.h>
117
2d21ac55
A
118#if CONFIG_MACF
119#include <security/mac.h>
120#include <security/mac_mach_internal.h>
121#endif
122
123#include <vm/vm_map.h>
124#include <vm/vm_protos.h>
125#include <vm/vm_shared_region.h>
126
127#include <sys/shm_internal.h> /* for shmfork() */
128#include <mach/task.h> /* for thread_create() */
129#include <mach/thread_act.h> /* for thread_resume() */
91447636 130
2d21ac55
A
131#include <sys/sdt.h>
132
316670eb
A
133#if CONFIG_MEMORYSTATUS
134#include <sys/kern_memorystatus.h>
135#endif
136
2d21ac55
A
137/* XXX routines which should have Mach prototypes, but don't */
138void thread_set_parent(thread_t parent, int pid);
139extern void act_thread_catt(void *ctx);
140void thread_set_child(thread_t child, int pid);
141void *act_thread_csave(void);
142
143
39236c6e 144thread_t cloneproc(task_t, proc_t, int, int);
b0d623f7
A
145proc_t forkproc(proc_t);
146void forkproc_free(proc_t);
2d21ac55 147thread_t fork_create_child(task_t parent_task, proc_t child, int inherit_memory, int is64bit);
b0d623f7
A
148void proc_vfork_begin(proc_t parent_proc);
149void proc_vfork_end(proc_t parent_proc);
1c79356b
A
150
151#define DOFORK 0x1 /* fork() system call */
152#define DOVFORK 0x2 /* vfork() system call */
1c79356b 153
b0d623f7
A
154/*
155 * proc_vfork_begin
156 *
157 * Description: start a vfork on a process
158 *
159 * Parameters: parent_proc process (re)entering vfork state
160 *
161 * Returns: (void)
162 *
163 * Notes: Although this function increments a count, a count in
164 * excess of 1 is not currently supported. According to the
165 * POSIX standard, calling anything other than execve() or
316670eb
A
166 * _exit() following a vfork(), including calling vfork()
167 * itself again, will result in undefined behaviour
b0d623f7
A
168 */
169void
170proc_vfork_begin(proc_t parent_proc)
171{
172 proc_lock(parent_proc);
173 parent_proc->p_lflag |= P_LVFORK;
174 parent_proc->p_vforkcnt++;
175 proc_unlock(parent_proc);
176}
177
178/*
179 * proc_vfork_end
180 *
181 * Description: stop a vfork on a process
182 *
183 * Parameters: parent_proc process leaving vfork state
184 *
185 * Returns: (void)
186 *
316670eb 187 * Notes: Decrements the count; currently, reentrancy of vfork()
b0d623f7
A
188 * is unsupported on the current process
189 */
190void
191proc_vfork_end(proc_t parent_proc)
192{
193 proc_lock(parent_proc);
194 parent_proc->p_vforkcnt--;
195 if (parent_proc->p_vforkcnt < 0)
196 panic("vfork cnt is -ve");
b0d623f7
A
197 if (parent_proc->p_vforkcnt == 0)
198 parent_proc->p_lflag &= ~P_LVFORK;
199 proc_unlock(parent_proc);
200}
201
1c79356b
A
202
203/*
2d21ac55
A
204 * vfork
205 *
206 * Description: vfork system call
207 *
208 * Parameters: void [no arguments]
209 *
210 * Retval: 0 (to child process)
211 * !0 pid of child (to parent process)
212 * -1 error (see "Returns:")
213 *
214 * Returns: EAGAIN Administrative limit reached
b0d623f7 215 * EINVAL vfork() called during vfork()
2d21ac55
A
216 * ENOMEM Failed to allocate new process
217 *
218 * Note: After a successful call to this function, the parent process
219 * has its task, thread, and uthread lent to the child process,
220 * and control is returned to the caller; if this function is
221 * invoked as a system call, the return is to user space, and
222 * is effectively running on the child process.
223 *
224 * Subsequent calls that operate on process state are permitted,
225 * though discouraged, and will operate on the child process; any
226 * operations on the task, thread, or uthread will result in
227 * changes in the parent state, and, if inheritable, the child
228 * state, when a task, thread, and uthread are realized for the
229 * child process at execve() time, will also be affected. Given
230 * this, it's recommended that people use the posix_spawn() call
231 * instead.
b0d623f7
A
232 *
233 * BLOCK DIAGRAM OF VFORK
234 *
235 * Before:
236 *
237 * ,----------------. ,-------------.
238 * | | task | |
239 * | parent_thread | ------> | parent_task |
240 * | | <.list. | |
241 * `----------------' `-------------'
242 * uthread | ^ bsd_info | ^
243 * v | vc_thread v | task
244 * ,----------------. ,-------------.
245 * | | | |
246 * | parent_uthread | <.list. | parent_proc | <-- current_proc()
247 * | | | |
248 * `----------------' `-------------'
249 * uu_proc |
250 * v
251 * NULL
252 *
253 * After:
254 *
255 * ,----------------. ,-------------.
256 * | | task | |
257 * ,----> | parent_thread | ------> | parent_task |
258 * | | | <.list. | |
259 * | `----------------' `-------------'
260 * | uthread | ^ bsd_info | ^
261 * | v | vc_thread v | task
262 * | ,----------------. ,-------------.
263 * | | | | |
264 * | | parent_uthread | <.list. | parent_proc |
265 * | | | | |
266 * | `----------------' `-------------'
267 * | uu_proc | . list
268 * | v v
269 * | ,----------------.
270 * `----- | |
271 * p_vforkact | child_proc | <-- current_proc()
272 * | |
273 * `----------------'
274 */
275int
276vfork(proc_t parent_proc, __unused struct vfork_args *uap, int32_t *retval)
277{
278 thread_t child_thread;
279 int err;
280
281 if ((err = fork1(parent_proc, &child_thread, PROC_CREATE_VFORK)) != 0) {
282 retval[1] = 0;
283 } else {
39236c6e
A
284 uthread_t ut = get_bsdthread_info(current_thread());
285 proc_t child_proc = ut->uu_proc;
b0d623f7
A
286
287 retval[0] = child_proc->p_pid;
288 retval[1] = 1; /* flag child return for user space */
289
290 /*
291 * Drop the signal lock on the child which was taken on our
292 * behalf by forkproc()/cloneproc() to prevent signals being
293 * received by the child in a partially constructed state.
294 */
295 proc_signalend(child_proc, 0);
296 proc_transend(child_proc, 0);
297
b0d623f7
A
298 proc_knote(parent_proc, NOTE_FORK | child_proc->p_pid);
299 DTRACE_PROC1(create, proc_t, child_proc);
39236c6e 300 ut->uu_flag &= ~UT_VFORKING;
b0d623f7
A
301 }
302
39236c6e 303 return (err);
b0d623f7
A
304}
305
306
307/*
308 * fork1
309 *
310 * Description: common code used by all new process creation other than the
311 * bootstrap of the initial process on the system
312 *
313 * Parameters: parent_proc parent process of the process being
314 * child_threadp pointer to location to receive the
315 * Mach thread_t of the child process
316 * breated
317 * kind kind of creation being requested
318 *
319 * Notes: Permissable values for 'kind':
320 *
321 * PROC_CREATE_FORK Create a complete process which will
322 * return actively running in both the
323 * parent and the child; the child copies
324 * the parent address space.
325 * PROC_CREATE_SPAWN Create a complete process which will
326 * return actively running in the parent
327 * only after returning actively running
328 * in the child; the child address space
329 * is newly created by an image activator,
330 * after which the child is run.
331 * PROC_CREATE_VFORK Creates a partial process which will
332 * borrow the parent task, thread, and
333 * uthread to return running in the child;
334 * the child address space and other parts
335 * are lazily created at execve() time, or
336 * the child is terminated, and the parent
337 * does not actively run until that
338 * happens.
339 *
340 * At first it may seem strange that we return the child thread
341 * address rather than process structure, since the process is
342 * the only part guaranteed to be "new"; however, since we do
343 * not actualy adjust other references between Mach and BSD (see
344 * the block diagram above the implementation of vfork()), this
345 * is the only method which guarantees us the ability to get
346 * back to the other information.
1c79356b
A
347 */
348int
b0d623f7 349fork1(proc_t parent_proc, thread_t *child_threadp, int kind)
1c79356b 350{
b0d623f7
A
351 thread_t parent_thread = (thread_t)current_thread();
352 uthread_t parent_uthread = (uthread_t)get_bsdthread_info(parent_thread);
353 proc_t child_proc = NULL; /* set in switch, but compiler... */
354 thread_t child_thread = NULL;
2d21ac55 355 uid_t uid;
0b4e3aa0 356 int count;
b0d623f7
A
357 int err = 0;
358 int spawn = 0;
91447636 359
0b4e3aa0
A
360 /*
361 * Although process entries are dynamically created, we still keep
362 * a global limit on the maximum number we will create. Don't allow
363 * a nonprivileged user to use the last process; don't let root
364 * exceed the limit. The variable nprocs is the current number of
365 * processes, maxproc is the limit.
366 */
6d2010ae 367 uid = kauth_getruid();
2d21ac55 368 proc_list_lock();
0b4e3aa0 369 if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
2d21ac55 370 proc_list_unlock();
0b4e3aa0 371 tablefull("proc");
0b4e3aa0
A
372 return (EAGAIN);
373 }
2d21ac55 374 proc_list_unlock();
0b4e3aa0
A
375
376 /*
377 * Increment the count of procs running with this uid. Don't allow
2d21ac55
A
378 * a nonprivileged user to exceed their current limit, which is
379 * always less than what an rlim_t can hold.
380 * (locking protection is provided by list lock held in chgproccnt)
0b4e3aa0
A
381 */
382 count = chgproccnt(uid, 1);
2d21ac55 383 if (uid != 0 &&
b0d623f7
A
384 (rlim_t)count > parent_proc->p_rlimit[RLIMIT_NPROC].rlim_cur) {
385 err = EAGAIN;
386 goto bad;
0b4e3aa0 387 }
2d21ac55
A
388
389#if CONFIG_MACF
390 /*
391 * Determine if MAC policies applied to the process will allow
b0d623f7 392 * it to fork. This is an advisory-only check.
2d21ac55 393 */
b0d623f7 394 err = mac_proc_check_fork(parent_proc);
2d21ac55 395 if (err != 0) {
b0d623f7 396 goto bad;
2d21ac55
A
397 }
398#endif
399
b0d623f7
A
400 switch(kind) {
401 case PROC_CREATE_VFORK:
402 /*
403 * Prevent a vfork while we are in vfork(); we should
404 * also likely preventing a fork here as well, and this
405 * check should then be outside the switch statement,
406 * since the proc struct contents will copy from the
407 * child and the tash/thread/uthread from the parent in
408 * that case. We do not support vfork() in vfork()
409 * because we don't have to; the same non-requirement
410 * is true of both fork() and posix_spawn() and any
411 * call other than execve() amd _exit(), but we've
412 * been historically lenient, so we continue to be so
413 * (for now).
414 *
415 * <rdar://6640521> Probably a source of random panics
416 */
417 if (parent_uthread->uu_flag & UT_VFORK) {
418 printf("fork1 called within vfork by %s\n", parent_proc->p_comm);
419 err = EINVAL;
420 goto bad;
421 }
0b4e3aa0 422
2d21ac55 423 /*
b0d623f7
A
424 * Flag us in progress; if we chose to support vfork() in
425 * vfork(), we would chain our parent at this point (in
426 * effect, a stack push). We don't, since we actually want
427 * to disallow everything not specified in the standard
2d21ac55 428 */
b0d623f7
A
429 proc_vfork_begin(parent_proc);
430
431 /* The newly created process comes with signal lock held */
432 if ((child_proc = forkproc(parent_proc)) == NULL) {
433 /* Failed to allocate new process */
434 proc_vfork_end(parent_proc);
435 err = ENOMEM;
436 goto bad;
437 }
2d21ac55 438
b0d623f7 439// XXX BEGIN: wants to move to be common code (and safe)
2d21ac55 440#if CONFIG_MACF
b0d623f7
A
441 /*
442 * allow policies to associate the credential/label that
443 * we referenced from the parent ... with the child
444 * JMM - this really isn't safe, as we can drop that
445 * association without informing the policy in other
446 * situations (keep long enough to get policies changed)
447 */
448 mac_cred_label_associate_fork(child_proc->p_ucred, child_proc);
2d21ac55
A
449#endif
450
b0d623f7
A
451 /*
452 * Propogate change of PID - may get new cred if auditing.
453 *
454 * NOTE: This has no effect in the vfork case, since
455 * child_proc->task != current_task(), but we duplicate it
456 * because this is probably, ultimately, wrong, since we
457 * will be running in the "child" which is the parent task
458 * with the wrong token until we get to the execve() or
459 * _exit() call; a lot of "undefined" can happen before
460 * that.
461 *
462 * <rdar://6640530> disallow everything but exeve()/_exit()?
463 */
464 set_security_token(child_proc);
2d21ac55 465
b0d623f7 466 AUDIT_ARG(pid, child_proc->p_pid);
2d21ac55 467
b0d623f7 468// XXX END: wants to move to be common code (and safe)
2d21ac55 469
b0d623f7
A
470 /*
471 * BORROW PARENT TASK, THREAD, UTHREAD FOR CHILD
472 *
473 * Note: this is where we would "push" state instead of setting
474 * it for nested vfork() support (see proc_vfork_end() for
475 * description if issues here).
476 */
477 child_proc->task = parent_proc->task;
0b4e3aa0 478
b0d623f7
A
479 child_proc->p_lflag |= P_LINVFORK;
480 child_proc->p_vforkact = parent_thread;
481 child_proc->p_stat = SRUN;
0b4e3aa0 482
39236c6e
A
483 /*
484 * Until UT_VFORKING is cleared at the end of the vfork
485 * syscall, the process identity of this thread is slightly
486 * murky.
487 *
488 * As long as UT_VFORK and it's associated field (uu_proc)
489 * is set, current_proc() will always return the child process.
490 *
491 * However dtrace_proc_selfpid() returns the parent pid to
492 * ensure that e.g. the proc:::create probe actions accrue
493 * to the parent. (Otherwise the child magically seems to
494 * have created itself!)
495 */
496 parent_uthread->uu_flag |= UT_VFORK | UT_VFORKING;
b0d623f7
A
497 parent_uthread->uu_proc = child_proc;
498 parent_uthread->uu_userstate = (void *)act_thread_csave();
499 parent_uthread->uu_vforkmask = parent_uthread->uu_sigmask;
0b4e3aa0 500
b0d623f7
A
501 /* temporarily drop thread-set-id state */
502 if (parent_uthread->uu_flag & UT_SETUID) {
503 parent_uthread->uu_flag |= UT_WASSETUID;
504 parent_uthread->uu_flag &= ~UT_SETUID;
505 }
0b4e3aa0 506
b0d623f7
A
507 /* blow thread state information */
508 /* XXX is this actually necessary, given syscall return? */
509 thread_set_child(parent_thread, child_proc->p_pid);
510
511 child_proc->p_acflag = AFORK; /* forked but not exec'ed */
512
513 /*
514 * Preserve synchronization semantics of vfork. If
515 * waiting for child to exec or exit, set P_PPWAIT
516 * on child, and sleep on our proc (in case of exit).
517 */
518 child_proc->p_lflag |= P_LPPWAIT;
519 pinsertchild(parent_proc, child_proc); /* set visible */
520
521 break;
522
523 case PROC_CREATE_SPAWN:
524 /*
525 * A spawned process differs from a forked process in that
526 * the spawned process does not carry around the parents
527 * baggage with regard to address space copying, dtrace,
528 * and so on.
529 */
530 spawn = 1;
531
532 /* FALLSTHROUGH */
533
534 case PROC_CREATE_FORK:
535 /*
536 * When we clone the parent process, we are going to inherit
537 * its task attributes and memory, since when we fork, we
538 * will, in effect, create a duplicate of it, with only minor
539 * differences. Contrarily, spawned processes do not inherit.
540 */
39236c6e 541 if ((child_thread = cloneproc(parent_proc->task, parent_proc, spawn ? FALSE : TRUE, FALSE)) == NULL) {
b0d623f7
A
542 /* Failed to create thread */
543 err = EAGAIN;
544 goto bad;
545 }
546
547 /* copy current thread state into the child thread (only for fork) */
548 if (!spawn) {
549 thread_dup(child_thread);
550 }
551
552 /* child_proc = child_thread->task->proc; */
553 child_proc = (proc_t)(get_bsdtask_info(get_threadtask(child_thread)));
0b4e3aa0 554
b0d623f7
A
555// XXX BEGIN: wants to move to be common code (and safe)
556#if CONFIG_MACF
557 /*
558 * allow policies to associate the credential/label that
559 * we referenced from the parent ... with the child
560 * JMM - this really isn't safe, as we can drop that
561 * association without informing the policy in other
562 * situations (keep long enough to get policies changed)
563 */
564 mac_cred_label_associate_fork(child_proc->p_ucred, child_proc);
565#endif
566
567 /*
568 * Propogate change of PID - may get new cred if auditing.
569 *
570 * NOTE: This has no effect in the vfork case, since
571 * child_proc->task != current_task(), but we duplicate it
572 * because this is probably, ultimately, wrong, since we
573 * will be running in the "child" which is the parent task
574 * with the wrong token until we get to the execve() or
575 * _exit() call; a lot of "undefined" can happen before
576 * that.
577 *
578 * <rdar://6640530> disallow everything but exeve()/_exit()?
579 */
580 set_security_token(child_proc);
0b4e3aa0 581
b0d623f7 582 AUDIT_ARG(pid, child_proc->p_pid);
2d21ac55 583
b0d623f7
A
584// XXX END: wants to move to be common code (and safe)
585
586 /*
587 * Blow thread state information; this is what gives the child
588 * process its "return" value from a fork() call.
589 *
590 * Note: this should probably move to fork() proper, since it
591 * is not relevent to spawn, and the value won't matter
592 * until we resume the child there. If you are in here
593 * refactoring code, consider doing this at the same time.
594 */
595 thread_set_child(child_thread, child_proc->p_pid);
596
597 child_proc->p_acflag = AFORK; /* forked but not exec'ed */
598
599// <rdar://6598155> dtrace code cleanup needed
600#if CONFIG_DTRACE
601 /*
602 * This code applies to new processes who are copying the task
603 * and thread state and address spaces of their parent process.
604 */
605 if (!spawn) {
606// <rdar://6598155> call dtrace specific function here instead of all this...
607 /*
608 * APPLE NOTE: Solaris does a sprlock() and drops the
609 * proc_lock here. We're cheating a bit and only taking
610 * the p_dtrace_sprlock lock. A full sprlock would
611 * task_suspend the parent.
612 */
613 lck_mtx_lock(&parent_proc->p_dtrace_sprlock);
614
615 /*
616 * Remove all DTrace tracepoints from the child process. We
617 * need to do this _before_ duplicating USDT providers since
618 * any associated probes may be immediately enabled.
619 */
620 if (parent_proc->p_dtrace_count > 0) {
621 dtrace_fasttrap_fork(parent_proc, child_proc);
622 }
623
624 lck_mtx_unlock(&parent_proc->p_dtrace_sprlock);
625
626 /*
627 * Duplicate any lazy dof(s). This must be done while NOT
628 * holding the parent sprlock! Lock ordering is
629 * dtrace_dof_mode_lock, then sprlock. It is imperative we
630 * always call dtrace_lazy_dofs_duplicate, rather than null
631 * check and call if !NULL. If we NULL test, during lazy dof
632 * faulting we can race with the faulting code and proceed
633 * from here to beyond the helpers copy. The lazy dof
634 * faulting will then fail to copy the helpers to the child
635 * process.
636 */
637 dtrace_lazy_dofs_duplicate(parent_proc, child_proc);
638
639 /*
640 * Duplicate any helper actions and providers. The SFORKING
641 * we set above informs the code to enable USDT probes that
642 * sprlock() may fail because the child is being forked.
643 */
644 /*
645 * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
646 * never fails to find the child. We do not set SFORKING.
647 */
648 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
649 (*dtrace_helpers_fork)(parent_proc, child_proc);
650 }
651
652 }
653#endif /* CONFIG_DTRACE */
654
655 break;
656
657 default:
658 panic("fork1 called with unknown kind %d", kind);
659 break;
660 }
661
662
663 /* return the thread pointer to the caller */
664 *child_threadp = child_thread;
665
666bad:
667 /*
668 * In the error case, we return a 0 value for the returned pid (but
669 * it is ignored in the trampoline due to the error return); this
670 * is probably not necessary.
671 */
672 if (err) {
673 (void)chgproccnt(uid, -1);
674 }
0b4e3aa0 675
b0d623f7 676 return (err);
1c79356b
A
677}
678
b0d623f7 679
0b4e3aa0 680/*
2d21ac55
A
681 * vfork_return
682 *
683 * Description: "Return" to parent vfork thread() following execve/_exit;
684 * this is done by reassociating the parent process structure
685 * with the task, thread, and uthread.
686 *
316670eb
A
687 * Refer to the ASCII art above vfork() to figure out the
688 * state we're undoing.
689 *
b0d623f7 690 * Parameters: child_proc Child process
2d21ac55
A
691 * retval System call return value array
692 * rval Return value to present to parent
693 *
694 * Returns: void
695 *
316670eb
A
696 * Notes: The caller resumes or exits the parent, as appropriate, after
697 * calling this function.
0b4e3aa0
A
698 */
699void
b0d623f7 700vfork_return(proc_t child_proc, int32_t *retval, int rval)
0b4e3aa0 701{
316670eb
A
702 task_t parent_task = get_threadtask(child_proc->p_vforkact);
703 proc_t parent_proc = get_bsdtask_info(parent_task);
704 thread_t th = current_thread();
705 uthread_t uth = get_bsdthread_info(th);
0b4e3aa0 706
316670eb 707 act_thread_catt(uth->uu_userstate);
0b4e3aa0 708
316670eb 709 /* clear vfork state in parent proc structure */
b0d623f7
A
710 proc_vfork_end(parent_proc);
711
712 /* REPATRIATE PARENT TASK, THREAD, UTHREAD */
316670eb
A
713 uth->uu_userstate = 0;
714 uth->uu_flag &= ~UT_VFORK;
91447636 715 /* restore thread-set-id state */
316670eb
A
716 if (uth->uu_flag & UT_WASSETUID) {
717 uth->uu_flag |= UT_SETUID;
718 uth->uu_flag &= UT_WASSETUID;
91447636 719 }
316670eb
A
720 uth->uu_proc = 0;
721 uth->uu_sigmask = uth->uu_vforkmask;
722
723 proc_lock(child_proc);
724 child_proc->p_lflag &= ~P_LINVFORK;
725 child_proc->p_vforkact = 0;
726 proc_unlock(child_proc);
0b4e3aa0 727
316670eb 728 thread_set_parent(th, rval);
0b4e3aa0
A
729
730 if (retval) {
2d21ac55 731 retval[0] = rval;
0b4e3aa0
A
732 retval[1] = 0; /* mark parent */
733 }
0b4e3aa0
A
734}
735
2d21ac55
A
736
737/*
738 * fork_create_child
739 *
740 * Description: Common operations associated with the creation of a child
741 * process
742 *
743 * Parameters: parent_task parent task
b0d623f7 744 * child_proc child process
2d21ac55
A
745 * inherit_memory TRUE, if the parents address space is
746 * to be inherited by the child
747 * is64bit TRUE, if the child being created will
748 * be associated with a 64 bit process
749 * rather than a 32 bit process
750 *
751 * Note: This code is called in the fork() case, from the execve() call
752 * graph, if implementing an execve() following a vfork(), from
753 * the posix_spawn() call graph (which implicitly includes a
754 * vfork() equivalent call, and in the system bootstrap case.
755 *
756 * It creates a new task and thread (and as a side effect of the
757 * thread creation, a uthread), which is then associated with the
758 * process 'child'. If the parent process address space is to
759 * be inherited, then a flag indicates that the newly created
760 * task should inherit this from the child task.
761 *
762 * As a special concession to bootstrapping the initial process
763 * in the system, it's possible for 'parent_task' to be TASK_NULL;
764 * in this case, 'inherit_memory' MUST be FALSE.
765 */
91447636 766thread_t
b0d623f7 767fork_create_child(task_t parent_task, proc_t child_proc, int inherit_memory, int is64bit)
0b4e3aa0 768{
2d21ac55
A
769 thread_t child_thread = NULL;
770 task_t child_task;
771 kern_return_t result;
772
773 /* Create a new task for the child process */
774 result = task_create_internal(parent_task,
775 inherit_memory,
776 is64bit,
777 &child_task);
778 if (result != KERN_SUCCESS) {
39236c6e
A
779 printf("%s: task_create_internal failed. Code: %d\n",
780 __func__, result);
2d21ac55
A
781 goto bad;
782 }
0b4e3aa0 783
b0d623f7
A
784 /* Set the child process task to the new task */
785 child_proc->task = child_task;
2d21ac55 786
b0d623f7
A
787 /* Set child task process to child proc */
788 set_bsdtask_info(child_task, child_proc);
2d21ac55
A
789
790 /* Propagate CPU limit timer from parent */
b0d623f7 791 if (timerisset(&child_proc->p_rlim_cpu))
2d21ac55
A
792 task_vtimer_set(child_task, TASK_VTIMER_RLIM);
793
794 /* Set/clear 64 bit vm_map flag */
795 if (is64bit)
796 vm_map_set_64bit(get_task_map(child_task));
0b4e3aa0 797 else
2d21ac55
A
798 vm_map_set_32bit(get_task_map(child_task));
799
800#if CONFIG_MACF
801 /* Update task for MAC framework */
802 /* valid to use p_ucred as child is still not running ... */
b0d623f7 803 mac_task_label_update_cred(child_proc->p_ucred, child_task);
2d21ac55
A
804#endif
805
b0d623f7
A
806 /*
807 * Set child process BSD visible scheduler priority if nice value
808 * inherited from parent
809 */
810 if (child_proc->p_nice != 0)
811 resetpriority(child_proc);
0b4e3aa0 812
2d21ac55
A
813 /* Create a new thread for the child process */
814 result = thread_create(child_task, &child_thread);
815 if (result != KERN_SUCCESS) {
39236c6e
A
816 printf("%s: thread_create failed. Code: %d\n",
817 __func__, result);
2d21ac55
A
818 task_deallocate(child_task);
819 child_task = NULL;
820 }
4b17d6b6
A
821
822 /*
39236c6e
A
823 * Tag thread as being the first thread in its task.
824 */
4b17d6b6
A
825 thread_set_tag(child_thread, THREAD_TAG_MAINTHREAD);
826
2d21ac55
A
827bad:
828 thread_yield_internal(1);
829
830 return(child_thread);
0b4e3aa0
A
831}
832
833
2d21ac55
A
834/*
835 * fork
836 *
837 * Description: fork system call.
838 *
839 * Parameters: parent Parent process to fork
840 * uap (void) [unused]
841 * retval Return value
842 *
843 * Returns: 0 Success
844 * EAGAIN Resource unavailable, try again
b0d623f7
A
845 *
846 * Notes: Attempts to create a new child process which inherits state
847 * from the parent process. If successful, the call returns
848 * having created an initially suspended child process with an
849 * extra Mach task and thread reference, for which the thread
850 * is initially suspended. Until we resume the child process,
851 * it is not yet running.
852 *
853 * The return information to the child is contained in the
854 * thread state structure of the new child, and does not
855 * become visible to the child through a normal return process,
856 * since it never made the call into the kernel itself in the
857 * first place.
858 *
859 * After resuming the thread, this function returns directly to
860 * the parent process which invoked the fork() system call.
861 *
862 * Important: The child thread_resume occurs before the parent returns;
863 * depending on scheduling latency, this means that it is not
864 * deterministic as to whether the parent or child is scheduled
865 * to run first. It is entirely possible that the child could
866 * run to completion prior to the parent running.
2d21ac55
A
867 */
868int
b0d623f7 869fork(proc_t parent_proc, __unused struct fork_args *uap, int32_t *retval)
2d21ac55 870{
b0d623f7 871 thread_t child_thread;
2d21ac55 872 int err;
1c79356b 873
b0d623f7 874 retval[1] = 0; /* flag parent return for user space */
1c79356b 875
b0d623f7
A
876 if ((err = fork1(parent_proc, &child_thread, PROC_CREATE_FORK)) == 0) {
877 task_t child_task;
878 proc_t child_proc;
2d21ac55 879
b0d623f7
A
880 /* Return to the parent */
881 child_proc = (proc_t)get_bsdthreadtask_info(child_thread);
882 retval[0] = child_proc->p_pid;
2d21ac55 883
b0d623f7
A
884 /*
885 * Drop the signal lock on the child which was taken on our
886 * behalf by forkproc()/cloneproc() to prevent signals being
887 * received by the child in a partially constructed state.
888 */
889 proc_signalend(child_proc, 0);
890 proc_transend(child_proc, 0);
2d21ac55 891
b0d623f7
A
892 /* flag the fork has occurred */
893 proc_knote(parent_proc, NOTE_FORK | child_proc->p_pid);
894 DTRACE_PROC1(create, proc_t, child_proc);
2d21ac55 895
b0d623f7
A
896 /* "Return" to the child */
897 (void)thread_resume(child_thread);
2d21ac55 898
b0d623f7
A
899 /* drop the extra references we got during the creation */
900 if ((child_task = (task_t)get_threadtask(child_thread)) != NULL) {
901 task_deallocate(child_task);
902 }
903 thread_deallocate(child_thread);
2d21ac55
A
904 }
905
b0d623f7 906 return(err);
1c79356b
A
907}
908
b0d623f7 909
1c79356b 910/*
2d21ac55
A
911 * cloneproc
912 *
913 * Description: Create a new process from a specified process.
914 *
b0d623f7
A
915 * Parameters: parent_task The parent task to be cloned, or
916 * TASK_NULL is task characteristics
917 * are not to be inherited
918 * be cloned, or TASK_NULL if the new
919 * task is not to inherit the VM
920 * characteristics of the parent
921 * parent_proc The parent process to be cloned
922 * inherit_memory True if the child is to inherit
923 * memory from the parent; if this is
924 * non-NULL, then the parent_task must
925 * also be non-NULL
39236c6e
A
926 * memstat_internal Whether to track the process in the
927 * jetsam priority list (if configured)
1c79356b 928 *
2d21ac55
A
929 * Returns: !NULL pointer to new child thread
930 * NULL Failure (unspecified)
931 *
932 * Note: On return newly created child process has signal lock held
933 * to block delivery of signal to it if called with lock set.
934 * fork() code needs to explicity remove this lock before
935 * signals can be delivered
936 *
937 * In the case of bootstrap, this function can be called from
938 * bsd_utaskbootstrap() in order to bootstrap the first process;
939 * the net effect is to provide a uthread structure for the
b0d623f7
A
940 * kernel process associated with the kernel task.
941 *
942 * XXX: Tristating using the value parent_task as the major key
943 * and inherit_memory as the minor key is something we should
944 * refactor later; we owe the current semantics, ultimately,
945 * to the semantics of task_create_internal. For now, we will
946 * live with this being somewhat awkward.
1c79356b 947 */
91447636 948thread_t
39236c6e 949cloneproc(task_t parent_task, proc_t parent_proc, int inherit_memory, int memstat_internal)
0b4e3aa0 950{
39236c6e
A
951#if !CONFIG_MEMORYSTATUS
952#pragma unused(memstat_internal)
953#endif
b0d623f7
A
954 task_t child_task;
955 proc_t child_proc;
956 thread_t child_thread = NULL;
0b4e3aa0 957
b0d623f7 958 if ((child_proc = forkproc(parent_proc)) == NULL) {
2d21ac55
A
959 /* Failed to allocate new process */
960 goto bad;
961 }
9bccf70c 962
b0d623f7
A
963 child_thread = fork_create_child(parent_task, child_proc, inherit_memory, (parent_task == TASK_NULL) ? FALSE : (parent_proc->p_flag & P_LP64));
964
965 if (child_thread == NULL) {
2d21ac55
A
966 /*
967 * Failed to create thread; now we must deconstruct the new
968 * process previously obtained from forkproc().
969 */
b0d623f7 970 forkproc_free(child_proc);
2d21ac55
A
971 goto bad;
972 }
9bccf70c 973
b0d623f7
A
974 child_task = get_threadtask(child_thread);
975 if (parent_proc->p_flag & P_LP64) {
976 task_set_64bit(child_task, TRUE);
977 OSBitOrAtomic(P_LP64, (UInt32 *)&child_proc->p_flag);
b0d623f7
A
978 } else {
979 task_set_64bit(child_task, FALSE);
980 OSBitAndAtomic(~((uint32_t)P_LP64), (UInt32 *)&child_proc->p_flag);
981 }
982
39236c6e
A
983#if CONFIG_MEMORYSTATUS
984 if (memstat_internal) {
985 proc_list_lock();
986 child_proc->p_memstat_state |= P_MEMSTAT_INTERNAL;
987 proc_list_unlock();
988 }
989#endif
990
2d21ac55 991 /* make child visible */
b0d623f7 992 pinsertchild(parent_proc, child_proc);
0b4e3aa0 993
0b4e3aa0
A
994 /*
995 * Make child runnable, set start time.
996 */
b0d623f7 997 child_proc->p_stat = SRUN;
2d21ac55 998bad:
b0d623f7 999 return(child_thread);
0b4e3aa0
A
1000}
1001
b0d623f7 1002
2d21ac55
A
1003/*
1004 * Destroy a process structure that resulted from a call to forkproc(), but
1005 * which must be returned to the system because of a subsequent failure
1006 * preventing it from becoming active.
1007 *
1008 * Parameters: p The incomplete process from forkproc()
2d21ac55
A
1009 *
1010 * Returns: (void)
1011 *
1012 * Note: This function should only be used in an error handler following
b0d623f7 1013 * a call to forkproc().
2d21ac55
A
1014 *
1015 * Operations occur in reverse order of those in forkproc().
1016 */
1017void
b0d623f7 1018forkproc_free(proc_t p)
1c79356b 1019{
2d21ac55 1020
b0d623f7
A
1021 /* We held signal and a transition locks; drop them */
1022 proc_signalend(p, 0);
1023 proc_transend(p, 0);
1c79356b
A
1024
1025 /*
2d21ac55
A
1026 * If we have our own copy of the resource limits structure, we
1027 * need to free it. If it's a shared copy, we need to drop our
1028 * reference on it.
1c79356b 1029 */
2d21ac55
A
1030 proc_limitdrop(p, 0);
1031 p->p_limit = NULL;
1032
1033#if SYSV_SHM
1034 /* Need to drop references to the shared memory segment(s), if any */
1035 if (p->vm_shm) {
1036 /*
1037 * Use shmexec(): we have no address space, so no mappings
1038 *
1039 * XXX Yes, the routine is badly named.
1040 */
1041 shmexec(p);
1042 }
1043#endif
1044
1045 /* Need to undo the effects of the fdcopy(), if any */
1046 fdfree(p);
1047
1048 /*
1049 * Drop the reference on a text vnode pointer, if any
1050 * XXX This code is broken in forkproc(); see <rdar://4256419>;
1051 * XXX if anyone ever uses this field, we will be extremely unhappy.
1052 */
1053 if (p->p_textvp) {
1054 vnode_rele(p->p_textvp);
1055 p->p_textvp = NULL;
1056 }
1057
1058 /* Stop the profiling clock */
1059 stopprofclock(p);
1060
6d2010ae
A
1061 /* Update the audit session proc count */
1062 AUDIT_SESSION_PROCEXIT(p);
1063
2d21ac55
A
1064 /* Release the credential reference */
1065 kauth_cred_unref(&p->p_ucred);
1066
1067 proc_list_lock();
1068 /* Decrement the count of processes in the system */
1069 nprocs--;
1070 proc_list_unlock();
1071
1072 thread_call_free(p->p_rcall);
1073
1074 /* Free allocated memory */
1075 FREE_ZONE(p->p_sigacts, sizeof *p->p_sigacts, M_SIGACTS);
1076 FREE_ZONE(p->p_stats, sizeof *p->p_stats, M_PSTATS);
1077 proc_checkdeadrefs(p);
1078 FREE_ZONE(p, sizeof *p, M_PROC);
1079}
1080
1081
1082/*
1083 * forkproc
1084 *
1085 * Description: Create a new process structure, given a parent process
1086 * structure.
1087 *
b0d623f7 1088 * Parameters: parent_proc The parent process
2d21ac55
A
1089 *
1090 * Returns: !NULL The new process structure
1091 * NULL Error (insufficient free memory)
1092 *
1093 * Note: When successful, the newly created process structure is
1094 * partially initialized; if a caller needs to deconstruct the
1095 * returned structure, they must call forkproc_free() to do so.
1096 */
1097proc_t
b0d623f7 1098forkproc(proc_t parent_proc)
2d21ac55 1099{
b0d623f7 1100 proc_t child_proc; /* Our new process */
593a1d5f 1101 static int nextpid = 0, pidwrap = 0, nextpidversion = 0;
6d2010ae 1102 static uint64_t nextuniqueid = 0;
2d21ac55
A
1103 int error = 0;
1104 struct session *sessp;
b0d623f7 1105 uthread_t parent_uthread = (uthread_t)get_bsdthread_info(current_thread());
2d21ac55 1106
b0d623f7
A
1107 MALLOC_ZONE(child_proc, proc_t , sizeof *child_proc, M_PROC, M_WAITOK);
1108 if (child_proc == NULL) {
2d21ac55
A
1109 printf("forkproc: M_PROC zone exhausted\n");
1110 goto bad;
1111 }
1112 /* zero it out as we need to insert in hash */
b0d623f7 1113 bzero(child_proc, sizeof *child_proc);
2d21ac55 1114
b0d623f7
A
1115 MALLOC_ZONE(child_proc->p_stats, struct pstats *,
1116 sizeof *child_proc->p_stats, M_PSTATS, M_WAITOK);
1117 if (child_proc->p_stats == NULL) {
2d21ac55 1118 printf("forkproc: M_SUBPROC zone exhausted (p_stats)\n");
b0d623f7
A
1119 FREE_ZONE(child_proc, sizeof *child_proc, M_PROC);
1120 child_proc = NULL;
2d21ac55
A
1121 goto bad;
1122 }
b0d623f7
A
1123 MALLOC_ZONE(child_proc->p_sigacts, struct sigacts *,
1124 sizeof *child_proc->p_sigacts, M_SIGACTS, M_WAITOK);
1125 if (child_proc->p_sigacts == NULL) {
2d21ac55 1126 printf("forkproc: M_SUBPROC zone exhausted (p_sigacts)\n");
b0d623f7
A
1127 FREE_ZONE(child_proc->p_stats, sizeof *child_proc->p_stats, M_PSTATS);
1128 FREE_ZONE(child_proc, sizeof *child_proc, M_PROC);
1129 child_proc = NULL;
2d21ac55
A
1130 goto bad;
1131 }
b0d623f7
A
1132
1133 /* allocate a callout for use by interval timers */
1134 child_proc->p_rcall = thread_call_allocate((thread_call_func_t)realitexpire, child_proc);
1135 if (child_proc->p_rcall == NULL) {
1136 FREE_ZONE(child_proc->p_sigacts, sizeof *child_proc->p_sigacts, M_SIGACTS);
1137 FREE_ZONE(child_proc->p_stats, sizeof *child_proc->p_stats, M_PSTATS);
1138 FREE_ZONE(child_proc, sizeof *child_proc, M_PROC);
1139 child_proc = NULL;
2d21ac55
A
1140 goto bad;
1141 }
1142
1143
1144 /*
1145 * Find an unused PID.
1146 */
1147
1148 proc_list_lock();
1149
1c79356b
A
1150 nextpid++;
1151retry:
1152 /*
1153 * If the process ID prototype has wrapped around,
1154 * restart somewhat above 0, as the low-numbered procs
1155 * tend to include daemons that don't exit.
1156 */
1157 if (nextpid >= PID_MAX) {
1158 nextpid = 100;
2d21ac55 1159 pidwrap = 1;
1c79356b 1160 }
2d21ac55 1161 if (pidwrap != 0) {
1c79356b 1162
2d21ac55
A
1163 /* if the pid stays in hash both for zombie and runniing state */
1164 if (pfind_locked(nextpid) != PROC_NULL) {
1165 nextpid++;
1166 goto retry;
1c79356b 1167 }
1c79356b 1168
2d21ac55
A
1169 if (pgfind_internal(nextpid) != PGRP_NULL) {
1170 nextpid++;
1171 goto retry;
1172 }
1173 if (session_find_internal(nextpid) != SESSION_NULL) {
1174 nextpid++;
1175 goto retry;
1176 }
1177 }
1c79356b 1178 nprocs++;
b0d623f7
A
1179 child_proc->p_pid = nextpid;
1180 child_proc->p_idversion = nextpidversion++;
6d2010ae
A
1181 /* kernel process is handcrafted and not from fork, so start from 1 */
1182 child_proc->p_uniqueid = ++nextuniqueid;
2d21ac55 1183#if 1
b0d623f7
A
1184 if (child_proc->p_pid != 0) {
1185 if (pfind_locked(child_proc->p_pid) != PROC_NULL)
2d21ac55
A
1186 panic("proc in the list already\n");
1187 }
1188#endif
1189 /* Insert in the hash */
b0d623f7
A
1190 child_proc->p_listflag |= (P_LIST_INHASH | P_LIST_INCREATE);
1191 LIST_INSERT_HEAD(PIDHASH(child_proc->p_pid), child_proc, p_hash);
2d21ac55
A
1192 proc_list_unlock();
1193
1194
1195 /*
1196 * We've identified the PID we are going to use; initialize the new
1197 * process structure.
1198 */
b0d623f7
A
1199 child_proc->p_stat = SIDL;
1200 child_proc->p_pgrpid = PGRPID_DEAD;
1c79356b
A
1201
1202 /*
b0d623f7
A
1203 * The zero'ing of the proc was at the allocation time due to need
1204 * for insertion to hash. Copy the section that is to be copied
1205 * directly from the parent.
1c79356b 1206 */
b0d623f7
A
1207 bcopy(&parent_proc->p_startcopy, &child_proc->p_startcopy,
1208 (unsigned) ((caddr_t)&child_proc->p_endcopy - (caddr_t)&child_proc->p_startcopy));
1c79356b 1209
55e303ae 1210 /*
91447636 1211 * Some flags are inherited from the parent.
1c79356b
A
1212 * Duplicate sub-structures as needed.
1213 * Increase reference counts on shared objects.
1214 * The p_stats and p_sigacts substructs are set in vm_fork.
1215 */
316670eb 1216 child_proc->p_flag = (parent_proc->p_flag & (P_LP64 | P_TRANSLATED | P_AFFINITY | P_DISABLE_ASLR | P_DELAYIDLESLEEP));
b0d623f7
A
1217 if (parent_proc->p_flag & P_PROFIL)
1218 startprofclock(child_proc);
316670eb 1219
39236c6e 1220 child_proc->p_vfs_iopolicy = (parent_proc->p_vfs_iopolicy & (P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY));
316670eb 1221
91447636
A
1222 /*
1223 * Note that if the current thread has an assumed identity, this
1224 * credential will be granted to the new process.
1225 */
b0d623f7 1226 child_proc->p_ucred = kauth_cred_get_with_ref();
6d2010ae
A
1227 /* update cred on proc */
1228 PROC_UPDATE_CREDS_ONPROC(child_proc);
1229 /* update audit session proc count */
1230 AUDIT_SESSION_PROCNEW(child_proc);
91447636 1231
6d2010ae 1232#if CONFIG_FINE_LOCK_GROUPS
b0d623f7
A
1233 lck_mtx_init(&child_proc->p_mlock, proc_mlock_grp, proc_lck_attr);
1234 lck_mtx_init(&child_proc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr);
1235#if CONFIG_DTRACE
1236 lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr);
1237#endif
1238 lck_spin_init(&child_proc->p_slock, proc_slock_grp, proc_lck_attr);
6d2010ae
A
1239#else /* !CONFIG_FINE_LOCK_GROUPS */
1240 lck_mtx_init(&child_proc->p_mlock, proc_lck_grp, proc_lck_attr);
1241 lck_mtx_init(&child_proc->p_fdmlock, proc_lck_grp, proc_lck_attr);
1242#if CONFIG_DTRACE
1243 lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr);
1244#endif
1245 lck_spin_init(&child_proc->p_slock, proc_lck_grp, proc_lck_attr);
1246#endif /* !CONFIG_FINE_LOCK_GROUPS */
b0d623f7 1247 klist_init(&child_proc->p_klist);
2d21ac55 1248
b0d623f7 1249 if (child_proc->p_textvp != NULLVP) {
2d21ac55
A
1250 /* bump references to the text vnode */
1251 /* Need to hold iocount across the ref call */
b0d623f7
A
1252 if (vnode_getwithref(child_proc->p_textvp) == 0) {
1253 error = vnode_ref(child_proc->p_textvp);
1254 vnode_put(child_proc->p_textvp);
2d21ac55 1255 if (error != 0)
b0d623f7 1256 child_proc->p_textvp = NULLVP;
2d21ac55 1257 }
91447636 1258 }
2d21ac55 1259
b0d623f7
A
1260 /*
1261 * Copy the parents per process open file table to the child; if
1262 * there is a per-thread current working directory, set the childs
1263 * per-process current working directory to that instead of the
1264 * parents.
1265 *
1266 * XXX may fail to copy descriptors to child
1267 */
1268 child_proc->p_fd = fdcopy(parent_proc, parent_uthread->uu_cdir);
91447636 1269
2d21ac55 1270#if SYSV_SHM
b0d623f7 1271 if (parent_proc->vm_shm) {
91447636 1272 /* XXX may fail to attach shm to child */
b0d623f7 1273 (void)shmfork(parent_proc, child_proc);
1c79356b 1274 }
2d21ac55 1275#endif
1c79356b 1276 /*
2d21ac55 1277 * inherit the limit structure to child
1c79356b 1278 */
b0d623f7 1279 proc_limitfork(parent_proc, child_proc);
2d21ac55 1280
b0d623f7
A
1281 if (child_proc->p_limit->pl_rlimit[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
1282 uint64_t rlim_cur = child_proc->p_limit->pl_rlimit[RLIMIT_CPU].rlim_cur;
1283 child_proc->p_rlim_cpu.tv_sec = (rlim_cur > __INT_MAX__) ? __INT_MAX__ : rlim_cur;
1c79356b
A
1284 }
1285
b0d623f7
A
1286 /* Intialize new process stats, including start time */
1287 /* <rdar://6640543> non-zeroed portion contains garbage AFAICT */
39236c6e
A
1288 bzero(child_proc->p_stats, sizeof(*child_proc->p_stats));
1289 microtime_with_abstime(&child_proc->p_start, &child_proc->p_stats->ps_start);
b0d623f7
A
1290
1291 if (parent_proc->p_sigacts != NULL)
1292 (void)memcpy(child_proc->p_sigacts,
1293 parent_proc->p_sigacts, sizeof *child_proc->p_sigacts);
1c79356b 1294 else
b0d623f7 1295 (void)memset(child_proc->p_sigacts, 0, sizeof *child_proc->p_sigacts);
1c79356b 1296
b0d623f7
A
1297 sessp = proc_session(parent_proc);
1298 if (sessp->s_ttyvp != NULL && parent_proc->p_flag & P_CONTROLT)
1299 OSBitOrAtomic(P_CONTROLT, &child_proc->p_flag);
2d21ac55 1300 session_rele(sessp);
1c79356b 1301
b0d623f7
A
1302 /*
1303 * block all signals to reach the process.
1304 * no transition race should be occuring with the child yet,
1305 * but indicate that the process is in (the creation) transition.
1306 */
1307 proc_signalstart(child_proc, 0);
1308 proc_transstart(child_proc, 0);
1309
1310 child_proc->p_pcaction = (parent_proc->p_pcaction) & P_PCMAX;
1311 TAILQ_INIT(&child_proc->p_uthlist);
1312 TAILQ_INIT(&child_proc->p_aio_activeq);
1313 TAILQ_INIT(&child_proc->p_aio_doneq);
2d21ac55 1314
2d21ac55 1315 /* Inherit the parent flags for code sign */
c331a0be 1316 child_proc->p_csflags = (parent_proc->p_csflags & ~CS_KILLED);
b0d623f7
A
1317
1318 /*
1319 * All processes have work queue locks; cleaned up by
1320 * reap_child_locked()
1321 */
1322 workqueue_init_lock(child_proc);
1323
1324 /*
1325 * Copy work queue information
1326 *
1327 * Note: This should probably only happen in the case where we are
1328 * creating a child that is a copy of the parent; since this
1329 * routine is called in the non-duplication case of vfork()
1330 * or posix_spawn(), then this information should likely not
1331 * be duplicated.
1332 *
1333 * <rdar://6640553> Work queue pointers that no longer point to code
1334 */
1335 child_proc->p_wqthread = parent_proc->p_wqthread;
1336 child_proc->p_threadstart = parent_proc->p_threadstart;
1337 child_proc->p_pthsize = parent_proc->p_pthsize;
1338 child_proc->p_targconc = parent_proc->p_targconc;
1339 if ((parent_proc->p_lflag & P_LREGISTER) != 0) {
1340 child_proc->p_lflag |= P_LREGISTER;
1341 }
1342 child_proc->p_dispatchqueue_offset = parent_proc->p_dispatchqueue_offset;
39236c6e 1343 child_proc->p_dispatchqueue_serialno_offset = parent_proc->p_dispatchqueue_serialno_offset;
b0d623f7
A
1344#if PSYNCH
1345 pth_proc_hashinit(child_proc);
1346#endif /* PSYNCH */
2d21ac55
A
1347
1348#if CONFIG_LCTX
b0d623f7 1349 child_proc->p_lctx = NULL;
2d21ac55 1350 /* Add new process to login context (if any). */
b0d623f7
A
1351 if (parent_proc->p_lctx != NULL) {
1352 /*
1353 * <rdar://6640564> This should probably be delayed in the
1354 * vfork() or posix_spawn() cases.
1355 */
1356 LCTX_LOCK(parent_proc->p_lctx);
1357 enterlctx(child_proc, parent_proc->p_lctx, 0);
1c79356b
A
1358 }
1359#endif
1360
39236c6e
A
1361#if CONFIG_MEMORYSTATUS
1362 /* Memorystatus + jetsam init */
1363 child_proc->p_memstat_state = 0;
1364 child_proc->p_memstat_effectivepriority = JETSAM_PRIORITY_DEFAULT;
1365 child_proc->p_memstat_requestedpriority = JETSAM_PRIORITY_DEFAULT;
1366 child_proc->p_memstat_userdata = 0;
1367#if CONFIG_FREEZE
1368 child_proc->p_memstat_suspendedfootprint = 0;
1369#endif
1370 child_proc->p_memstat_dirty = 0;
1371 child_proc->p_memstat_idledeadline = 0;
1372#endif /* CONFIG_MEMORYSTATUS */
316670eb 1373
2d21ac55 1374bad:
b0d623f7 1375 return(child_proc);
1c79356b
A
1376}
1377
91447636
A
1378void
1379proc_lock(proc_t p)
1380{
1381 lck_mtx_lock(&p->p_mlock);
1382}
1383
1384void
1385proc_unlock(proc_t p)
1386{
1387 lck_mtx_unlock(&p->p_mlock);
1388}
1389
2d21ac55
A
1390void
1391proc_spinlock(proc_t p)
1392{
1393 lck_spin_lock(&p->p_slock);
1394}
1395
1396void
1397proc_spinunlock(proc_t p)
1398{
1399 lck_spin_unlock(&p->p_slock);
1400}
1401
1402void
1403proc_list_lock(void)
1404{
1405 lck_mtx_lock(proc_list_mlock);
1406}
1407
1408void
1409proc_list_unlock(void)
1410{
1411 lck_mtx_unlock(proc_list_mlock);
1412}
1413
1c79356b
A
1414#include <kern/zalloc.h>
1415
1416struct zone *uthread_zone;
2d21ac55 1417static int uthread_zone_inited = 0;
1c79356b 1418
2d21ac55 1419static void
91447636 1420uthread_zone_init(void)
1c79356b
A
1421{
1422 if (!uthread_zone_inited) {
1423 uthread_zone = zinit(sizeof(struct uthread),
b0d623f7 1424 thread_max * sizeof(struct uthread),
91447636
A
1425 THREAD_CHUNK * sizeof(struct uthread),
1426 "uthreads");
1c79356b
A
1427 uthread_zone_inited = 1;
1428 }
1429}
1430
1431void *
b0d623f7 1432uthread_alloc(task_t task, thread_t thread, int noinherit)
1c79356b 1433{
2d21ac55
A
1434 proc_t p;
1435 uthread_t uth;
1436 uthread_t uth_parent;
1c79356b
A
1437 void *ut;
1438
1439 if (!uthread_zone_inited)
1440 uthread_zone_init();
1441
1442 ut = (void *)zalloc(uthread_zone);
1443 bzero(ut, sizeof(struct uthread));
9bccf70c 1444
2d21ac55
A
1445 p = (proc_t) get_bsdtask_info(task);
1446 uth = (uthread_t)ut;
316670eb 1447 uth->uu_thread = thread;
9bccf70c 1448
91447636
A
1449 /*
1450 * Thread inherits credential from the creating thread, if both
1451 * are in the same task.
1452 *
1453 * If the creating thread has no credential or is from another
1454 * task we can leave the new thread credential NULL. If it needs
1455 * one later, it will be lazily assigned from the task's process.
1456 */
2d21ac55 1457 uth_parent = (uthread_t)get_bsdthread_info(current_thread());
b0d623f7 1458 if ((noinherit == 0) && task == current_task() &&
2d21ac55
A
1459 uth_parent != NULL &&
1460 IS_VALID_CRED(uth_parent->uu_ucred)) {
0c530ab8
A
1461 /*
1462 * XXX The new thread is, in theory, being created in context
1463 * XXX of parent thread, so a direct reference to the parent
1464 * XXX is OK.
1465 */
1466 kauth_cred_ref(uth_parent->uu_ucred);
91447636 1467 uth->uu_ucred = uth_parent->uu_ucred;
91447636
A
1468 /* the credential we just inherited is an assumed credential */
1469 if (uth_parent->uu_flag & UT_SETUID)
1470 uth->uu_flag |= UT_SETUID;
1471 } else {
b0d623f7
A
1472 /* sometimes workqueue threads are created out task context */
1473 if ((task != kernel_task) && (p != PROC_NULL))
1474 uth->uu_ucred = kauth_cred_proc_ref(p);
1475 else
1476 uth->uu_ucred = NOCRED;
91447636 1477 }
2d21ac55 1478
91447636 1479
2d21ac55 1480 if ((task != kernel_task) && p) {
91447636 1481
2d21ac55 1482 proc_lock(p);
b0d623f7
A
1483 if (noinherit != 0) {
1484 /* workq threads will not inherit masks */
1485 uth->uu_sigmask = ~workq_threadmask;
1486 } else if (uth_parent) {
91447636 1487 if (uth_parent->uu_flag & UT_SAS_OLDMASK)
9bccf70c
A
1488 uth->uu_sigmask = uth_parent->uu_oldmask;
1489 else
1490 uth->uu_sigmask = uth_parent->uu_sigmask;
1491 }
2d21ac55
A
1492 uth->uu_context.vc_thread = thread;
1493 TAILQ_INSERT_TAIL(&p->p_uthlist, uth, uu_list);
1494 proc_unlock(p);
1495
1496#if CONFIG_DTRACE
1497 if (p->p_dtrace_ptss_pages != NULL) {
1498 uth->t_dtrace_scratch = dtrace_ptss_claim_entry(p);
91447636 1499 }
316670eb
A
1500#endif
1501#if CONFIG_MACF
1502 mac_thread_label_init(uth);
2d21ac55 1503#endif
9bccf70c
A
1504 }
1505
1c79356b
A
1506 return (ut);
1507}
1508
0b4e3aa0 1509
2d21ac55
A
1510/*
1511 * This routine frees all the BSD context in uthread except the credential.
1512 * It does not free the uthread structure as well
1513 */
1c79356b 1514void
2d21ac55 1515uthread_cleanup(task_t task, void *uthread, void * bsd_info)
1c79356b
A
1516{
1517 struct _select *sel;
2d21ac55
A
1518 uthread_t uth = (uthread_t)uthread;
1519 proc_t p = (proc_t)bsd_info;
55e303ae 1520
593a1d5f 1521
b0d623f7 1522 if (uth->uu_lowpri_window || uth->uu_throttle_info) {
593a1d5f
A
1523 /*
1524 * task is marked as a low priority I/O type
1525 * and we've somehow managed to not dismiss the throttle
1526 * through the normal exit paths back to user space...
1527 * no need to throttle this thread since its going away
1528 * but we do need to update our bookeeping w/r to throttled threads
b0d623f7
A
1529 *
1530 * Calling this routine will clean up any throttle info reference
1531 * still inuse by the thread.
593a1d5f 1532 */
39236c6e 1533 throttle_lowpri_io(0);
593a1d5f 1534 }
55e303ae
A
1535 /*
1536 * Per-thread audit state should never last beyond system
1537 * call return. Since we don't audit the thread creation/
1538 * removal, the thread state pointer should never be
1539 * non-NULL when we get here.
1540 */
1541 assert(uth->uu_ar == NULL);
1c79356b 1542
91447636 1543 sel = &uth->uu_select;
1c79356b
A
1544 /* cleanup the select bit space */
1545 if (sel->nbytes) {
1546 FREE(sel->ibits, M_TEMP);
1547 FREE(sel->obits, M_TEMP);
2d21ac55
A
1548 sel->nbytes = 0;
1549 }
1550
1551 if (uth->uu_cdir) {
1552 vnode_rele(uth->uu_cdir);
1553 uth->uu_cdir = NULLVP;
1c79356b
A
1554 }
1555
2d21ac55
A
1556 if (uth->uu_allocsize && uth->uu_wqset){
1557 kfree(uth->uu_wqset, uth->uu_allocsize);
2d21ac55
A
1558 uth->uu_allocsize = 0;
1559 uth->uu_wqset = 0;
0b4e3aa0
A
1560 }
1561
b0d623f7
A
1562 if(uth->pth_name != NULL)
1563 {
1564 kfree(uth->pth_name, MAXTHREADNAMESIZE);
1565 uth->pth_name = 0;
1566 }
2d21ac55
A
1567 if ((task != kernel_task) && p) {
1568
1569 if (((uth->uu_flag & UT_VFORK) == UT_VFORK) && (uth->uu_proc != PROC_NULL)) {
1570 vfork_exit_internal(uth->uu_proc, 0, 1);
1571 }
b0d623f7
A
1572 /*
1573 * Remove the thread from the process list and
1574 * transfer [appropriate] pending signals to the process.
1575 */
2d21ac55
A
1576 if (get_bsdtask_info(task) == p) {
1577 proc_lock(p);
1578 TAILQ_REMOVE(&p->p_uthlist, uth, uu_list);
b0d623f7 1579 p->p_siglist |= (uth->uu_siglist & execmask & (~p->p_sigignore | sigcantmask));
2d21ac55
A
1580 proc_unlock(p);
1581 }
1582#if CONFIG_DTRACE
b0d623f7
A
1583 struct dtrace_ptss_page_entry *tmpptr = uth->t_dtrace_scratch;
1584 uth->t_dtrace_scratch = NULL;
1585 if (tmpptr != NULL) {
1586 dtrace_ptss_release_entry(p, tmpptr);
2d21ac55 1587 }
316670eb
A
1588#endif
1589#if CONFIG_MACF
1590 mac_thread_label_destroy(uth);
2d21ac55
A
1591#endif
1592 }
1593}
1594
1595/* This routine releases the credential stored in uthread */
1596void
1597uthread_cred_free(void *uthread)
1598{
1599 uthread_t uth = (uthread_t)uthread;
1600
1601 /* and free the uthread itself */
0c530ab8
A
1602 if (IS_VALID_CRED(uth->uu_ucred)) {
1603 kauth_cred_t oldcred = uth->uu_ucred;
1604 uth->uu_ucred = NOCRED;
1605 kauth_cred_unref(&oldcred);
1606 }
2d21ac55 1607}
e5568f75 1608
2d21ac55
A
1609/* This routine frees the uthread structure held in thread structure */
1610void
1611uthread_zone_free(void *uthread)
1612{
22ba694c
A
1613 uthread_t uth = (uthread_t)uthread;
1614
1615 if (uth->t_tombstone) {
1616 kfree(uth->t_tombstone, sizeof(struct doc_tombstone));
1617 uth->t_tombstone = NULL;
1618 }
1619
1c79356b 1620 /* and free the uthread itself */
91447636 1621 zfree(uthread_zone, uthread);
1c79356b 1622}