]> git.saurik.com Git - apple/xnu.git/blob - bsd/kern/kern_fork.c
xnu-1228.15.4.tar.gz
[apple/xnu.git] / bsd / kern / kern_fork.c
1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995, 1997 Apple Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1989, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)kern_fork.c 8.8 (Berkeley) 2/14/95
67 */
68 /*
69 * NOTICE: This file was modified by McAfee Research in 2004 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74 /*
75 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
76 * support for mandatory and extensible security protections. This notice
77 * is included in support of clause 2.2 (b) of the Apple Public License,
78 * Version 2.0.
79 */
80
81 #include <kern/assert.h>
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/filedesc.h>
85 #include <sys/kernel.h>
86 #include <sys/malloc.h>
87 #include <sys/proc_internal.h>
88 #include <sys/kauth.h>
89 #include <sys/user.h>
90 #include <sys/resourcevar.h>
91 #include <sys/vnode_internal.h>
92 #include <sys/file_internal.h>
93 #include <sys/acct.h>
94 #include <sys/codesign.h>
95 #include <sys/sysproto.h>
96 #if CONFIG_DTRACE
97 /* Do not include dtrace.h, it redefines kmem_[alloc/free] */
98 extern void dtrace_fasttrap_fork(proc_t, proc_t);
99 extern void (*dtrace_helpers_fork)(proc_t, proc_t);
100 extern void dtrace_lazy_dofs_duplicate(proc_t, proc_t);
101
102 #include <sys/dtrace_ptss.h>
103 #endif
104
105 #include <bsm/audit_kernel.h>
106
107 #include <mach/mach_types.h>
108 #include <kern/kern_types.h>
109 #include <kern/kalloc.h>
110 #include <kern/mach_param.h>
111 #include <kern/task.h>
112 #include <kern/thread_call.h>
113 #include <kern/zalloc.h>
114
115 #include <machine/spl.h>
116
117 #if CONFIG_MACF
118 #include <security/mac.h>
119 #include <security/mac_mach_internal.h>
120 #endif
121
122 #include <vm/vm_map.h>
123 #include <vm/vm_protos.h>
124 #include <vm/vm_shared_region.h>
125
126 #include <sys/shm_internal.h> /* for shmfork() */
127 #include <mach/task.h> /* for thread_create() */
128 #include <mach/thread_act.h> /* for thread_resume() */
129
130 #include <sys/sdt.h>
131
132 /* XXX routines which should have Mach prototypes, but don't */
133 void thread_set_parent(thread_t parent, int pid);
134 extern void act_thread_catt(void *ctx);
135 void thread_set_child(thread_t child, int pid);
136 void *act_thread_csave(void);
137
138
139 thread_t cloneproc(proc_t, int);
140 proc_t forkproc(proc_t, int);
141 void forkproc_free(proc_t, int);
142 thread_t procdup(proc_t parent, proc_t child);
143 thread_t fork_create_child(task_t parent_task, proc_t child, int inherit_memory, int is64bit);
144
145 #define DOFORK 0x1 /* fork() system call */
146 #define DOVFORK 0x2 /* vfork() system call */
147
148
149 /*
150 * vfork
151 *
152 * Description: vfork system call
153 *
154 * Parameters: void [no arguments]
155 *
156 * Retval: 0 (to child process)
157 * !0 pid of child (to parent process)
158 * -1 error (see "Returns:")
159 *
160 * Returns: EAGAIN Administrative limit reached
161 * EINVAL vfork() caled during vfork()
162 * ENOMEM Failed to allocate new process
163 *
164 * Note: After a successful call to this function, the parent process
165 * has its task, thread, and uthread lent to the child process,
166 * and control is returned to the caller; if this function is
167 * invoked as a system call, the return is to user space, and
168 * is effectively running on the child process.
169 *
170 * Subsequent calls that operate on process state are permitted,
171 * though discouraged, and will operate on the child process; any
172 * operations on the task, thread, or uthread will result in
173 * changes in the parent state, and, if inheritable, the child
174 * state, when a task, thread, and uthread are realized for the
175 * child process at execve() time, will also be effected. Given
176 * this, it's recemmended that people use the posix_spawn() call
177 * instead.
178 */
179 int
180 vfork(proc_t parent, __unused struct vfork_args *uap, register_t *retval)
181 {
182 proc_t child;
183 uid_t uid;
184 thread_t cur_act = (thread_t)current_thread();
185 int count;
186 uthread_t ut;
187 #if CONFIG_MACF
188 int err;
189 #endif
190
191 /*
192 * Although process entries are dynamically created, we still keep
193 * a global limit on the maximum number we will create. Don't allow
194 * a nonprivileged user to use the last process; don't let root
195 * exceed the limit. The variable nprocs is the current number of
196 * processes, maxproc is the limit.
197 */
198 uid = kauth_cred_get()->cr_ruid;
199 proc_list_lock();
200 if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
201 proc_list_unlock();
202 tablefull("proc");
203 retval[1] = 0;
204 return (EAGAIN);
205 }
206 proc_list_unlock();
207
208 /*
209 * Increment the count of procs running with this uid. Don't allow
210 * a nonprivileged user to exceed their current limit, which is
211 * always less than what an rlim_t can hold.
212 * (locking protection is provided by list lock held in chgproccnt)
213 */
214 count = chgproccnt(uid, 1);
215 if (uid != 0 &&
216 (rlim_t)count > parent->p_rlimit[RLIMIT_NPROC].rlim_cur) {
217 (void)chgproccnt(uid, -1);
218 return (EAGAIN);
219 }
220
221 ut = (uthread_t)get_bsdthread_info(cur_act);
222 if (ut->uu_flag & UT_VFORK) {
223 printf("vfork called recursively by %s\n", parent->p_comm);
224 (void)chgproccnt(uid, -1);
225 return (EINVAL);
226 }
227
228 #if CONFIG_MACF
229 /*
230 * Determine if MAC policies applied to the process will allow
231 * it to fork.
232 */
233 err = mac_proc_check_fork(parent);
234 if (err != 0) {
235 (void)chgproccnt(uid, -1);
236 return (err);
237 }
238 #endif
239
240 proc_lock(parent);
241 parent->p_lflag |= P_LVFORK;
242 parent->p_vforkcnt++;
243 proc_unlock(parent);
244
245 /* The newly created process comes with signal lock held */
246 if ((child = forkproc(parent,1)) == NULL) {
247 /* Failed to allocate new process */
248 (void)chgproccnt(uid, -1);
249 /*
250 * XXX kludgy, but necessary without a full flags audit...
251 * XXX these are inherited by the child, which depends on
252 * XXX P_VFORK being set.
253 */
254 proc_lock(parent);
255 parent->p_lflag &= ~P_LVFORK;
256 parent->p_vforkcnt--;
257 proc_unlock(parent);
258 return (ENOMEM);
259 }
260
261 #if CONFIG_MACF
262 /* allow policies to associate the credential/label */
263 /* that we referenced from the parent ... with the child */
264 /* JMM - this really isn't safe, as we can drop that */
265 /* association without informing the policy in other */
266 /* situations (keep long enough to get policies changed) */
267 mac_cred_label_associate_fork(child->p_ucred, child);
268 #endif
269
270 AUDIT_ARG(pid, child->p_pid);
271
272 child->task = parent->task;
273
274 /* make child visible */
275 pinsertchild(parent, child);
276
277 child->p_lflag |= P_LINVFORK;
278 child->p_vforkact = cur_act;
279 child->p_stat = SRUN;
280
281 ut->uu_flag |= UT_VFORK;
282 ut->uu_proc = child;
283 ut->uu_userstate = (void *)act_thread_csave();
284 ut->uu_vforkmask = ut->uu_sigmask;
285
286 /* temporarily drop thread-set-id state */
287 if (ut->uu_flag & UT_SETUID) {
288 ut->uu_flag |= UT_WASSETUID;
289 ut->uu_flag &= ~UT_SETUID;
290 }
291
292 thread_set_child(cur_act, child->p_pid);
293
294 microtime(&child->p_start);
295 microtime(&child->p_stats->p_start); /* for compat sake */
296 child->p_acflag = AFORK;
297
298 /*
299 * Preserve synchronization semantics of vfork. If waiting for
300 * child to exec or exit, set P_PPWAIT on child, and sleep on our
301 * proc (in case of exit).
302 */
303 child->p_lflag |= P_LPPWAIT;
304
305 /* drop the signal lock on the child */
306 proc_signalend(child, 0);
307 proc_transend(child, 0);
308
309 retval[0] = child->p_pid;
310 retval[1] = 1; /* flag child return for user space */
311
312 DTRACE_PROC1(create, proc_t, child);
313
314 return (0);
315 }
316
317 /*
318 * vfork_return
319 *
320 * Description: "Return" to parent vfork thread() following execve/_exit;
321 * this is done by reassociating the parent process structure
322 * with the task, thread, and uthread.
323 *
324 * Parameters: child Child process
325 * retval System call return value array
326 * rval Return value to present to parent
327 *
328 * Returns: void
329 *
330 * Note: The caller resumes or exits the parent, as appropriate, after
331 * callling this function.
332 */
333 void
334 vfork_return(proc_t child, register_t *retval, int rval)
335 {
336 proc_t parent = child->p_pptr;
337 thread_t cur_act = (thread_t)current_thread();
338 uthread_t ut;
339
340 ut = (uthread_t)get_bsdthread_info(cur_act);
341
342 act_thread_catt(ut->uu_userstate);
343
344 /* Make sure only one at this time */
345 proc_lock(parent);
346 parent->p_vforkcnt--;
347 if (parent->p_vforkcnt <0)
348 panic("vfork cnt is -ve");
349 if (parent->p_vforkcnt <=0)
350 parent->p_lflag &= ~P_LVFORK;
351 proc_unlock(parent);
352 ut->uu_userstate = 0;
353 ut->uu_flag &= ~UT_VFORK;
354 /* restore thread-set-id state */
355 if (ut->uu_flag & UT_WASSETUID) {
356 ut->uu_flag |= UT_SETUID;
357 ut->uu_flag &= UT_WASSETUID;
358 }
359 ut->uu_proc = 0;
360 ut->uu_sigmask = ut->uu_vforkmask;
361 child->p_lflag &= ~P_LINVFORK;
362 child->p_vforkact = (void *)0;
363
364 thread_set_parent(cur_act, rval);
365
366 if (retval) {
367 retval[0] = rval;
368 retval[1] = 0; /* mark parent */
369 }
370
371 return;
372 }
373
374
375 /*
376 * fork_create_child
377 *
378 * Description: Common operations associated with the creation of a child
379 * process
380 *
381 * Parameters: parent_task parent task
382 * child child process
383 * inherit_memory TRUE, if the parents address space is
384 * to be inherited by the child
385 * is64bit TRUE, if the child being created will
386 * be associated with a 64 bit process
387 * rather than a 32 bit process
388 *
389 * Note: This code is called in the fork() case, from the execve() call
390 * graph, if implementing an execve() following a vfork(), from
391 * the posix_spawn() call graph (which implicitly includes a
392 * vfork() equivalent call, and in the system bootstrap case.
393 *
394 * It creates a new task and thread (and as a side effect of the
395 * thread creation, a uthread), which is then associated with the
396 * process 'child'. If the parent process address space is to
397 * be inherited, then a flag indicates that the newly created
398 * task should inherit this from the child task.
399 *
400 * As a special concession to bootstrapping the initial process
401 * in the system, it's possible for 'parent_task' to be TASK_NULL;
402 * in this case, 'inherit_memory' MUST be FALSE.
403 */
404 thread_t
405 fork_create_child(task_t parent_task, proc_t child, int inherit_memory, int is64bit)
406 {
407 thread_t child_thread = NULL;
408 task_t child_task;
409 kern_return_t result;
410
411 /* Create a new task for the child process */
412 result = task_create_internal(parent_task,
413 inherit_memory,
414 is64bit,
415 &child_task);
416 if (result != KERN_SUCCESS) {
417 printf("execve: task_create_internal failed. Code: %d\n", result);
418 goto bad;
419 }
420
421 /* Set the child task to the new task */
422 child->task = child_task;
423
424 /* Set child task proc to child proc */
425 set_bsdtask_info(child_task, child);
426
427 /* Propagate CPU limit timer from parent */
428 if (timerisset(&child->p_rlim_cpu))
429 task_vtimer_set(child_task, TASK_VTIMER_RLIM);
430
431 /* Set/clear 64 bit vm_map flag */
432 if (is64bit)
433 vm_map_set_64bit(get_task_map(child_task));
434 else
435 vm_map_set_32bit(get_task_map(child_task));
436
437 #if CONFIG_MACF
438 /* Update task for MAC framework */
439 /* valid to use p_ucred as child is still not running ... */
440 mac_task_label_update_cred(child->p_ucred, child_task);
441 #endif
442
443 /* Set child scheduler priority if nice value inherited from parent */
444 if (child->p_nice != 0)
445 resetpriority(child);
446
447 /* Create a new thread for the child process */
448 result = thread_create(child_task, &child_thread);
449 if (result != KERN_SUCCESS) {
450 printf("execve: thread_create failed. Code: %d\n", result);
451 task_deallocate(child_task);
452 child_task = NULL;
453 }
454 bad:
455 thread_yield_internal(1);
456
457 return(child_thread);
458 }
459
460
461 /*
462 * procdup
463 *
464 * Description: Givben a parent process, provide a duplicate task and thread
465 * for a child process of that parent.
466 *
467 * Parameters: parent Parent process to use as the template
468 * child Child process to duplicate into
469 *
470 * Returns: !NULL Child process thread pointer
471 * NULL Failure (unspecified)
472 *
473 * Note: Most of the heavy lifting is done by fork_create_child(); this
474 * function exists more or less to deal with the 64 bit commpage,
475 * which requires explicit inheritance, the x86 commpage, which
476 * should not need explicit mapping any more, but apparently does,
477 * and to be variant for the bootstrap process.
478 *
479 * There is a special case where the system is being bootstraped,
480 * where this function will be called from cloneproc(), called in
481 * turn from bsd_utaskbootstrap(). In this case, we are acting
482 * to create a task and thread (and uthread) for the benefit of
483 * the kernel process - the first process in the system (PID 0).
484 *
485 * In that specific case, we will *not* pass a parent task, since
486 * there is *not* parent task present to pass.
487 *
488 * XXX: This function should go away; the variance can moved into
489 * XXX: cloneproc(), and the 64bit commpage code can be moved into
490 * XXX: fork_create_child(), after the x86 commpage inheritance is
491 * XXX: corrected.
492 */
493 thread_t
494 procdup(proc_t parent, proc_t child)
495 {
496 thread_t child_thread;
497 task_t child_task;
498
499 if (parent->task == kernel_task)
500 child_thread = fork_create_child(TASK_NULL, child, FALSE, FALSE);
501 else
502 child_thread = fork_create_child(parent->task, child, TRUE, (parent->p_flag & P_LP64));
503
504 if (child_thread != NULL) {
505 child_task = get_threadtask(child_thread);
506 if (parent->p_flag & P_LP64) {
507 task_set_64bit(child_task, TRUE);
508 OSBitOrAtomic(P_LP64, (UInt32 *)&child->p_flag);
509 #ifdef __ppc__
510 /* LP64todo - clean up hacked mapping of commpage */
511 /*
512 * PPC51: ppc64 is limited to 51-bit addresses.
513 * Memory above that limit is handled specially at
514 * the pmap level.
515 */
516 pmap_map_sharedpage(child_task, get_map_pmap(get_task_map(child_task)));
517 #endif /* __ppc__ */
518 } else {
519 task_set_64bit(child_task, FALSE);
520 OSBitAndAtomic(~((uint32_t)P_LP64), (UInt32 *)&child->p_flag);
521 }
522 }
523
524 return(child_thread);
525 }
526
527
528 /*
529 * fork
530 *
531 * Description: fork system call.
532 *
533 * Parameters: parent Parent process to fork
534 * uap (void) [unused]
535 * retval Return value
536 *
537 * Returns: 0 Success
538 * EAGAIN Resource unavailable, try again
539 */
540 int
541 fork(proc_t parent, __unused struct fork_args *uap, register_t *retval)
542 {
543 proc_t child;
544 uid_t uid;
545 thread_t newth;
546 int count;
547 task_t t;
548 #if CONFIG_MACF
549 int err;
550 #endif
551
552 /*
553 * Although process entries are dynamically created, we still keep
554 * a global limit on the maximum number we will create. Don't allow
555 * a nonprivileged user to use the last process; don't let root
556 * exceed the limit. The variable nprocs is the current number of
557 * processes, maxproc is the limit.
558 */
559 uid = kauth_cred_get()->cr_ruid;
560 proc_list_lock();
561 if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
562 proc_list_unlock();
563 tablefull("proc");
564 retval[1] = 0;
565 return (EAGAIN);
566 }
567 proc_list_unlock();
568
569 /*
570 * Increment the count of procs running with this uid. Don't allow
571 * a nonprivileged user to exceed their current limit, which is
572 * always less than what an rlim_t can hold.
573 * (locking protection is provided by list lock held in chgproccnt)
574 */
575 count = chgproccnt(uid, 1);
576 if (uid != 0 &&
577 (rlim_t)count > parent->p_rlimit[RLIMIT_NPROC].rlim_cur) {
578 (void)chgproccnt(uid, -1);
579 return (EAGAIN);
580 }
581
582 #if CONFIG_MACF
583 /*
584 * Determine if MAC policies applied to the process will allow
585 * it to fork.
586 */
587 err = mac_proc_check_fork(parent);
588 if (err != 0) {
589 (void)chgproccnt(uid, -1);
590 return (err);
591 }
592 #endif
593
594 /* The newly created process comes with signal lock held */
595 if ((newth = cloneproc(parent, 1)) == NULL) {
596 /* Failed to create thread */
597 (void)chgproccnt(uid, -1);
598 return (EAGAIN);
599 }
600
601 thread_dup(newth);
602 /* child = newth->task->proc; */
603 child = (proc_t)(get_bsdtask_info(get_threadtask(newth)));
604
605 #if CONFIG_MACF
606 /* inform policies of new process sharing this cred/label */
607 /* safe to use p_ucred here since child is not running */
608 /* JMM - unsafe to assume the association will stay - as */
609 /* there are other ways it can be dropped without */
610 /* informing the policies. */
611 mac_cred_label_associate_fork(child->p_ucred, child);
612 #endif
613
614 /* propogate change of PID - may get new cred if auditing */
615 set_security_token(child);
616
617 AUDIT_ARG(pid, child->p_pid);
618
619 thread_set_child(newth, child->p_pid);
620
621 microtime(&child->p_start);
622 microtime(&child->p_stats->p_start); /* for compat sake */
623 child->p_acflag = AFORK;
624
625 #if CONFIG_DTRACE
626 /*
627 * APPLE NOTE: Solaris does a sprlock() and drops the proc_lock
628 * here. We're cheating a bit and only taking the p_dtrace_sprlock
629 * lock. A full sprlock would task_suspend the parent.
630 */
631 lck_mtx_lock(&parent->p_dtrace_sprlock);
632
633 /*
634 * Remove all DTrace tracepoints from the child process. We
635 * need to do this _before_ duplicating USDT providers since
636 * any associated probes may be immediately enabled.
637 */
638 if (parent->p_dtrace_count > 0) {
639 dtrace_fasttrap_fork(parent, child);
640 }
641
642 lck_mtx_unlock(&parent->p_dtrace_sprlock);
643
644 /*
645 * Duplicate any lazy dof(s). This must be done while NOT
646 * holding the parent sprlock! Lock ordering is dtrace_dof_mode_lock,
647 * then sprlock. It is imperative we always call
648 * dtrace_lazy_dofs_duplicate, rather than null check and
649 * call if !NULL. If we NULL test, during lazy dof faulting
650 * we can race with the faulting code and proceed from here to
651 * beyond the helpers copy. The lazy dof faulting will then
652 * fail to copy the helpers to the child process.
653 */
654 dtrace_lazy_dofs_duplicate(parent, child);
655
656 /*
657 * Duplicate any helper actions and providers. The SFORKING
658 * we set above informs the code to enable USDT probes that
659 * sprlock() may fail because the child is being forked.
660 */
661 /*
662 * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
663 * never fails to find the child. We do not set SFORKING.
664 */
665 if (parent->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
666 (*dtrace_helpers_fork)(parent, child);
667 }
668
669 #endif
670
671 /* drop the signal lock on the child */
672 proc_signalend(child, 0);
673 proc_transend(child, 0);
674
675 /* "Return" to the child */
676 (void)thread_resume(newth);
677
678 /* drop the extra references we got during the creation */
679 if ((t = (task_t)get_threadtask(newth)) != NULL) {
680 task_deallocate(t);
681 }
682 thread_deallocate(newth);
683
684 proc_knote(parent, NOTE_FORK | child->p_pid);
685
686 retval[0] = child->p_pid;
687 retval[1] = 0; /* flag parent */
688
689 DTRACE_PROC1(create, proc_t, child);
690
691 return (0);
692 }
693
694 /*
695 * cloneproc
696 *
697 * Description: Create a new process from a specified process.
698 *
699 * Parameters: parent The parent process of the process to
700 * be cloned
701 * lock Whether or not the signal lock was held
702 * when calling cloneproc().
703 *
704 * Returns: !NULL pointer to new child thread
705 * NULL Failure (unspecified)
706 *
707 * Note: On return newly created child process has signal lock held
708 * to block delivery of signal to it if called with lock set.
709 * fork() code needs to explicity remove this lock before
710 * signals can be delivered
711 *
712 * In the case of bootstrap, this function can be called from
713 * bsd_utaskbootstrap() in order to bootstrap the first process;
714 * the net effect is to provide a uthread structure for the
715 * kernel process associated with the kernel task. This results
716 * in a side effect in procdup(), which is why the code is more
717 * complicated at the top of that function.
718 */
719 thread_t
720 cloneproc(proc_t parent, int lock)
721 {
722 proc_t child;
723 thread_t th = NULL;
724
725 if ((child = forkproc(parent,lock)) == NULL) {
726 /* Failed to allocate new process */
727 goto bad;
728 }
729
730 if ((th = procdup(parent, child)) == NULL) {
731 /*
732 * Failed to create thread; now we must deconstruct the new
733 * process previously obtained from forkproc().
734 */
735 forkproc_free(child, lock);
736 goto bad;
737 }
738
739 /* make child visible */
740 pinsertchild(parent, child);
741
742 /*
743 * Make child runnable, set start time.
744 */
745 child->p_stat = SRUN;
746
747 bad:
748 return(th);
749 }
750
751 /*
752 * Destroy a process structure that resulted from a call to forkproc(), but
753 * which must be returned to the system because of a subsequent failure
754 * preventing it from becoming active.
755 *
756 * Parameters: p The incomplete process from forkproc()
757 * lock Whether or not the signal lock was held
758 * when calling forkproc().
759 *
760 * Returns: (void)
761 *
762 * Note: This function should only be used in an error handler following
763 * a call to forkproc(). The 'lock' paramenter should be the same
764 * as the lock parameter passed to forkproc().
765 *
766 * Operations occur in reverse order of those in forkproc().
767 */
768 void
769 forkproc_free(proc_t p, int lock)
770 {
771
772 /* Drop the signal lock, if it was held */
773 if (lock) {
774 proc_signalend(p, 0);
775 proc_transend(p, 0);
776 }
777
778 /*
779 * If we have our own copy of the resource limits structure, we
780 * need to free it. If it's a shared copy, we need to drop our
781 * reference on it.
782 */
783 proc_limitdrop(p, 0);
784 p->p_limit = NULL;
785
786 #if SYSV_SHM
787 /* Need to drop references to the shared memory segment(s), if any */
788 if (p->vm_shm) {
789 /*
790 * Use shmexec(): we have no address space, so no mappings
791 *
792 * XXX Yes, the routine is badly named.
793 */
794 shmexec(p);
795 }
796 #endif
797
798 /* Need to undo the effects of the fdcopy(), if any */
799 fdfree(p);
800
801 /*
802 * Drop the reference on a text vnode pointer, if any
803 * XXX This code is broken in forkproc(); see <rdar://4256419>;
804 * XXX if anyone ever uses this field, we will be extremely unhappy.
805 */
806 if (p->p_textvp) {
807 vnode_rele(p->p_textvp);
808 p->p_textvp = NULL;
809 }
810
811 /* Stop the profiling clock */
812 stopprofclock(p);
813
814 /* Release the credential reference */
815 kauth_cred_unref(&p->p_ucred);
816
817 proc_list_lock();
818 /* Decrement the count of processes in the system */
819 nprocs--;
820 proc_list_unlock();
821
822 thread_call_free(p->p_rcall);
823
824 /* Free allocated memory */
825 FREE_ZONE(p->p_sigacts, sizeof *p->p_sigacts, M_SIGACTS);
826 FREE_ZONE(p->p_stats, sizeof *p->p_stats, M_PSTATS);
827 proc_checkdeadrefs(p);
828 FREE_ZONE(p, sizeof *p, M_PROC);
829 }
830
831
832 /*
833 * forkproc
834 *
835 * Description: Create a new process structure, given a parent process
836 * structure.
837 *
838 * Parameters: parent The parent process
839 * lock If the signal lock should be taken on
840 * the newly created process.
841 *
842 * Returns: !NULL The new process structure
843 * NULL Error (insufficient free memory)
844 *
845 * Note: When successful, the newly created process structure is
846 * partially initialized; if a caller needs to deconstruct the
847 * returned structure, they must call forkproc_free() to do so.
848 */
849 proc_t
850 forkproc(proc_t parent, int lock)
851 {
852 struct proc * child; /* Our new process */
853 static int nextpid = 0, pidwrap = 0, nextpidversion = 0;
854 int error = 0;
855 struct session *sessp;
856 uthread_t uth_parent = (uthread_t)get_bsdthread_info(current_thread());
857
858 MALLOC_ZONE(child, proc_t , sizeof *child, M_PROC, M_WAITOK);
859 if (child == NULL) {
860 printf("forkproc: M_PROC zone exhausted\n");
861 goto bad;
862 }
863 /* zero it out as we need to insert in hash */
864 bzero(child, sizeof *child);
865
866 MALLOC_ZONE(child->p_stats, struct pstats *,
867 sizeof *child->p_stats, M_PSTATS, M_WAITOK);
868 if (child->p_stats == NULL) {
869 printf("forkproc: M_SUBPROC zone exhausted (p_stats)\n");
870 FREE_ZONE(child, sizeof *child, M_PROC);
871 child = NULL;
872 goto bad;
873 }
874 MALLOC_ZONE(child->p_sigacts, struct sigacts *,
875 sizeof *child->p_sigacts, M_SIGACTS, M_WAITOK);
876 if (child->p_sigacts == NULL) {
877 printf("forkproc: M_SUBPROC zone exhausted (p_sigacts)\n");
878 FREE_ZONE(child->p_stats, sizeof *child->p_stats, M_PSTATS);
879 FREE_ZONE(child, sizeof *child, M_PROC);
880 child = NULL;
881 goto bad;
882 }
883 child->p_rcall = thread_call_allocate((thread_call_func_t)realitexpire, child);
884 if (child->p_rcall == NULL) {
885 FREE_ZONE(child->p_sigacts, sizeof *child->p_sigacts, M_SIGACTS);
886 FREE_ZONE(child->p_stats, sizeof *child->p_stats, M_PSTATS);
887 FREE_ZONE(child, sizeof *child, M_PROC);
888 child = NULL;
889 goto bad;
890 }
891
892
893 /*
894 * Find an unused PID.
895 */
896
897 proc_list_lock();
898
899 nextpid++;
900 retry:
901 /*
902 * If the process ID prototype has wrapped around,
903 * restart somewhat above 0, as the low-numbered procs
904 * tend to include daemons that don't exit.
905 */
906 if (nextpid >= PID_MAX) {
907 nextpid = 100;
908 pidwrap = 1;
909 }
910 if (pidwrap != 0) {
911
912 /* if the pid stays in hash both for zombie and runniing state */
913 if (pfind_locked(nextpid) != PROC_NULL) {
914 nextpid++;
915 goto retry;
916 }
917
918 if (pgfind_internal(nextpid) != PGRP_NULL) {
919 nextpid++;
920 goto retry;
921 }
922 if (session_find_internal(nextpid) != SESSION_NULL) {
923 nextpid++;
924 goto retry;
925 }
926 }
927 nprocs++;
928 child->p_pid = nextpid;
929 child->p_idversion = nextpidversion++;
930 #if 1
931 if (child->p_pid != 0) {
932 if (pfind_locked(child->p_pid) != PROC_NULL)
933 panic("proc in the list already\n");
934 }
935 #endif
936 /* Insert in the hash */
937 child->p_listflag |= (P_LIST_INHASH | P_LIST_INCREATE);
938 LIST_INSERT_HEAD(PIDHASH(child->p_pid), child, p_hash);
939 proc_list_unlock();
940
941
942 /*
943 * We've identified the PID we are going to use; initialize the new
944 * process structure.
945 */
946 child->p_stat = SIDL;
947 child->p_pgrpid = PGRPID_DEAD;
948
949 /*
950 * The zero'ing of the proc was at the allocation time due to need for insertion
951 * to hash. Copy the section that is to be copied directly from the parent.
952 */
953 bcopy(&parent->p_startcopy, &child->p_startcopy,
954 (unsigned) ((caddr_t)&child->p_endcopy - (caddr_t)&child->p_startcopy));
955
956 /*
957 * Some flags are inherited from the parent.
958 * Duplicate sub-structures as needed.
959 * Increase reference counts on shared objects.
960 * The p_stats and p_sigacts substructs are set in vm_fork.
961 */
962 child->p_flag = (parent->p_flag & (P_LP64 | P_TRANSLATED | P_AFFINITY));
963 if (parent->p_flag & P_PROFIL)
964 startprofclock(child);
965 /*
966 * Note that if the current thread has an assumed identity, this
967 * credential will be granted to the new process.
968 */
969 child->p_ucred = kauth_cred_get_with_ref();
970
971 lck_mtx_init(&child->p_mlock, proc_lck_grp, proc_lck_attr);
972 lck_mtx_init(&child->p_fdmlock, proc_lck_grp, proc_lck_attr);
973 #if CONFIG_DTRACE
974 lck_mtx_init(&child->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr);
975 #endif
976 lck_spin_init(&child->p_slock, proc_lck_grp, proc_lck_attr);
977 klist_init(&child->p_klist);
978
979 if (child->p_textvp != NULLVP) {
980 /* bump references to the text vnode */
981 /* Need to hold iocount across the ref call */
982 if (vnode_getwithref(child->p_textvp) == 0) {
983 error = vnode_ref(child->p_textvp);
984 vnode_put(child->p_textvp);
985 if (error != 0)
986 child->p_textvp = NULLVP;
987 }
988 }
989
990 /* XXX may fail to copy descriptors to child */
991 child->p_fd = fdcopy(parent, uth_parent->uu_cdir);
992
993 #if SYSV_SHM
994 if (parent->vm_shm) {
995 /* XXX may fail to attach shm to child */
996 (void)shmfork(parent,child);
997 }
998 #endif
999 /*
1000 * inherit the limit structure to child
1001 */
1002 proc_limitfork(parent, child);
1003
1004 if (child->p_limit->pl_rlimit[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
1005 uint64_t rlim_cur = child->p_limit->pl_rlimit[RLIMIT_CPU].rlim_cur;
1006 child->p_rlim_cpu.tv_sec = (rlim_cur > __INT_MAX__) ? __INT_MAX__ : rlim_cur;
1007 }
1008
1009 bzero(&child->p_stats->pstat_startzero,
1010 (unsigned) ((caddr_t)&child->p_stats->pstat_endzero -
1011 (caddr_t)&child->p_stats->pstat_startzero));
1012
1013 bzero(&child->p_stats->user_p_prof, sizeof(struct user_uprof));
1014
1015 if (parent->p_sigacts != NULL)
1016 (void)memcpy(child->p_sigacts,
1017 parent->p_sigacts, sizeof *child->p_sigacts);
1018 else
1019 (void)memset(child->p_sigacts, 0, sizeof *child->p_sigacts);
1020
1021 sessp = proc_session(parent);
1022 if (sessp->s_ttyvp != NULL && parent->p_flag & P_CONTROLT)
1023 OSBitOrAtomic(P_CONTROLT, (UInt32 *)&child->p_flag);
1024 session_rele(sessp);
1025
1026 /* block all signals to reach the process */
1027 if (lock) {
1028 proc_signalstart(child, 0);
1029 proc_transstart(child, 0);
1030 }
1031
1032 TAILQ_INIT(&child->p_uthlist);
1033 TAILQ_INIT(&child->aio_activeq);
1034 TAILQ_INIT(&child->aio_doneq);
1035 /* Inherit the parent flags for code sign */
1036 child->p_csflags = parent->p_csflags;
1037 child->p_wqthread = parent->p_wqthread;
1038 child->p_threadstart = parent->p_threadstart;
1039 child->p_pthsize = parent->p_pthsize;
1040 workqueue_init_lock(child);
1041
1042 #if CONFIG_LCTX
1043 child->p_lctx = NULL;
1044 /* Add new process to login context (if any). */
1045 if (parent->p_lctx != NULL) {
1046 LCTX_LOCK(parent->p_lctx);
1047 enterlctx(child, parent->p_lctx, 0);
1048 }
1049 #endif
1050
1051 bad:
1052 return(child);
1053 }
1054
1055 void
1056 proc_lock(proc_t p)
1057 {
1058 lck_mtx_lock(&p->p_mlock);
1059 }
1060
1061 void
1062 proc_unlock(proc_t p)
1063 {
1064 lck_mtx_unlock(&p->p_mlock);
1065 }
1066
1067 void
1068 proc_spinlock(proc_t p)
1069 {
1070 lck_spin_lock(&p->p_slock);
1071 }
1072
1073 void
1074 proc_spinunlock(proc_t p)
1075 {
1076 lck_spin_unlock(&p->p_slock);
1077 }
1078
1079 void
1080 proc_list_lock(void)
1081 {
1082 lck_mtx_lock(proc_list_mlock);
1083 }
1084
1085 void
1086 proc_list_unlock(void)
1087 {
1088 lck_mtx_unlock(proc_list_mlock);
1089 }
1090
1091 #include <kern/zalloc.h>
1092
1093 struct zone *uthread_zone;
1094 static int uthread_zone_inited = 0;
1095
1096 static void
1097 uthread_zone_init(void)
1098 {
1099 if (!uthread_zone_inited) {
1100 uthread_zone = zinit(sizeof(struct uthread),
1101 THREAD_MAX * sizeof(struct uthread),
1102 THREAD_CHUNK * sizeof(struct uthread),
1103 "uthreads");
1104 uthread_zone_inited = 1;
1105 }
1106 }
1107
1108 void *
1109 uthread_alloc(task_t task, thread_t thread)
1110 {
1111 proc_t p;
1112 uthread_t uth;
1113 uthread_t uth_parent;
1114 void *ut;
1115
1116 if (!uthread_zone_inited)
1117 uthread_zone_init();
1118
1119 ut = (void *)zalloc(uthread_zone);
1120 bzero(ut, sizeof(struct uthread));
1121
1122 p = (proc_t) get_bsdtask_info(task);
1123 uth = (uthread_t)ut;
1124
1125 /*
1126 * Thread inherits credential from the creating thread, if both
1127 * are in the same task.
1128 *
1129 * If the creating thread has no credential or is from another
1130 * task we can leave the new thread credential NULL. If it needs
1131 * one later, it will be lazily assigned from the task's process.
1132 */
1133 uth_parent = (uthread_t)get_bsdthread_info(current_thread());
1134 if (task == current_task() &&
1135 uth_parent != NULL &&
1136 IS_VALID_CRED(uth_parent->uu_ucred)) {
1137 /*
1138 * XXX The new thread is, in theory, being created in context
1139 * XXX of parent thread, so a direct reference to the parent
1140 * XXX is OK.
1141 */
1142 kauth_cred_ref(uth_parent->uu_ucred);
1143 uth->uu_ucred = uth_parent->uu_ucred;
1144 /* the credential we just inherited is an assumed credential */
1145 if (uth_parent->uu_flag & UT_SETUID)
1146 uth->uu_flag |= UT_SETUID;
1147 } else {
1148 uth->uu_ucred = NOCRED;
1149 }
1150
1151
1152 if ((task != kernel_task) && p) {
1153
1154 proc_lock(p);
1155 if (uth_parent) {
1156 if (uth_parent->uu_flag & UT_SAS_OLDMASK)
1157 uth->uu_sigmask = uth_parent->uu_oldmask;
1158 else
1159 uth->uu_sigmask = uth_parent->uu_sigmask;
1160 }
1161 uth->uu_context.vc_thread = thread;
1162 TAILQ_INSERT_TAIL(&p->p_uthlist, uth, uu_list);
1163 proc_unlock(p);
1164
1165 #if CONFIG_DTRACE
1166 if (p->p_dtrace_ptss_pages != NULL) {
1167 uth->t_dtrace_scratch = dtrace_ptss_claim_entry(p);
1168 }
1169 #endif
1170 }
1171
1172 return (ut);
1173 }
1174
1175
1176 /*
1177 * This routine frees all the BSD context in uthread except the credential.
1178 * It does not free the uthread structure as well
1179 */
1180 void
1181 uthread_cleanup(task_t task, void *uthread, void * bsd_info)
1182 {
1183 struct _select *sel;
1184 uthread_t uth = (uthread_t)uthread;
1185 proc_t p = (proc_t)bsd_info;
1186
1187
1188 if (uth->uu_lowpri_window) {
1189 /*
1190 * task is marked as a low priority I/O type
1191 * and we've somehow managed to not dismiss the throttle
1192 * through the normal exit paths back to user space...
1193 * no need to throttle this thread since its going away
1194 * but we do need to update our bookeeping w/r to throttled threads
1195 */
1196 throttle_lowpri_io(FALSE);
1197 }
1198 /*
1199 * Per-thread audit state should never last beyond system
1200 * call return. Since we don't audit the thread creation/
1201 * removal, the thread state pointer should never be
1202 * non-NULL when we get here.
1203 */
1204 assert(uth->uu_ar == NULL);
1205
1206 sel = &uth->uu_select;
1207 /* cleanup the select bit space */
1208 if (sel->nbytes) {
1209 FREE(sel->ibits, M_TEMP);
1210 FREE(sel->obits, M_TEMP);
1211 sel->nbytes = 0;
1212 }
1213
1214 if (uth->uu_cdir) {
1215 vnode_rele(uth->uu_cdir);
1216 uth->uu_cdir = NULLVP;
1217 }
1218
1219 if (uth->uu_allocsize && uth->uu_wqset){
1220 kfree(uth->uu_wqset, uth->uu_allocsize);
1221 sel->count = 0;
1222 uth->uu_allocsize = 0;
1223 uth->uu_wqset = 0;
1224 sel->wql = 0;
1225 }
1226
1227
1228 if ((task != kernel_task) && p) {
1229
1230 if (((uth->uu_flag & UT_VFORK) == UT_VFORK) && (uth->uu_proc != PROC_NULL)) {
1231 vfork_exit_internal(uth->uu_proc, 0, 1);
1232 }
1233 if (get_bsdtask_info(task) == p) {
1234 proc_lock(p);
1235 TAILQ_REMOVE(&p->p_uthlist, uth, uu_list);
1236 proc_unlock(p);
1237 }
1238 #if CONFIG_DTRACE
1239 if (uth->t_dtrace_scratch != NULL) {
1240 dtrace_ptss_release_entry(p, uth->t_dtrace_scratch);
1241 }
1242 #endif
1243 }
1244 }
1245
1246 /* This routine releases the credential stored in uthread */
1247 void
1248 uthread_cred_free(void *uthread)
1249 {
1250 uthread_t uth = (uthread_t)uthread;
1251
1252 /* and free the uthread itself */
1253 if (IS_VALID_CRED(uth->uu_ucred)) {
1254 kauth_cred_t oldcred = uth->uu_ucred;
1255 uth->uu_ucred = NOCRED;
1256 kauth_cred_unref(&oldcred);
1257 }
1258 }
1259
1260 /* This routine frees the uthread structure held in thread structure */
1261 void
1262 uthread_zone_free(void *uthread)
1263 {
1264 /* and free the uthread itself */
1265 zfree(uthread_zone, uthread);
1266 }