bsd/kern/kern_exec.c

   1 /*
   2  * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Mach Operating System
  31  * Copyright (c) 1987 Carnegie-Mellon University
  32  * All rights reserved.  The CMU software License Agreement specifies
  33  * the terms and conditions for use and redistribution.
  34  */
  35
  36 /*-
  37  * Copyright (c) 1982, 1986, 1991, 1993
  38  *      The Regents of the University of California.  All rights reserved.
  39  * (c) UNIX System Laboratories, Inc.
  40  * All or some portions of this file are derived from material licensed
  41  * to the University of California by American Telephone and Telegraph
  42  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  43  * the permission of UNIX System Laboratories, Inc.
  44  *
  45  * Redistribution and use in source and binary forms, with or without
  46  * modification, are permitted provided that the following conditions
  47  * are met:
  48  * 1. Redistributions of source code must retain the above copyright
  49  *    notice, this list of conditions and the following disclaimer.
  50  * 2. Redistributions in binary form must reproduce the above copyright
  51  *    notice, this list of conditions and the following disclaimer in the
  52  *    documentation and/or other materials provided with the distribution.
  53  * 3. All advertising materials mentioning features or use of this software
  54  *    must display the following acknowledgement:
  55  *      This product includes software developed by the University of
  56  *      California, Berkeley and its contributors.
  57  * 4. Neither the name of the University nor the names of its contributors
  58  *    may be used to endorse or promote products derived from this software
  59  *    without specific prior written permission.
  60  *
  61  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  62  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  64  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  65  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  66  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  67  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  68  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  69  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  70  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  71  * SUCH DAMAGE.
  72  *
  73  *      from: @(#)kern_exec.c   8.1 (Berkeley) 6/10/93
  74  */
  75 /*
  76  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  77  * support for mandatory and extensible security protections.  This notice
  78  * is included in support of clause 2.2 (b) of the Apple Public License,
  79  * Version 2.0.
  80  */
  81 #include <machine/reg.h>
  82 #include <machine/cpu_capabilities.h>
  83
  84 #include <sys/param.h>
  85 #include <sys/systm.h>
  86 #include <sys/filedesc.h>
  87 #include <sys/kernel.h>
  88 #include <sys/proc_internal.h>
  89 #include <sys/kauth.h>
  90 #include <sys/user.h>
  91 #include <sys/socketvar.h>
  92 #include <sys/malloc.h>
  93 #include <sys/namei.h>
  94 #include <sys/mount_internal.h>
  95 #include <sys/vnode_internal.h>
  96 #include <sys/file_internal.h>
  97 #include <sys/stat.h>
  98 #include <sys/uio_internal.h>
  99 #include <sys/acct.h>
 100 #include <sys/exec.h>
 101 #include <sys/kdebug.h>
 102 #include <sys/signal.h>
 103 #include <sys/aio_kern.h>
 104 #include <sys/sysproto.h>
 105 #if SYSV_SHM
 106 #include <sys/shm_internal.h>           /* shmexec() */
 107 #endif
 108 #include <sys/ubc_internal.h>           /* ubc_map() */
 109 #include <sys/spawn.h>
 110 #include <sys/spawn_internal.h>
 111 #include <sys/process_policy.h>
 112 #include <sys/codesign.h>
 113 #include <crypto/sha1.h>
 114
 115 #include <libkern/libkern.h>
 116
 117 #include <security/audit/audit.h>
 118
 119 #include <ipc/ipc_types.h>
 120
 121 #include <mach/mach_types.h>
 122 #include <mach/port.h>
 123 #include <mach/task.h>
 124 #include <mach/task_access.h>
 125 #include <mach/thread_act.h>
 126 #include <mach/vm_map.h>
 127 #include <mach/mach_vm.h>
 128 #include <mach/vm_param.h>
 129
 130 #include <kern/sched_prim.h> /* thread_wakeup() */
 131 #include <kern/affinity.h>
 132 #include <kern/assert.h>
 133 #include <kern/task.h>
 134 #include <kern/coalition.h>
 135
 136 #if CONFIG_MACF
 137 #include <security/mac.h>
 138 #include <security/mac_mach_internal.h>
 139 #endif
 140
 141 #include <vm/vm_map.h>
 142 #include <vm/vm_kern.h>
 143 #include <vm/vm_protos.h>
 144 #include <vm/vm_kern.h>
 145 #include <vm/vm_fault.h>
 146 #include <vm/vm_pageout.h>
 147
 148 #include <kdp/kdp_dyld.h>
 149
 150 #include <machine/pal_routines.h>
 151
 152 #include <pexpert/pexpert.h>
 153
 154 #if CONFIG_MEMORYSTATUS
 155 #include <sys/kern_memorystatus.h>
 156 #endif
 157
 158 #if CONFIG_DTRACE
 159 /* Do not include dtrace.h, it redefines kmem_[alloc/free] */
 160 extern void (*dtrace_fasttrap_exec_ptr)(proc_t);
 161 extern void (*dtrace_proc_waitfor_exec_ptr)(proc_t);
 162 extern void (*dtrace_helpers_cleanup)(proc_t);
 163 extern void dtrace_lazy_dofs_destroy(proc_t);
 164
 165 /*
 166  * Since dtrace_proc_waitfor_exec_ptr can be added/removed in dtrace_subr.c,
 167  * we will store its value before actually calling it.
 168  */
 169 static void (*dtrace_proc_waitfor_hook)(proc_t) = NULL;
 170
 171 #include <sys/dtrace_ptss.h>
 172 #endif
 173
 174 /* support for child creation in exec after vfork */
 175 thread_t fork_create_child(task_t parent_task, coalition_t *parent_coalition, proc_t child_proc, int inherit_memory, int is64bit);
 176 void vfork_exit(proc_t p, int rv);
 177 extern void proc_apply_task_networkbg_internal(proc_t, thread_t);
 178
 179 /*
 180  * Mach things for which prototypes are unavailable from Mach headers
 181  */
 182 void            ipc_task_reset(
 183                         task_t          task);
 184 void            ipc_thread_reset(
 185                         thread_t        thread);
 186 kern_return_t ipc_object_copyin(
 187         ipc_space_t             space,
 188         mach_port_name_t        name,
 189         mach_msg_type_name_t    msgt_name,
 190         ipc_object_t            *objectp);
 191 void ipc_port_release_send(ipc_port_t);
 192
 193 #if DEVELOPMENT || DEBUG
 194 void task_importance_update_owner_info(task_t);
 195 #endif
 196
 197 extern struct savearea *get_user_regs(thread_t);
 198
 199 __attribute__((noinline)) int __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__(mach_port_t task_access_port, int32_t new_pid);
 200
 201 #include <kern/thread.h>
 202 #include <kern/task.h>
 203 #include <kern/ast.h>
 204 #include <kern/mach_loader.h>
 205 #include <kern/mach_fat.h>
 206 #include <mach-o/fat.h>
 207 #include <mach-o/loader.h>
 208 #include <machine/vmparam.h>
 209 #include <sys/imgact.h>
 210
 211 #include <sys/sdt.h>
 212
 213
 214 /*
 215  * EAI_ITERLIMIT        The maximum number of times to iterate an image
 216  *                      activator in exec_activate_image() before treating
 217  *                      it as malformed/corrupt.
 218  */
 219 #define EAI_ITERLIMIT           3
 220
 221 /*
 222  * For #! interpreter parsing
 223  */
 224 #define IS_WHITESPACE(ch) ((ch == ' ') || (ch == '\t'))
 225 #define IS_EOL(ch) ((ch == '#') || (ch == '\n'))
 226
 227 extern vm_map_t bsd_pageable_map;
 228 extern const struct fileops vnops;
 229
 230 #define USER_ADDR_ALIGN(addr, val) \
 231         ( ( (user_addr_t)(addr) + (val) - 1) \
 232                 & ~((val) - 1) )
 233
 234 struct image_params;    /* Forward */
 235 static int exec_activate_image(struct image_params *imgp);
 236 static int exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp);
 237 static int load_return_to_errno(load_return_t lrtn);
 238 static int execargs_alloc(struct image_params *imgp);
 239 static int execargs_free(struct image_params *imgp);
 240 static int exec_check_permissions(struct image_params *imgp);
 241 static int exec_extract_strings(struct image_params *imgp);
 242 static int exec_add_apple_strings(struct image_params *imgp);
 243 static int exec_handle_sugid(struct image_params *imgp);
 244 static int sugid_scripts = 0;
 245 SYSCTL_INT (_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW | CTLFLAG_LOCKED, &sugid_scripts, 0, "");
 246 static kern_return_t create_unix_stack(vm_map_t map, load_result_t* load_result, proc_t p);
 247 static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size);
 248 static void exec_resettextvp(proc_t, struct image_params *);
 249 static int check_for_signature(proc_t, struct image_params *);
 250 static void exec_prefault_data(proc_t, struct image_params *, load_result_t *);
 251 static errno_t exec_handle_port_actions(struct image_params *imgp, short psa_flags, boolean_t * portwatch_present, ipc_port_t * portwatch_ports);
 252 static errno_t exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, uint64_t psa_darwin_role,
 253                              ipc_port_t * portwatch_ports, int portwatch_count);
 254
 255 /*
 256  * exec_add_user_string
 257  *
 258  * Add the requested string to the string space area.
 259  *
 260  * Parameters;  struct image_params *           image parameter block
 261  *              user_addr_t                     string to add to strings area
 262  *              int                             segment from which string comes
 263  *              boolean_t                       TRUE if string contributes to NCARGS
 264  *
 265  * Returns:     0                       Success
 266  *              !0                      Failure errno from copyinstr()
 267  *
 268  * Implicit returns:
 269  *              (imgp->ip_strendp)      updated location of next add, if any
 270  *              (imgp->ip_strspace)     updated byte count of space remaining
 271  *              (imgp->ip_argspace) updated byte count of space in NCARGS
 272  */
 273 static int
 274 exec_add_user_string(struct image_params *imgp, user_addr_t str, int seg, boolean_t is_ncargs)
 275 {
 276         int error = 0;
 277
 278         do {
 279                 size_t len = 0;
 280                 int space;
 281
 282                 if (is_ncargs)
 283                         space = imgp->ip_argspace; /* by definition smaller than ip_strspace */
 284                 else
 285                         space = imgp->ip_strspace;
 286
 287                 if (space <= 0) {
 288                         error = E2BIG;
 289                         break;
 290                 }
 291
 292                 if (!UIO_SEG_IS_USER_SPACE(seg)) {
 293                         char *kstr = CAST_DOWN(char *,str);     /* SAFE */
 294                         error = copystr(kstr, imgp->ip_strendp, space, &len);
 295                 } else  {
 296                         error = copyinstr(str, imgp->ip_strendp, space, &len);
 297                 }
 298
 299                 imgp->ip_strendp += len;
 300                 imgp->ip_strspace -= len;
 301                 if (is_ncargs)
 302                         imgp->ip_argspace -= len;
 303
 304         } while (error == ENAMETOOLONG);
 305
 306         return error;
 307 }
 308
 309 /*
 310  * dyld is now passed the executable path as a getenv-like variable
 311  * in the same fashion as the stack_guard and malloc_entropy keys.
 312  */
 313 #define EXECUTABLE_KEY "executable_path="
 314
 315 /*
 316  * exec_save_path
 317  *
 318  * To support new app package launching for Mac OS X, the dyld needs the
 319  * first argument to execve() stored on the user stack.
 320  *
 321  * Save the executable path name at the bottom of the strings area and set
 322  * the argument vector pointer to the location following that to indicate
 323  * the start of the argument and environment tuples, setting the remaining
 324  * string space count to the size of the string area minus the path length.
 325  *
 326  * Parameters;  struct image_params *           image parameter block
 327  *              char *                          path used to invoke program
 328  *              int                             segment from which path comes
 329  *
 330  * Returns:     int                     0       Success
 331  *              EFAULT                          Bad address
 332  *      copy[in]str:EFAULT                      Bad address
 333  *      copy[in]str:ENAMETOOLONG                Filename too long
 334  *
 335  * Implicit returns:
 336  *              (imgp->ip_strings)              saved path
 337  *              (imgp->ip_strspace)             space remaining in ip_strings
 338  *              (imgp->ip_strendp)              start of remaining copy area
 339  *              (imgp->ip_argspace)             space remaining of NCARGS
 340  *              (imgp->ip_applec)               Initial applev[0]
 341  *
 342  * Note:        We have to do this before the initial namei() since in the
 343  *              path contains symbolic links, namei() will overwrite the
 344  *              original path buffer contents.  If the last symbolic link
 345  *              resolved was a relative pathname, we would lose the original
 346  *              "path", which could be an absolute pathname. This might be
 347  *              unacceptable for dyld.
 348  */
 349 static int
 350 exec_save_path(struct image_params *imgp, user_addr_t path, int seg, const char **excpath)
 351 {
 352         int error;
 353         size_t len;
 354         char *kpath;
 355
 356         // imgp->ip_strings can come out of a cache, so we need to obliterate the
 357         // old path.
 358         memset(imgp->ip_strings, '\0', strlen(EXECUTABLE_KEY) + MAXPATHLEN);
 359
 360         len = MIN(MAXPATHLEN, imgp->ip_strspace);
 361
 362         switch(seg) {
 363         case UIO_USERSPACE32:
 364         case UIO_USERSPACE64:   /* Same for copyin()... */
 365                 error = copyinstr(path, imgp->ip_strings + strlen(EXECUTABLE_KEY), len, &len);
 366                 break;
 367         case UIO_SYSSPACE:
 368                 kpath = CAST_DOWN(char *,path); /* SAFE */
 369                 error = copystr(kpath, imgp->ip_strings + strlen(EXECUTABLE_KEY), len, &len);
 370                 break;
 371         default:
 372                 error = EFAULT;
 373                 break;
 374         }
 375
 376         if (!error) {
 377                 bcopy(EXECUTABLE_KEY, imgp->ip_strings, strlen(EXECUTABLE_KEY));
 378                 len += strlen(EXECUTABLE_KEY);
 379
 380                 imgp->ip_strendp += len;
 381                 imgp->ip_strspace -= len;
 382
 383                 if (excpath) {
 384                         *excpath = imgp->ip_strings + strlen(EXECUTABLE_KEY);
 385                 }
 386         }
 387
 388         return(error);
 389 }
 390
 391 /*
 392  * exec_reset_save_path
 393  *
 394  * If we detect a shell script, we need to reset the string area
 395  * state so that the interpreter can be saved onto the stack.
 396
 397  * Parameters;  struct image_params *           image parameter block
 398  *
 399  * Returns:     int                     0       Success
 400  *
 401  * Implicit returns:
 402  *              (imgp->ip_strings)              saved path
 403  *              (imgp->ip_strspace)             space remaining in ip_strings
 404  *              (imgp->ip_strendp)              start of remaining copy area
 405  *              (imgp->ip_argspace)             space remaining of NCARGS
 406  *
 407  */
 408 static int
 409 exec_reset_save_path(struct image_params *imgp)
 410 {
 411         imgp->ip_strendp = imgp->ip_strings;
 412         imgp->ip_argspace = NCARGS;
 413         imgp->ip_strspace = ( NCARGS + PAGE_SIZE );
 414
 415         return (0);
 416 }
 417
 418 /*
 419  * exec_shell_imgact
 420  *
 421  * Image activator for interpreter scripts.  If the image begins with
 422  * the characters "#!", then it is an interpreter script.  Verify the
 423  * length of the script line indicating the interpreter is not in
 424  * excess of the maximum allowed size.  If this is the case, then
 425  * break out the arguments, if any, which are separated by white
 426  * space, and copy them into the argument save area as if they were
 427  * provided on the command line before all other arguments.  The line
 428  * ends when we encounter a comment character ('#') or newline.
 429  *
 430  * Parameters;  struct image_params *   image parameter block
 431  *
 432  * Returns:     -1                      not an interpreter (keep looking)
 433  *              -3                      Success: interpreter: relookup
 434  *              >0                      Failure: interpreter: error number
 435  *
 436  * A return value other than -1 indicates subsequent image activators should
 437  * not be given the opportunity to attempt to activate the image.
 438  */
 439 static int
 440 exec_shell_imgact(struct image_params *imgp)
 441 {
 442         char *vdata = imgp->ip_vdata;
 443         char *ihp;
 444         char *line_startp, *line_endp;
 445         char *interp;
 446         proc_t p;
 447         struct fileproc *fp;
 448         int fd;
 449         int error;
 450
 451         /*
 452          * Make sure it's a shell script.  If we've already redirected
 453          * from an interpreted file once, don't do it again.
 454          */
 455         if (vdata[0] != '#' ||
 456             vdata[1] != '!' ||
 457             (imgp->ip_flags & IMGPF_INTERPRET) != 0) {
 458                 return (-1);
 459         }
 460
 461         if (imgp->ip_origcputype != 0) {
 462                 /* Fat header previously matched, don't allow shell script inside */
 463                 return (-1);
 464         }
 465
 466         imgp->ip_flags |= IMGPF_INTERPRET;
 467         imgp->ip_interp_sugid_fd = -1;
 468         imgp->ip_interp_buffer[0] = '\0';
 469
 470         /* Check to see if SUGID scripts are permitted.  If they aren't then
 471          * clear the SUGID bits.
 472          * imgp->ip_vattr is known to be valid.
 473          */
 474         if (sugid_scripts == 0) {
 475                 imgp->ip_origvattr->va_mode &= ~(VSUID | VSGID);
 476         }
 477
 478         /* Try to find the first non-whitespace character */
 479         for( ihp = &vdata[2]; ihp < &vdata[IMG_SHSIZE]; ihp++ ) {
 480                 if (IS_EOL(*ihp)) {
 481                         /* Did not find interpreter, "#!\n" */
 482                         return (ENOEXEC);
 483                 } else if (IS_WHITESPACE(*ihp)) {
 484                         /* Whitespace, like "#!    /bin/sh\n", keep going. */
 485                 } else {
 486                         /* Found start of interpreter */
 487                         break;
 488                 }
 489         }
 490
 491         if (ihp == &vdata[IMG_SHSIZE]) {
 492                 /* All whitespace, like "#!           " */
 493                 return (ENOEXEC);
 494         }
 495
 496         line_startp = ihp;
 497
 498         /* Try to find the end of the interpreter+args string */
 499         for ( ; ihp < &vdata[IMG_SHSIZE]; ihp++ ) {
 500                 if (IS_EOL(*ihp)) {
 501                         /* Got it */
 502                         break;
 503                 } else {
 504                         /* Still part of interpreter or args */
 505                 }
 506         }
 507
 508         if (ihp == &vdata[IMG_SHSIZE]) {
 509                 /* A long line, like "#! blah blah blah" without end */
 510                 return (ENOEXEC);
 511         }
 512
 513         /* Backtrack until we find the last non-whitespace */
 514         while (IS_EOL(*ihp) || IS_WHITESPACE(*ihp)) {
 515                 ihp--;
 516         }
 517
 518         /* The character after the last non-whitespace is our logical end of line */
 519         line_endp = ihp + 1;
 520
 521         /*
 522          * Now we have pointers to the usable part of:
 523          *
 524          * "#!  /usr/bin/int first    second   third    \n"
 525          *      ^ line_startp                       ^ line_endp
 526          */
 527
 528         /* copy the interpreter name */
 529         interp = imgp->ip_interp_buffer;
 530         for ( ihp = line_startp; (ihp < line_endp) && !IS_WHITESPACE(*ihp); ihp++)
 531                 *interp++ = *ihp;
 532         *interp = '\0';
 533
 534         exec_reset_save_path(imgp);
 535         exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_buffer),
 536                                                         UIO_SYSSPACE, NULL);
 537
 538         /* Copy the entire interpreter + args for later processing into argv[] */
 539         interp = imgp->ip_interp_buffer;
 540         for ( ihp = line_startp; (ihp < line_endp); ihp++)
 541                 *interp++ = *ihp;
 542         *interp = '\0';
 543
 544         /*
 545          * If we have a SUID oder SGID script, create a file descriptor
 546          * from the vnode and pass /dev/fd/%d instead of the actual
 547          * path name so that the script does not get opened twice
 548          */
 549         if (imgp->ip_origvattr->va_mode & (VSUID | VSGID)) {
 550                 p = vfs_context_proc(imgp->ip_vfs_context);
 551                 error = falloc(p, &fp, &fd, imgp->ip_vfs_context);
 552                 if (error)
 553                         return(error);
 554
 555                 fp->f_fglob->fg_flag = FREAD;
 556                 fp->f_fglob->fg_ops = &vnops;
 557                 fp->f_fglob->fg_data = (caddr_t)imgp->ip_vp;
 558
 559                 proc_fdlock(p);
 560                 procfdtbl_releasefd(p, fd, NULL);
 561                 fp_drop(p, fd, fp, 1);
 562                 proc_fdunlock(p);
 563                 vnode_ref(imgp->ip_vp);
 564
 565                 imgp->ip_interp_sugid_fd = fd;
 566         }
 567
 568         return (-3);
 569 }
 570
 571
 572
 573 /*
 574  * exec_fat_imgact
 575  *
 576  * Image activator for fat 1.0 binaries.  If the binary is fat, then we
 577  * need to select an image from it internally, and make that the image
 578  * we are going to attempt to execute.  At present, this consists of
 579  * reloading the first page for the image with a first page from the
 580  * offset location indicated by the fat header.
 581  *
 582  * Parameters;  struct image_params *   image parameter block
 583  *
 584  * Returns:     -1                      not a fat binary (keep looking)
 585  *              -2                      Success: encapsulated binary: reread
 586  *              >0                      Failure: error number
 587  *
 588  * Important:   This image activator is byte order neutral.
 589  *
 590  * Note:        A return value other than -1 indicates subsequent image
 591  *              activators should not be given the opportunity to attempt
 592  *              to activate the image.
 593  *
 594  *              If we find an encapsulated binary, we make no assertions
 595  *              about its  validity; instead, we leave that up to a rescan
 596  *              for an activator to claim it, and, if it is claimed by one,
 597  *              that activator is responsible for determining validity.
 598  */
 599 static int
 600 exec_fat_imgact(struct image_params *imgp)
 601 {
 602         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
 603         kauth_cred_t cred = kauth_cred_proc_ref(p);
 604         struct fat_header *fat_header = (struct fat_header *)imgp->ip_vdata;
 605         struct _posix_spawnattr *psa = NULL;
 606         struct fat_arch fat_arch;
 607         int resid, error;
 608         load_return_t lret;
 609
 610         if (imgp->ip_origcputype != 0) {
 611                 /* Fat header previously matched, don't allow another fat file inside */
 612                 return (-1);
 613         }
 614
 615         /* Make sure it's a fat binary */
 616         if (OSSwapBigToHostInt32(fat_header->magic) != FAT_MAGIC) {
 617                 error = -1; /* not claimed */
 618                 goto bad;
 619         }
 620
 621         /* imgp->ip_vdata has PAGE_SIZE, zerofilled if the file is smaller */
 622         lret = fatfile_validate_fatarches((vm_offset_t)fat_header, PAGE_SIZE);
 623         if (lret != LOAD_SUCCESS) {
 624                 error = load_return_to_errno(lret);
 625                 goto bad;
 626         }
 627
 628         /* If posix_spawn binprefs exist, respect those prefs. */
 629         psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
 630         if (psa != NULL && psa->psa_binprefs[0] != 0) {
 631                 uint32_t pr = 0;
 632
 633                 /* Check each preference listed against all arches in header */
 634                 for (pr = 0; pr < NBINPREFS; pr++) {
 635                         cpu_type_t pref = psa->psa_binprefs[pr];
 636                         if (pref == 0) {
 637                                 /* No suitable arch in the pref list */
 638                                 error = EBADARCH;
 639                                 goto bad;
 640                         }
 641
 642                         if (pref == CPU_TYPE_ANY) {
 643                                 /* Fall through to regular grading */
 644                                 goto regular_grading;
 645                         }
 646
 647                         lret = fatfile_getbestarch_for_cputype(pref,
 648                                                         (vm_offset_t)fat_header,
 649                                                         PAGE_SIZE,
 650                                                         &fat_arch);
 651                         if (lret == LOAD_SUCCESS) {
 652                                 goto use_arch;
 653                         }
 654                 }
 655
 656                 /* Requested binary preference was not honored */
 657                 error = EBADEXEC;
 658                 goto bad;
 659         }
 660
 661 regular_grading:
 662         /* Look up our preferred architecture in the fat file. */
 663         lret = fatfile_getbestarch((vm_offset_t)fat_header,
 664                                 PAGE_SIZE,
 665                                 &fat_arch);
 666         if (lret != LOAD_SUCCESS) {
 667                 error = load_return_to_errno(lret);
 668                 goto bad;
 669         }
 670
 671 use_arch:
 672         /* Read the Mach-O header out of fat_arch */
 673         error = vn_rdwr(UIO_READ, imgp->ip_vp, imgp->ip_vdata,
 674                         PAGE_SIZE, fat_arch.offset,
 675                         UIO_SYSSPACE, (IO_UNIT|IO_NODELOCKED),
 676                         cred, &resid, p);
 677         if (error) {
 678                 goto bad;
 679         }
 680
 681         if (resid) {
 682                 memset(imgp->ip_vdata + (PAGE_SIZE - resid), 0x0, resid);
 683         }
 684
 685         /* Success.  Indicate we have identified an encapsulated binary */
 686         error = -2;
 687         imgp->ip_arch_offset = (user_size_t)fat_arch.offset;
 688         imgp->ip_arch_size = (user_size_t)fat_arch.size;
 689         imgp->ip_origcputype = fat_arch.cputype;
 690         imgp->ip_origcpusubtype = fat_arch.cpusubtype;
 691
 692 bad:
 693         kauth_cred_unref(&cred);
 694         return (error);
 695 }
 696
 697 /*
 698  * exec_mach_imgact
 699  *
 700  * Image activator for mach-o 1.0 binaries.
 701  *
 702  * Parameters;  struct image_params *   image parameter block
 703  *
 704  * Returns:     -1                      not a fat binary (keep looking)
 705  *              -2                      Success: encapsulated binary: reread
 706  *              >0                      Failure: error number
 707  *              EBADARCH                Mach-o binary, but with an unrecognized
 708  *                                      architecture
 709  *              ENOMEM                  No memory for child process after -
 710  *                                      can only happen after vfork()
 711  *
 712  * Important:   This image activator is NOT byte order neutral.
 713  *
 714  * Note:        A return value other than -1 indicates subsequent image
 715  *              activators should not be given the opportunity to attempt
 716  *              to activate the image.
 717  *
 718  * TODO:        More gracefully handle failures after vfork
 719  */
 720 static int
 721 exec_mach_imgact(struct image_params *imgp)
 722 {
 723         struct mach_header *mach_header = (struct mach_header *)imgp->ip_vdata;
 724         proc_t                  p = vfs_context_proc(imgp->ip_vfs_context);
 725         int                     error = 0;
 726         task_t                  task;
 727         task_t                  new_task = NULL; /* protected by vfexec */
 728         thread_t                thread;
 729         struct uthread          *uthread;
 730         vm_map_t old_map = VM_MAP_NULL;
 731         vm_map_t map;
 732         load_return_t           lret;
 733         load_result_t           load_result;
 734         struct _posix_spawnattr *psa = NULL;
 735         int                     spawn = (imgp->ip_flags & IMGPF_SPAWN);
 736         int                     vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC);
 737         int                     p_name_len;
 738
 739         /*
 740          * make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference
 741          * is a reserved field on the end, so for the most part, we can
 742          * treat them as if they were identical. Reverse-endian Mach-O
 743          * binaries are recognized but not compatible.
 744          */
 745         if ((mach_header->magic == MH_CIGAM) ||
 746             (mach_header->magic == MH_CIGAM_64)) {
 747                 error = EBADARCH;
 748                 goto bad;
 749         }
 750
 751         if ((mach_header->magic != MH_MAGIC) &&
 752             (mach_header->magic != MH_MAGIC_64)) {
 753                 error = -1;
 754                 goto bad;
 755         }
 756
 757         if (mach_header->filetype != MH_EXECUTE) {
 758                 error = -1;
 759                 goto bad;
 760         }
 761
 762         if (imgp->ip_origcputype != 0) {
 763                 /* Fat header previously had an idea about this thin file */
 764                 if (imgp->ip_origcputype != mach_header->cputype ||
 765                         imgp->ip_origcpusubtype != mach_header->cpusubtype) {
 766                         error = EBADARCH;
 767                         goto bad;
 768                 }
 769         } else {
 770                 imgp->ip_origcputype = mach_header->cputype;
 771                 imgp->ip_origcpusubtype = mach_header->cpusubtype;
 772         }
 773
 774         task = current_task();
 775         thread = current_thread();
 776         uthread = get_bsdthread_info(thread);
 777
 778         if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64)
 779                 imgp->ip_flags |= IMGPF_IS_64BIT;
 780
 781         /* If posix_spawn binprefs exist, respect those prefs. */
 782         psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
 783         if (psa != NULL && psa->psa_binprefs[0] != 0) {
 784                 int pr = 0;
 785                 for (pr = 0; pr < NBINPREFS; pr++) {
 786                         cpu_type_t pref = psa->psa_binprefs[pr];
 787                         if (pref == 0) {
 788                                 /* No suitable arch in the pref list */
 789                                 error = EBADARCH;
 790                                 goto bad;
 791                         }
 792
 793                         if (pref == CPU_TYPE_ANY) {
 794                                 /* Jump to regular grading */
 795                                 goto grade;
 796                         }
 797
 798                         if (pref == imgp->ip_origcputype) {
 799                                 /* We have a match! */
 800                                 goto grade;
 801                         }
 802                 }
 803                 error = EBADARCH;
 804                 goto bad;
 805         }
 806 grade:
 807         if (!grade_binary(imgp->ip_origcputype, imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK)) {
 808                 error = EBADARCH;
 809                 goto bad;
 810         }
 811
 812         /* Copy in arguments/environment from the old process */
 813         error = exec_extract_strings(imgp);
 814         if (error)
 815                 goto bad;
 816
 817         error = exec_add_apple_strings(imgp);
 818         if (error)
 819                 goto bad;
 820
 821         AUDIT_ARG(argv, imgp->ip_startargv, imgp->ip_argc,
 822             imgp->ip_endargv - imgp->ip_startargv);
 823         AUDIT_ARG(envv, imgp->ip_endargv, imgp->ip_envc,
 824             imgp->ip_endenvv - imgp->ip_endargv);
 825
 826         /*
 827          * We are being called to activate an image subsequent to a vfork()
 828          * operation; in this case, we know that our task, thread, and
 829          * uthread are actually those of our parent, and our proc, which we
 830          * obtained indirectly from the image_params vfs_context_t, is the
 831          * new child process.
 832          */
 833         if (vfexec || spawn) {
 834                 if (vfexec) {
 835                         imgp->ip_new_thread = fork_create_child(task, NULL, p, FALSE, (imgp->ip_flags & IMGPF_IS_64BIT));
 836                         if (imgp->ip_new_thread == NULL) {
 837                                 error = ENOMEM;
 838                                 goto bad;
 839                         }
 840                 }
 841
 842                 /* reset local idea of thread, uthread, task */
 843                 thread = imgp->ip_new_thread;
 844                 uthread = get_bsdthread_info(thread);
 845                 task = new_task = get_threadtask(thread);
 846                 map = get_task_map(task);
 847         } else {
 848                 map = VM_MAP_NULL;
 849         }
 850
 851         /*
 852          * We set these flags here; this is OK, since if we fail after
 853          * this point, we have already destroyed the parent process anyway.
 854          */
 855         task_set_dyld_info(task, MACH_VM_MIN_ADDRESS, 0);
 856         if (imgp->ip_flags & IMGPF_IS_64BIT) {
 857                 task_set_64bit(task, TRUE);
 858                 OSBitOrAtomic(P_LP64, &p->p_flag);
 859         } else {
 860                 task_set_64bit(task, FALSE);
 861                 OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag);
 862         }
 863
 864         /*
 865          *      Load the Mach-O file.
 866          *
 867          * NOTE: An error after this point  indicates we have potentially
 868          * destroyed or overwritten some process state while attempting an
 869          * execve() following a vfork(), which is an unrecoverable condition.
 870          * We send the new process an immediate SIGKILL to avoid it executing
 871          * any instructions in the mutated address space. For true spawns,
 872          * this is not the case, and "too late" is still not too late to
 873          * return an error code to the parent process.
 874          */
 875
 876         /*
 877          * Actually load the image file we previously decided to load.
 878          */
 879         lret = load_machfile(imgp, mach_header, thread, map, &load_result);
 880
 881         if (lret != LOAD_SUCCESS) {
 882                 error = load_return_to_errno(lret);
 883                 goto badtoolate;
 884         }
 885
 886         proc_lock(p);
 887         p->p_cputype = imgp->ip_origcputype;
 888         p->p_cpusubtype = imgp->ip_origcpusubtype;
 889         proc_unlock(p);
 890
 891         vm_map_set_user_wire_limit(get_task_map(task), p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
 892
 893         /*
 894          * Set code-signing flags if this binary is signed, or if parent has
 895          * requested them on exec.
 896          */
 897         if (load_result.csflags & CS_VALID) {
 898                 imgp->ip_csflags |= load_result.csflags &
 899                         (CS_VALID|
 900                          CS_HARD|CS_KILL|CS_RESTRICT|CS_ENFORCEMENT|CS_REQUIRE_LV|CS_DYLD_PLATFORM|
 901                          CS_EXEC_SET_HARD|CS_EXEC_SET_KILL|CS_EXEC_SET_ENFORCEMENT);
 902         } else {
 903                 imgp->ip_csflags &= ~CS_VALID;
 904         }
 905
 906         if (p->p_csflags & CS_EXEC_SET_HARD)
 907                 imgp->ip_csflags |= CS_HARD;
 908         if (p->p_csflags & CS_EXEC_SET_KILL)
 909                 imgp->ip_csflags |= CS_KILL;
 910         if (p->p_csflags & CS_EXEC_SET_ENFORCEMENT)
 911                 imgp->ip_csflags |= CS_ENFORCEMENT;
 912         if (p->p_csflags & CS_EXEC_SET_INSTALLER)
 913                 imgp->ip_csflags |= CS_INSTALLER;
 914
 915
 916         /*
 917          * Set up the system reserved areas in the new address space.
 918          */
 919         vm_map_exec(get_task_map(task),
 920                     task,
 921                     (void *) p->p_fd->fd_rdir,
 922                     cpu_type());
 923
 924         /*
 925          * Close file descriptors which specify close-on-exec.
 926          */
 927         fdexec(p, psa != NULL ? psa->psa_flags : 0);
 928
 929         /*
 930          * deal with set[ug]id.
 931          */
 932         error = exec_handle_sugid(imgp);
 933         if (error) {
 934                 goto badtoolate;
 935         }
 936
 937         /*
 938          * deal with voucher on exec-calling thread.
 939          */
 940         if (imgp->ip_new_thread == NULL)
 941                 thread_set_mach_voucher(current_thread(), IPC_VOUCHER_NULL);
 942
 943         /* Make sure we won't interrupt ourself signalling a partial process */
 944         if (!vfexec && !spawn && (p->p_lflag & P_LTRACED))
 945                 psignal(p, SIGTRAP);
 946
 947         if (load_result.unixproc &&
 948                 create_unix_stack(get_task_map(task),
 949                                   &load_result,
 950                                   p) != KERN_SUCCESS) {
 951                 error = load_return_to_errno(LOAD_NOSPACE);
 952                 goto badtoolate;
 953         }
 954
 955         if (vfexec || spawn) {
 956                 old_map = vm_map_switch(get_task_map(task));
 957         }
 958
 959         if (load_result.unixproc) {
 960                 user_addr_t     ap;
 961
 962                 /*
 963                  * Copy the strings area out into the new process address
 964                  * space.
 965                  */
 966                 ap = p->user_stack;
 967                 error = exec_copyout_strings(imgp, &ap);
 968                 if (error) {
 969                         if (vfexec || spawn)
 970                                 vm_map_switch(old_map);
 971                         goto badtoolate;
 972                 }
 973                 /* Set the stack */
 974                 thread_setuserstack(thread, ap);
 975         }
 976
 977         if (load_result.dynlinker) {
 978                 uint64_t        ap;
 979                 int                     new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
 980
 981                 /* Adjust the stack */
 982                 ap = thread_adjuserstack(thread, -new_ptr_size);
 983                 error = copyoutptr(load_result.mach_header, ap, new_ptr_size);
 984
 985                 if (error) {
 986                         if (vfexec || spawn)
 987                                 vm_map_switch(old_map);
 988                         goto badtoolate;
 989                 }
 990                 task_set_dyld_info(task, load_result.all_image_info_addr,
 991                     load_result.all_image_info_size);
 992         }
 993
 994         /* Avoid immediate VM faults back into kernel */
 995         exec_prefault_data(p, imgp, &load_result);
 996
 997         if (vfexec || spawn) {
 998                 vm_map_switch(old_map);
 999         }
1000         /* Set the entry point */
1001         thread_setentrypoint(thread, load_result.entry_point);
1002
1003         /* Stop profiling */
1004         stopprofclock(p);
1005
1006         /*
1007          * Reset signal state.
1008          */
1009         execsigs(p, thread);
1010
1011         /*
1012          * need to cancel async IO requests that can be cancelled and wait for those
1013          * already active.  MAY BLOCK!
1014          */
1015         _aio_exec( p );
1016
1017 #if SYSV_SHM
1018         /* FIXME: Till vmspace inherit is fixed: */
1019         if (!vfexec && p->vm_shm)
1020                 shmexec(p);
1021 #endif
1022 #if SYSV_SEM
1023         /* Clean up the semaphores */
1024         semexit(p);
1025 #endif
1026
1027         /*
1028          * Remember file name for accounting.
1029          */
1030         p->p_acflag &= ~AFORK;
1031
1032         /*
1033          * Set p->p_comm and p->p_name to the name passed to exec
1034          */
1035         p_name_len = sizeof(p->p_name) - 1;
1036         if(imgp->ip_ndp->ni_cnd.cn_namelen > p_name_len)
1037                 imgp->ip_ndp->ni_cnd.cn_namelen = p_name_len;
1038         bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_name,
1039                 (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
1040         p->p_name[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';
1041
1042         if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN)
1043                 imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN;
1044         bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm,
1045                 (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
1046         p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';
1047
1048         pal_dbg_set_task_name( p->task );
1049
1050 #if DEVELOPMENT || DEBUG
1051         /*
1052          * Update the pid an proc name for importance base if any
1053          */
1054         task_importance_update_owner_info(p->task);
1055 #endif
1056
1057         memcpy(&p->p_uuid[0], &load_result.uuid[0], sizeof(p->p_uuid));
1058
1059 // <rdar://6598155> dtrace code cleanup needed
1060 #if CONFIG_DTRACE
1061         /*
1062          * Invalidate any predicate evaluation already cached for this thread by DTrace.
1063          * That's because we've just stored to p_comm and DTrace refers to that when it
1064          * evaluates the "execname" special variable. uid and gid may have changed as well.
1065          */
1066         dtrace_set_thread_predcache(current_thread(), 0);
1067
1068         /*
1069          * Free any outstanding lazy dof entries. It is imperative we
1070          * always call dtrace_lazy_dofs_destroy, rather than null check
1071          * and call if !NULL. If we NULL test, during lazy dof faulting
1072          * we can race with the faulting code and proceed from here to
1073          * beyond the helpers cleanup. The lazy dof faulting will then
1074          * install new helpers which no longer belong to this process!
1075          */
1076         dtrace_lazy_dofs_destroy(p);
1077
1078
1079         /*
1080          * Clean up any DTrace helpers for the process.
1081          */
1082         if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
1083                 (*dtrace_helpers_cleanup)(p);
1084         }
1085
1086         /*
1087          * Cleanup the DTrace provider associated with this process.
1088          */
1089         proc_lock(p);
1090         if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
1091                 (*dtrace_fasttrap_exec_ptr)(p);
1092         }
1093         proc_unlock(p);
1094 #endif
1095
1096         if (kdebug_enable) {
1097                 long dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4;
1098
1099                 /*
1100                  * Collect the pathname for tracing
1101                  */
1102                 kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
1103
1104                 if (vfexec || spawn) {
1105                         KERNEL_DEBUG_CONSTANT1(TRACE_DATA_EXEC | DBG_FUNC_NONE,
1106                                         p->p_pid ,0,0,0, (uintptr_t)thread_tid(thread));
1107                         KERNEL_DEBUG_CONSTANT1(TRACE_STRING_EXEC | DBG_FUNC_NONE,
1108                                         dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (uintptr_t)thread_tid(thread));
1109                 } else {
1110                         KERNEL_DEBUG_CONSTANT(TRACE_DATA_EXEC | DBG_FUNC_NONE,
1111                                         p->p_pid ,0,0,0,0);
1112                         KERNEL_DEBUG_CONSTANT(TRACE_STRING_EXEC | DBG_FUNC_NONE,
1113                                         dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
1114                 }
1115         }
1116
1117         /*
1118          * If posix_spawned with the START_SUSPENDED flag, stop the
1119          * process before it runs.
1120          */
1121         if (imgp->ip_px_sa != NULL) {
1122                 psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
1123                 if (psa->psa_flags & POSIX_SPAWN_START_SUSPENDED) {
1124                         proc_lock(p);
1125                         p->p_stat = SSTOP;
1126                         proc_unlock(p);
1127                         (void) task_suspend_internal(p->task);
1128                 }
1129         }
1130
1131         /*
1132          * mark as execed, wakeup the process that vforked (if any) and tell
1133          * it that it now has its own resources back
1134          */
1135         OSBitOrAtomic(P_EXEC, &p->p_flag);
1136         proc_resetregister(p);
1137         if (p->p_pptr && (p->p_lflag & P_LPPWAIT)) {
1138                 proc_lock(p);
1139                 p->p_lflag &= ~P_LPPWAIT;
1140                 proc_unlock(p);
1141                 wakeup((caddr_t)p->p_pptr);
1142         }
1143
1144         /*
1145          * Pay for our earlier safety; deliver the delayed signals from
1146          * the incomplete vfexec process now that it's complete.
1147          */
1148         if (vfexec && (p->p_lflag & P_LTRACED)) {
1149                 psignal_vfork(p, new_task, thread, SIGTRAP);
1150         }
1151
1152         goto done;
1153
1154 badtoolate:
1155         /* Don't allow child process to execute any instructions */
1156         if (!spawn) {
1157                 if (vfexec) {
1158                         psignal_vfork(p, new_task, thread, SIGKILL);
1159                 } else {
1160                         psignal(p, SIGKILL);
1161                 }
1162
1163                 /* We can't stop this system call at this point, so just pretend we succeeded */
1164                 error = 0;
1165         }
1166
1167 done:
1168         if (!spawn) {
1169                 /* notify only if it has not failed due to FP Key error */
1170                 if ((p->p_lflag & P_LTERM_DECRYPTFAIL) == 0)
1171                         proc_knote(p, NOTE_EXEC);
1172         }
1173
1174         /* Drop extra references for cases where we don't expect the caller to clean up */
1175         if (vfexec || (spawn && error == 0)) {
1176                 task_deallocate(new_task);
1177                 thread_deallocate(thread);
1178         }
1179
1180 bad:
1181         return(error);
1182 }
1183
1184
1185
1186
1187 /*
1188  * Our image activator table; this is the table of the image types we are
1189  * capable of loading.  We list them in order of preference to ensure the
1190  * fastest image load speed.
1191  *
1192  * XXX hardcoded, for now; should use linker sets
1193  */
1194 struct execsw {
1195         int (*ex_imgact)(struct image_params *);
1196         const char *ex_name;
1197 } execsw[] = {
1198         { exec_mach_imgact,             "Mach-o Binary" },
1199         { exec_fat_imgact,              "Fat Binary" },
1200         { exec_shell_imgact,            "Interpreter Script" },
1201         { NULL, NULL}
1202 };
1203
1204
1205 /*
1206  * exec_activate_image
1207  *
1208  * Description: Iterate through the available image activators, and activate
1209  *              the image associated with the imgp structure.  We start with
1210  *              the
1211  *
1212  * Parameters:  struct image_params *   Image parameter block
1213  *
1214  * Returns:     0                       Success
1215  *              EBADEXEC                The executable is corrupt/unknown
1216  *      execargs_alloc:EINVAL           Invalid argument
1217  *      execargs_alloc:EACCES           Permission denied
1218  *      execargs_alloc:EINTR            Interrupted function
1219  *      execargs_alloc:ENOMEM           Not enough space
1220  *      exec_save_path:EFAULT           Bad address
1221  *      exec_save_path:ENAMETOOLONG     Filename too long
1222  *      exec_check_permissions:EACCES   Permission denied
1223  *      exec_check_permissions:ENOEXEC  Executable file format error
1224  *      exec_check_permissions:ETXTBSY  Text file busy [misuse of error code]
1225  *      exec_check_permissions:???
1226  *      namei:???
1227  *      vn_rdwr:???                     [anything vn_rdwr can return]
1228  *      <ex_imgact>:???                 [anything an imgact can return]
1229  */
1230 static int
1231 exec_activate_image(struct image_params *imgp)
1232 {
1233         struct nameidata *ndp = NULL;
1234         const char *excpath;
1235         int error;
1236         int resid;
1237         int once = 1;   /* save SGUID-ness for interpreted files */
1238         int i;
1239         int itercount = 0;
1240         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
1241
1242         error = execargs_alloc(imgp);
1243         if (error)
1244                 goto bad_notrans;
1245
1246         error = exec_save_path(imgp, imgp->ip_user_fname, imgp->ip_seg, &excpath);
1247         if (error) {
1248                 goto bad_notrans;
1249         }
1250
1251         /* Use excpath, which contains the copyin-ed exec path */
1252         DTRACE_PROC1(exec, uintptr_t, excpath);
1253
1254         MALLOC(ndp, struct nameidata *, sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO);
1255         if (ndp == NULL) {
1256                 error = ENOMEM;
1257                 goto bad_notrans;
1258         }
1259
1260         NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1,
1261                    UIO_SYSSPACE, CAST_USER_ADDR_T(excpath), imgp->ip_vfs_context);
1262
1263 again:
1264         error = namei(ndp);
1265         if (error)
1266                 goto bad_notrans;
1267         imgp->ip_ndp = ndp;     /* successful namei(); call nameidone() later */
1268         imgp->ip_vp = ndp->ni_vp;       /* if set, need to vnode_put() at some point */
1269
1270         /*
1271          * Before we start the transition from binary A to binary B, make
1272          * sure another thread hasn't started exiting the process.  We grab
1273          * the proc lock to check p_lflag initially, and the transition
1274          * mechanism ensures that the value doesn't change after we release
1275          * the lock.
1276          */
1277         proc_lock(p);
1278         if (p->p_lflag & P_LEXIT) {
1279                 proc_unlock(p);
1280                 goto bad_notrans;
1281         }
1282         error = proc_transstart(p, 1, 0);
1283         proc_unlock(p);
1284         if (error)
1285                 goto bad_notrans;
1286
1287         error = exec_check_permissions(imgp);
1288         if (error)
1289                 goto bad;
1290
1291         /* Copy; avoid invocation of an interpreter overwriting the original */
1292         if (once) {
1293                 once = 0;
1294                 *imgp->ip_origvattr = *imgp->ip_vattr;
1295         }
1296
1297         error = vn_rdwr(UIO_READ, imgp->ip_vp, imgp->ip_vdata, PAGE_SIZE, 0,
1298                         UIO_SYSSPACE, IO_NODELOCKED,
1299                         vfs_context_ucred(imgp->ip_vfs_context),
1300                         &resid, vfs_context_proc(imgp->ip_vfs_context));
1301         if (error)
1302                 goto bad;
1303
1304         if (resid) {
1305                 memset(imgp->ip_vdata + (PAGE_SIZE - resid), 0x0, resid);
1306         }
1307
1308 encapsulated_binary:
1309         /* Limit the number of iterations we will attempt on each binary */
1310         if (++itercount > EAI_ITERLIMIT) {
1311                 error = EBADEXEC;
1312                 goto bad;
1313         }
1314         error = -1;
1315         for(i = 0; error == -1 && execsw[i].ex_imgact != NULL; i++) {
1316
1317                 error = (*execsw[i].ex_imgact)(imgp);
1318
1319                 switch (error) {
1320                 /* case -1: not claimed: continue */
1321                 case -2:                /* Encapsulated binary, imgp->ip_XXX set for next iteration */
1322                         goto encapsulated_binary;
1323
1324                 case -3:                /* Interpreter */
1325 #if CONFIG_MACF
1326                         /*
1327                          * Copy the script label for later use. Note that
1328                          * the label can be different when the script is
1329                          * actually read by the interpreter.
1330                          */
1331                         if (imgp->ip_scriptlabelp)
1332                                 mac_vnode_label_free(imgp->ip_scriptlabelp);
1333                         imgp->ip_scriptlabelp = mac_vnode_label_alloc();
1334                         if (imgp->ip_scriptlabelp == NULL) {
1335                                 error = ENOMEM;
1336                                 break;
1337                         }
1338                         mac_vnode_label_copy(imgp->ip_vp->v_label,
1339                                              imgp->ip_scriptlabelp);
1340
1341                         /*
1342                          * Take a ref of the script vnode for later use.
1343                          */
1344                         if (imgp->ip_scriptvp)
1345                                 vnode_put(imgp->ip_scriptvp);
1346                         if (vnode_getwithref(imgp->ip_vp) == 0)
1347                                 imgp->ip_scriptvp = imgp->ip_vp;
1348 #endif
1349
1350                         nameidone(ndp);
1351
1352                         vnode_put(imgp->ip_vp);
1353                         imgp->ip_vp = NULL;     /* already put */
1354                         imgp->ip_ndp = NULL; /* already nameidone */
1355
1356                         /* Use excpath, which exec_shell_imgact reset to the interpreter */
1357                         NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF,
1358                                    UIO_SYSSPACE, CAST_USER_ADDR_T(excpath), imgp->ip_vfs_context);
1359
1360                         proc_transend(p, 0);
1361                         goto again;
1362
1363                 default:
1364                         break;
1365                 }
1366         }
1367
1368         /*
1369          * Call out to allow 3rd party notification of exec.
1370          * Ignore result of kauth_authorize_fileop call.
1371          */
1372         if (error == 0 && kauth_authorize_fileop_has_listeners()) {
1373                 kauth_authorize_fileop(vfs_context_ucred(imgp->ip_vfs_context),
1374                                         KAUTH_FILEOP_EXEC,
1375                                         (uintptr_t)ndp->ni_vp, 0);
1376         }
1377
1378 bad:
1379         proc_transend(p, 0);
1380
1381 bad_notrans:
1382         if (imgp->ip_strings)
1383                 execargs_free(imgp);
1384         if (imgp->ip_ndp)
1385                 nameidone(imgp->ip_ndp);
1386         if (ndp)
1387                 FREE(ndp, M_TEMP);
1388
1389         return (error);
1390 }
1391
1392
1393 /*
1394  * exec_handle_spawnattr_policy
1395  *
1396  * Description: Decode and apply the posix_spawn apptype, qos clamp, and watchport ports to the task.
1397  *
1398  * Parameters:  proc_t p                process to apply attributes to
1399  *              int psa_apptype         posix spawn attribute apptype
1400  *
1401  * Returns:     0                       Success
1402  */
1403 static errno_t
1404 exec_handle_spawnattr_policy(proc_t p, int psa_apptype, uint64_t psa_qos_clamp, uint64_t psa_darwin_role,
1405                              ipc_port_t * portwatch_ports, int portwatch_count)
1406 {
1407         int apptype     = TASK_APPTYPE_NONE;
1408         int qos_clamp   = THREAD_QOS_UNSPECIFIED;
1409         int role        = TASK_UNSPECIFIED;
1410
1411         if ((psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK) != 0) {
1412                 int proctype = psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK;
1413
1414                 switch(proctype) {
1415                         case POSIX_SPAWN_PROC_TYPE_DAEMON_INTERACTIVE:
1416                                 apptype = TASK_APPTYPE_DAEMON_INTERACTIVE;
1417                                 break;
1418                         case POSIX_SPAWN_PROC_TYPE_DAEMON_STANDARD:
1419                                 apptype = TASK_APPTYPE_DAEMON_STANDARD;
1420                                 break;
1421                         case POSIX_SPAWN_PROC_TYPE_DAEMON_ADAPTIVE:
1422                                 apptype = TASK_APPTYPE_DAEMON_ADAPTIVE;
1423                                 break;
1424                         case POSIX_SPAWN_PROC_TYPE_DAEMON_BACKGROUND:
1425                                 apptype = TASK_APPTYPE_DAEMON_BACKGROUND;
1426                                 break;
1427                         case POSIX_SPAWN_PROC_TYPE_APP_DEFAULT:
1428                                 apptype = TASK_APPTYPE_APP_DEFAULT;
1429                                 break;
1430                         case POSIX_SPAWN_PROC_TYPE_APP_TAL:
1431                                 apptype = TASK_APPTYPE_APP_TAL;
1432                                 break;
1433                         default:
1434                                 apptype = TASK_APPTYPE_NONE;
1435                                 /* TODO: Should an invalid value here fail the spawn? */
1436                                 break;
1437                 }
1438         }
1439
1440         if (psa_qos_clamp != POSIX_SPAWN_PROC_CLAMP_NONE) {
1441                 switch (psa_qos_clamp) {
1442                         case POSIX_SPAWN_PROC_CLAMP_UTILITY:
1443                                 qos_clamp = THREAD_QOS_UTILITY;
1444                                 break;
1445                         case POSIX_SPAWN_PROC_CLAMP_BACKGROUND:
1446                                 qos_clamp = THREAD_QOS_BACKGROUND;
1447                                 break;
1448                         case POSIX_SPAWN_PROC_CLAMP_MAINTENANCE:
1449                                 qos_clamp = THREAD_QOS_MAINTENANCE;
1450                                 break;
1451                         default:
1452                                 qos_clamp = THREAD_QOS_UNSPECIFIED;
1453                                 /* TODO: Should an invalid value here fail the spawn? */
1454                                 break;
1455                 }
1456         }
1457
1458         if (psa_darwin_role != PRIO_DARWIN_ROLE_DEFAULT) {
1459                 proc_darwin_role_to_task_role(psa_darwin_role, &role);
1460         }
1461
1462         if (apptype   != TASK_APPTYPE_NONE      ||
1463             qos_clamp != THREAD_QOS_UNSPECIFIED ||
1464             role      != TASK_UNSPECIFIED) {
1465                 proc_set_task_spawnpolicy(p->task, apptype, qos_clamp, role,
1466                                           portwatch_ports, portwatch_count);
1467         }
1468
1469         return (0);
1470 }
1471
1472
1473 /*
1474  * exec_handle_port_actions
1475  *
1476  * Description: Go through the _posix_port_actions_t contents,
1477  *              calling task_set_special_port, task_set_exception_ports
1478  *              and/or audit_session_spawnjoin for the current task.
1479  *
1480  * Parameters:  struct image_params *   Image parameter block
1481  *              short psa_flags         posix spawn attribute flags
1482  *
1483  * Returns:     0                       Success
1484  *              EINVAL                  Failure
1485  *              ENOTSUP                 Illegal posix_spawn attr flag was set
1486  */
1487 static errno_t
1488 exec_handle_port_actions(struct image_params *imgp, short psa_flags, boolean_t * portwatch_present, ipc_port_t * portwatch_ports)
1489 {
1490         _posix_spawn_port_actions_t pacts = imgp->ip_px_spa;
1491         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
1492         _ps_port_action_t *act = NULL;
1493         task_t task = p->task;
1494         ipc_port_t port = NULL;
1495         errno_t ret = 0;
1496         int i;
1497
1498         *portwatch_present = FALSE;
1499
1500         for (i = 0; i < pacts->pspa_count; i++) {
1501                 act = &pacts->pspa_actions[i];
1502
1503                 if (ipc_object_copyin(get_task_ipcspace(current_task()),
1504                     act->new_port, MACH_MSG_TYPE_COPY_SEND,
1505                     (ipc_object_t *) &port) != KERN_SUCCESS) {
1506                         ret = EINVAL;
1507                         goto done;
1508                 }
1509
1510                 switch (act->port_type) {
1511                 case PSPA_SPECIAL:
1512                         /* Only allowed when not under vfork */
1513                         if (!(psa_flags & POSIX_SPAWN_SETEXEC))
1514                                 ret = ENOTSUP;
1515                         else if (task_set_special_port(task,
1516                         act->which, port) != KERN_SUCCESS)
1517                                 ret = EINVAL;
1518                         break;
1519
1520                 case PSPA_EXCEPTION:
1521                         /* Only allowed when not under vfork */
1522                         if (!(psa_flags & POSIX_SPAWN_SETEXEC))
1523                                 ret = ENOTSUP;
1524                         else if (task_set_exception_ports(task,
1525                         act->mask, port, act->behavior,
1526                         act->flavor) != KERN_SUCCESS)
1527                                 ret = EINVAL;
1528                         break;
1529 #if CONFIG_AUDIT
1530                 case PSPA_AU_SESSION:
1531                         ret = audit_session_spawnjoin(p, port);
1532                         break;
1533 #endif
1534                 case PSPA_IMP_WATCHPORTS:
1535                         if (portwatch_ports != NULL) {
1536                                 *portwatch_present = TRUE;
1537                                 /* hold on to this till end of spawn */
1538                                 portwatch_ports[i] = port;
1539                                 ret = 0;
1540                         } else
1541                                 ipc_port_release_send(port);
1542                         break;
1543                 default:
1544                         ret = EINVAL;
1545                         break;
1546                 }
1547
1548                 /* action failed, so release port resources */
1549
1550                 if (ret) {
1551                         ipc_port_release_send(port);
1552                         break;
1553                 }
1554         }
1555
1556 done:
1557         if (0 != ret)
1558                 DTRACE_PROC1(spawn__port__failure, mach_port_name_t, act->new_port);
1559         return (ret);
1560 }
1561
1562 /*
1563  * exec_handle_file_actions
1564  *
1565  * Description: Go through the _posix_file_actions_t contents applying the
1566  *              open, close, and dup2 operations to the open file table for
1567  *              the current process.
1568  *
1569  * Parameters:  struct image_params *   Image parameter block
1570  *
1571  * Returns:     0                       Success
1572  *              ???
1573  *
1574  * Note:        Actions are applied in the order specified, with the credential
1575  *              of the parent process.  This is done to permit the parent
1576  *              process to utilize POSIX_SPAWN_RESETIDS to drop privilege in
1577  *              the child following operations the child may in fact not be
1578  *              normally permitted to perform.
1579  */
1580 static int
1581 exec_handle_file_actions(struct image_params *imgp, short psa_flags)
1582 {
1583         int error = 0;
1584         int action;
1585         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
1586         _posix_spawn_file_actions_t px_sfap = imgp->ip_px_sfa;
1587         int ival[2];            /* dummy retval for system calls) */
1588
1589         for (action = 0; action < px_sfap->psfa_act_count; action++) {
1590                 _psfa_action_t *psfa = &px_sfap->psfa_act_acts[ action];
1591
1592                 switch(psfa->psfaa_type) {
1593                 case PSFA_OPEN: {
1594                         /*
1595                          * Open is different, in that it requires the use of
1596                          * a path argument, which is normally copied in from
1597                          * user space; because of this, we have to support an
1598                          * open from kernel space that passes an address space
1599                          * context of UIO_SYSSPACE, and casts the address
1600                          * argument to a user_addr_t.
1601                          */
1602                         char *bufp = NULL;
1603                         struct vnode_attr *vap;
1604                         struct nameidata *ndp;
1605                         int mode = psfa->psfaa_openargs.psfao_mode;
1606                         struct dup2_args dup2a;
1607                         struct close_nocancel_args ca;
1608                         int origfd;
1609
1610                         MALLOC(bufp, char *, sizeof(*vap) + sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO);
1611                         if (bufp == NULL) {
1612                                 error = ENOMEM;
1613                                 break;
1614                         }
1615
1616                         vap = (struct vnode_attr *) bufp;
1617                         ndp = (struct nameidata *) (bufp + sizeof(*vap));
1618
1619                         VATTR_INIT(vap);
1620                         /* Mask off all but regular access permissions */
1621                         mode = ((mode &~ p->p_fd->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1622                         VATTR_SET(vap, va_mode, mode & ACCESSPERMS);
1623
1624                         NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_SYSSPACE,
1625                                CAST_USER_ADDR_T(psfa->psfaa_openargs.psfao_path),
1626                                imgp->ip_vfs_context);
1627
1628                         error = open1(imgp->ip_vfs_context,
1629                                         ndp,
1630                                         psfa->psfaa_openargs.psfao_oflag,
1631                                         vap,
1632                                         fileproc_alloc_init, NULL,
1633                                         ival);
1634
1635                         FREE(bufp, M_TEMP);
1636
1637                         /*
1638                          * If there's an error, or we get the right fd by
1639                          * accident, then drop out here.  This is easier than
1640                          * reworking all the open code to preallocate fd
1641                          * slots, and internally taking one as an argument.
1642                          */
1643                         if (error || ival[0] == psfa->psfaa_filedes)
1644                                 break;
1645
1646                         origfd = ival[0];
1647                         /*
1648                          * If we didn't fall out from an error, we ended up
1649                          * with the wrong fd; so now we've got to try to dup2
1650                          * it to the right one.
1651                          */
1652                         dup2a.from = origfd;
1653                         dup2a.to = psfa->psfaa_filedes;
1654
1655                         /*
1656                          * The dup2() system call implementation sets
1657                          * ival to newfd in the success case, but we
1658                          * can ignore that, since if we didn't get the
1659                          * fd we wanted, the error will stop us.
1660                          */
1661                         error = dup2(p, &dup2a, ival);
1662                         if (error)
1663                                 break;
1664
1665                         /*
1666                          * Finally, close the original fd.
1667                          */
1668                         ca.fd = origfd;
1669
1670                         error = close_nocancel(p, &ca, ival);
1671                         }
1672                         break;
1673
1674                 case PSFA_DUP2: {
1675                         struct dup2_args dup2a;
1676
1677                         dup2a.from = psfa->psfaa_filedes;
1678                         dup2a.to = psfa->psfaa_openargs.psfao_oflag;
1679
1680                         /*
1681                          * The dup2() system call implementation sets
1682                          * ival to newfd in the success case, but we
1683                          * can ignore that, since if we didn't get the
1684                          * fd we wanted, the error will stop us.
1685                          */
1686                         error = dup2(p, &dup2a, ival);
1687                         }
1688                         break;
1689
1690                 case PSFA_CLOSE: {
1691                         struct close_nocancel_args ca;
1692
1693                         ca.fd = psfa->psfaa_filedes;
1694
1695                         error = close_nocancel(p, &ca, ival);
1696                         }
1697                         break;
1698
1699                 case PSFA_INHERIT: {
1700                         struct fcntl_nocancel_args fcntla;
1701
1702                         /*
1703                          * Check to see if the descriptor exists, and
1704                          * ensure it's -not- marked as close-on-exec.
1705                          *
1706                          * Attempting to "inherit" a guarded fd will
1707                          * result in a error.
1708                          */
1709                         fcntla.fd = psfa->psfaa_filedes;
1710                         fcntla.cmd = F_GETFD;
1711                         if ((error = fcntl_nocancel(p, &fcntla, ival)) != 0)
1712                                 break;
1713
1714                         if ((ival[0] & FD_CLOEXEC) == FD_CLOEXEC) {
1715                                 fcntla.fd = psfa->psfaa_filedes;
1716                                 fcntla.cmd = F_SETFD;
1717                                 fcntla.arg = ival[0] & ~FD_CLOEXEC;
1718                                 error = fcntl_nocancel(p, &fcntla, ival);
1719                         }
1720
1721                         }
1722                         break;
1723
1724                 default:
1725                         error = EINVAL;
1726                         break;
1727                 }
1728
1729                 /* All file actions failures are considered fatal, per POSIX */
1730
1731                 if (error) {
1732                         if (PSFA_OPEN == psfa->psfaa_type) {
1733                                 DTRACE_PROC1(spawn__open__failure, uintptr_t,
1734                                     psfa->psfaa_openargs.psfao_path);
1735                         } else {
1736                                 DTRACE_PROC1(spawn__fd__failure, int, psfa->psfaa_filedes);
1737                         }
1738                         break;
1739                 }
1740         }
1741
1742         if (error != 0 || (psa_flags & POSIX_SPAWN_CLOEXEC_DEFAULT) == 0)
1743                 return (error);
1744
1745         /*
1746          * If POSIX_SPAWN_CLOEXEC_DEFAULT is set, behave (during
1747          * this spawn only) as if "close on exec" is the default
1748          * disposition of all pre-existing file descriptors.  In this case,
1749          * the list of file descriptors mentioned in the file actions
1750          * are the only ones that can be inherited, so mark them now.
1751          *
1752          * The actual closing part comes later, in fdexec().
1753          */
1754         proc_fdlock(p);
1755         for (action = 0; action < px_sfap->psfa_act_count; action++) {
1756                 _psfa_action_t *psfa = &px_sfap->psfa_act_acts[action];
1757                 int fd = psfa->psfaa_filedes;
1758
1759                 switch (psfa->psfaa_type) {
1760                 case PSFA_DUP2:
1761                         fd = psfa->psfaa_openargs.psfao_oflag;
1762                         /*FALLTHROUGH*/
1763                 case PSFA_OPEN:
1764                 case PSFA_INHERIT:
1765                         *fdflags(p, fd) |= UF_INHERIT;
1766                         break;
1767
1768                 case PSFA_CLOSE:
1769                         break;
1770                 }
1771         }
1772         proc_fdunlock(p);
1773
1774         return (0);
1775 }
1776
1777 #if CONFIG_MACF
1778 /*
1779  * exec_spawnattr_getmacpolicyinfo
1780  */
1781 void *
1782 exec_spawnattr_getmacpolicyinfo(const void *macextensions, const char *policyname, size_t *lenp)
1783 {
1784         const struct _posix_spawn_mac_policy_extensions *psmx = macextensions;
1785         int i;
1786
1787         if (psmx == NULL)
1788                 return NULL;
1789
1790         for (i = 0; i < psmx->psmx_count; i++) {
1791                 const _ps_mac_policy_extension_t *extension = &psmx->psmx_extensions[i];
1792                 if (strncmp(extension->policyname, policyname, sizeof(extension->policyname)) == 0) {
1793                         if (lenp != NULL)
1794                                 *lenp = extension->datalen;
1795                         return extension->datap;
1796                 }
1797         }
1798
1799         if (lenp != NULL)
1800                 *lenp = 0;
1801         return NULL;
1802 }
1803
1804 static int
1805 spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args, _posix_spawn_mac_policy_extensions_t *psmxp)
1806 {
1807         _posix_spawn_mac_policy_extensions_t psmx = NULL;
1808         int error = 0;
1809         int copycnt = 0;
1810         int i = 0;
1811
1812         *psmxp = NULL;
1813
1814         if (px_args->mac_extensions_size < PS_MAC_EXTENSIONS_SIZE(1) ||
1815             px_args->mac_extensions_size > PAGE_SIZE) {
1816                 error = EINVAL;
1817                 goto bad;
1818         }
1819
1820         MALLOC(psmx, _posix_spawn_mac_policy_extensions_t, px_args->mac_extensions_size, M_TEMP, M_WAITOK);
1821         if ((error = copyin(px_args->mac_extensions, psmx, px_args->mac_extensions_size)) != 0)
1822                 goto bad;
1823
1824         if (PS_MAC_EXTENSIONS_SIZE(psmx->psmx_count) > px_args->mac_extensions_size) {
1825                 error = EINVAL;
1826                 goto bad;
1827         }
1828
1829         for (i = 0; i < psmx->psmx_count; i++) {
1830                 _ps_mac_policy_extension_t *extension = &psmx->psmx_extensions[i];
1831                 if (extension->datalen == 0 || extension->datalen > PAGE_SIZE) {
1832                         error = EINVAL;
1833                         goto bad;
1834                 }
1835         }
1836
1837         for (copycnt = 0; copycnt < psmx->psmx_count; copycnt++) {
1838                 _ps_mac_policy_extension_t *extension = &psmx->psmx_extensions[copycnt];
1839                 void *data = NULL;
1840
1841                 MALLOC(data, void *, extension->datalen, M_TEMP, M_WAITOK);
1842                 if ((error = copyin(extension->data, data, extension->datalen)) != 0) {
1843                         FREE(data, M_TEMP);
1844                         goto bad;
1845                 }
1846                 extension->datap = data;
1847         }
1848
1849         *psmxp = psmx;
1850         return 0;
1851
1852 bad:
1853         if (psmx != NULL) {
1854                 for (i = 0; i < copycnt; i++)
1855                         FREE(psmx->psmx_extensions[i].datap, M_TEMP);
1856                 FREE(psmx, M_TEMP);
1857         }
1858         return error;
1859 }
1860
1861 static void
1862 spawn_free_macpolicyinfo(_posix_spawn_mac_policy_extensions_t psmx)
1863 {
1864         int i;
1865
1866         if (psmx == NULL)
1867                 return;
1868         for (i = 0; i < psmx->psmx_count; i++)
1869                 FREE(psmx->psmx_extensions[i].datap, M_TEMP);
1870         FREE(psmx, M_TEMP);
1871 }
1872 #endif /* CONFIG_MACF */
1873
1874 #if CONFIG_COALITIONS
1875 static inline void spawn_coalitions_release_all(coalition_t coal[COALITION_NUM_TYPES])
1876 {
1877         for (int c = 0; c < COALITION_NUM_TYPES; c++) {
1878                 if (coal[c]) {
1879                         coalition_remove_active(coal[c]);
1880                         coalition_release(coal[c]);
1881                 }
1882         }
1883 }
1884 #endif
1885
1886 void
1887 proc_set_return_wait(proc_t p)
1888 {
1889         proc_lock(p);
1890         p->p_lflag |= P_LRETURNWAIT;
1891         proc_unlock(p);
1892 }
1893
1894 void
1895 proc_clear_return_wait(proc_t p, thread_t child_thread)
1896 {
1897         proc_lock(p);
1898
1899         p->p_lflag &= ~P_LRETURNWAIT;
1900         if (p->p_lflag & P_LRETURNWAITER) {
1901                 wakeup(&p->p_lflag);
1902         }
1903
1904         proc_unlock(p);
1905
1906         (void)thread_resume(child_thread);
1907 }
1908
1909 void
1910 proc_wait_to_return()
1911 {
1912         proc_t  p;
1913
1914         p = current_proc();
1915         proc_lock(p);
1916
1917         if (p->p_lflag & P_LRETURNWAIT) {
1918                 p->p_lflag |= P_LRETURNWAITER;
1919                 do {
1920                         msleep(&p->p_lflag, &p->p_mlock, 0,
1921                                 "thread_check_setup_complete", NULL);
1922                 } while (p->p_lflag & P_LRETURNWAIT);
1923                 p->p_lflag &= ~P_LRETURNWAITER;
1924         }
1925
1926         proc_unlock(p);
1927         thread_bootstrap_return();
1928 }
1929
1930 /*
1931  * posix_spawn
1932  *
1933  * Parameters:  uap->pid                Pointer to pid return area
1934  *              uap->fname              File name to exec
1935  *              uap->argp               Argument list
1936  *              uap->envp               Environment list
1937  *
1938  * Returns:     0                       Success
1939  *              EINVAL                  Invalid argument
1940  *              ENOTSUP                 Not supported
1941  *              ENOEXEC                 Executable file format error
1942  *      exec_activate_image:EINVAL      Invalid argument
1943  *      exec_activate_image:EACCES      Permission denied
1944  *      exec_activate_image:EINTR       Interrupted function
1945  *      exec_activate_image:ENOMEM      Not enough space
1946  *      exec_activate_image:EFAULT      Bad address
1947  *      exec_activate_image:ENAMETOOLONG        Filename too long
1948  *      exec_activate_image:ENOEXEC     Executable file format error
1949  *      exec_activate_image:ETXTBSY     Text file busy [misuse of error code]
1950  *      exec_activate_image:EBADEXEC    The executable is corrupt/unknown
1951  *      exec_activate_image:???
1952  *      mac_execve_enter:???
1953  *
1954  * TODO:        Expect to need __mac_posix_spawn() at some point...
1955  *              Handle posix_spawnattr_t
1956  *              Handle posix_spawn_file_actions_t
1957  */
1958 int
1959 posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
1960 {
1961         proc_t p = ap;          /* quiet bogus GCC vfork() warning */
1962         user_addr_t pid = uap->pid;
1963         int ival[2];            /* dummy retval for setpgid() */
1964         char *bufp = NULL;
1965         struct image_params *imgp;
1966         struct vnode_attr *vap;
1967         struct vnode_attr *origvap;
1968         struct uthread  *uthread = 0;   /* compiler complains if not set to 0*/
1969         int error, sig;
1970         int is_64 = IS_64BIT_PROCESS(p);
1971         struct vfs_context context;
1972         struct user__posix_spawn_args_desc px_args;
1973         struct _posix_spawnattr px_sa;
1974         _posix_spawn_file_actions_t px_sfap = NULL;
1975         _posix_spawn_port_actions_t px_spap = NULL;
1976         struct __kern_sigaction vec;
1977         boolean_t spawn_no_exec = FALSE;
1978         boolean_t proc_transit_set = TRUE;
1979         boolean_t exec_done = FALSE;
1980         int portwatch_count = 0;
1981         ipc_port_t * portwatch_ports = NULL;
1982         vm_size_t px_sa_offset = offsetof(struct _posix_spawnattr, psa_ports);
1983
1984         /*
1985          * Allocate a big chunk for locals instead of using stack since these
1986          * structures are pretty big.
1987          */
1988         MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO);
1989         imgp = (struct image_params *) bufp;
1990         if (bufp == NULL) {
1991                 error = ENOMEM;
1992                 goto bad;
1993         }
1994         vap = (struct vnode_attr *) (bufp + sizeof(*imgp));
1995         origvap = (struct vnode_attr *) (bufp + sizeof(*imgp) + sizeof(*vap));
1996
1997         /* Initialize the common data in the image_params structure */
1998         imgp->ip_user_fname = uap->path;
1999         imgp->ip_user_argv = uap->argv;
2000         imgp->ip_user_envv = uap->envp;
2001         imgp->ip_vattr = vap;
2002         imgp->ip_origvattr = origvap;
2003         imgp->ip_vfs_context = &context;
2004         imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE);
2005         imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
2006         imgp->ip_mac_return = 0;
2007         imgp->ip_reserved = NULL;
2008
2009         if (uap->adesc != USER_ADDR_NULL) {
2010                 if(is_64) {
2011                         error = copyin(uap->adesc, &px_args, sizeof(px_args));
2012                 } else {
2013                         struct user32__posix_spawn_args_desc px_args32;
2014
2015                         error = copyin(uap->adesc, &px_args32, sizeof(px_args32));
2016
2017                         /*
2018                          * Convert arguments descriptor from external 32 bit
2019                          * representation to internal 64 bit representation
2020                          */
2021                         px_args.attr_size = px_args32.attr_size;
2022                         px_args.attrp = CAST_USER_ADDR_T(px_args32.attrp);
2023                         px_args.file_actions_size = px_args32.file_actions_size;
2024                         px_args.file_actions = CAST_USER_ADDR_T(px_args32.file_actions);
2025                         px_args.port_actions_size = px_args32.port_actions_size;
2026                         px_args.port_actions = CAST_USER_ADDR_T(px_args32.port_actions);
2027                         px_args.mac_extensions_size = px_args32.mac_extensions_size;
2028                         px_args.mac_extensions = CAST_USER_ADDR_T(px_args32.mac_extensions);
2029                         px_args.coal_info_size = px_args32.coal_info_size;
2030                         px_args.coal_info = CAST_USER_ADDR_T(px_args32.coal_info);
2031                         px_args.reserved = 0;
2032                         px_args.reserved_size = 0;
2033                 }
2034                 if (error)
2035                         goto bad;
2036
2037                 if (px_args.attr_size != 0) {
2038                         /*
2039                          * We are not copying the port_actions pointer,
2040                          * because we already have it from px_args.
2041                          * This is a bit fragile: <rdar://problem/16427422>
2042                          */
2043
2044                         if ((error = copyin(px_args.attrp, &px_sa, px_sa_offset) != 0))
2045                         goto bad;
2046
2047                         bzero( (void *)( (unsigned long) &px_sa + px_sa_offset), sizeof(px_sa) - px_sa_offset );
2048
2049                         imgp->ip_px_sa = &px_sa;
2050                 }
2051                 if (px_args.file_actions_size != 0) {
2052                         /* Limit file_actions to allowed number of open files */
2053                         int maxfa = (p->p_limit ? p->p_rlimit[RLIMIT_NOFILE].rlim_cur : NOFILE);
2054                         if (px_args.file_actions_size < PSF_ACTIONS_SIZE(1) ||
2055                                 px_args.file_actions_size > PSF_ACTIONS_SIZE(maxfa)) {
2056                                 error = EINVAL;
2057                                 goto bad;
2058                         }
2059                         MALLOC(px_sfap, _posix_spawn_file_actions_t, px_args.file_actions_size, M_TEMP, M_WAITOK);
2060                         if (px_sfap == NULL) {
2061                                 error = ENOMEM;
2062                                 goto bad;
2063                         }
2064                         imgp->ip_px_sfa = px_sfap;
2065
2066                         if ((error = copyin(px_args.file_actions, px_sfap,
2067                                                         px_args.file_actions_size)) != 0)
2068                                 goto bad;
2069
2070                         /* Verify that the action count matches the struct size */
2071                         if (PSF_ACTIONS_SIZE(px_sfap->psfa_act_count) != px_args.file_actions_size) {
2072                                 error = EINVAL;
2073                                 goto bad;
2074                         }
2075                 }
2076                 if (px_args.port_actions_size != 0) {
2077                         /* Limit port_actions to one page of data */
2078                         if (px_args.port_actions_size < PS_PORT_ACTIONS_SIZE(1) ||
2079                                 px_args.port_actions_size > PAGE_SIZE) {
2080                                 error = EINVAL;
2081                                 goto bad;
2082                         }
2083
2084                         MALLOC(px_spap, _posix_spawn_port_actions_t,
2085                                         px_args.port_actions_size, M_TEMP, M_WAITOK);
2086                         if (px_spap == NULL) {
2087                                 error = ENOMEM;
2088                                 goto bad;
2089                         }
2090                         imgp->ip_px_spa = px_spap;
2091
2092                         if ((error = copyin(px_args.port_actions, px_spap,
2093                                                         px_args.port_actions_size)) != 0)
2094                                 goto bad;
2095
2096                         /* Verify that the action count matches the struct size */
2097                         if (PS_PORT_ACTIONS_SIZE(px_spap->pspa_count) != px_args.port_actions_size) {
2098                                 error = EINVAL;
2099                                 goto bad;
2100                         }
2101                 }
2102
2103 #if CONFIG_MACF
2104                 if (px_args.mac_extensions_size != 0) {
2105                         if ((error = spawn_copyin_macpolicyinfo(&px_args, (_posix_spawn_mac_policy_extensions_t *)&imgp->ip_px_smpx)) != 0)
2106                                 goto bad;
2107                 }
2108 #endif /* CONFIG_MACF */
2109         }
2110
2111         /* set uthread to parent */
2112         uthread = get_bsdthread_info(current_thread());
2113
2114         /*
2115          * <rdar://6640530>; this does not result in a behaviour change
2116          * relative to Leopard, so there should not be any existing code
2117          * which depends on it.
2118          */
2119         if (uthread->uu_flag & UT_VFORK) {
2120             error = EINVAL;
2121             goto bad;
2122         }
2123
2124         /*
2125          * If we don't have the extension flag that turns "posix_spawn()"
2126          * into "execve() with options", then we will be creating a new
2127          * process which does not inherit memory from the parent process,
2128          * which is one of the most expensive things about using fork()
2129          * and execve().
2130          */
2131         if (imgp->ip_px_sa == NULL || !(px_sa.psa_flags & POSIX_SPAWN_SETEXEC)){
2132
2133                 /* Set the new task's coalition, if it is requested.  */
2134                 coalition_t coal[COALITION_NUM_TYPES] = { COALITION_NULL };
2135 #if CONFIG_COALITIONS
2136                 int i, ncoals;
2137                 kern_return_t kr = KERN_SUCCESS;
2138                 struct _posix_spawn_coalition_info coal_info;
2139                 int coal_role[COALITION_NUM_TYPES];
2140
2141                 if (imgp->ip_px_sa == NULL || !px_args.coal_info)
2142                         goto do_fork1;
2143
2144                 memset(&coal_info, 0, sizeof(coal_info));
2145
2146                 if (px_args.coal_info_size > sizeof(coal_info))
2147                         px_args.coal_info_size = sizeof(coal_info);
2148                 error = copyin(px_args.coal_info,
2149                                &coal_info, px_args.coal_info_size);
2150                 if (error != 0)
2151                         goto bad;
2152
2153                 ncoals = 0;
2154                 for (i = 0; i < COALITION_NUM_TYPES; i++) {
2155                         uint64_t cid = coal_info.psci_info[i].psci_id;
2156                         if (cid != 0) {
2157                                 /*
2158                                  * don't allow tasks which are not in a
2159                                  * privileged coalition to spawn processes
2160                                  * into coalitions other than their own
2161                                  */
2162                                 if (!task_is_in_privileged_coalition(p->task, i)) {
2163                                         coal_dbg("ERROR: %d not in privilegd "
2164                                                  "coalition of type %d",
2165                                                  p->p_pid, i);
2166                                         spawn_coalitions_release_all(coal);
2167                                         error = EPERM;
2168                                         goto bad;
2169                                 }
2170
2171                                 coal_dbg("searching for coalition id:%llu", cid);
2172                                 /*
2173                                  * take a reference and activation on the
2174                                  * coalition to guard against free-while-spawn
2175                                  * races
2176                                  */
2177                                 coal[i] = coalition_find_and_activate_by_id(cid);
2178                                 if (coal[i] == COALITION_NULL) {
2179                                         coal_dbg("could not find coalition id:%llu "
2180                                                  "(perhaps it has been terminated or reaped)", cid);
2181                                         /*
2182                                          * release any other coalition's we
2183                                          * may have a reference to
2184                                          */
2185                                         spawn_coalitions_release_all(coal);
2186                                         error = ESRCH;
2187                                         goto bad;
2188                                 }
2189                                 if (coalition_type(coal[i]) != i) {
2190                                         coal_dbg("coalition with id:%lld is not of type:%d"
2191                                                  " (it's type:%d)", cid, i, coalition_type(coal[i]));
2192                                         error = ESRCH;
2193                                         goto bad;
2194                                 }
2195                                 coal_role[i] = coal_info.psci_info[i].psci_role;
2196                                 ncoals++;
2197                         }
2198                 }
2199                 if (ncoals < COALITION_NUM_TYPES) {
2200                         /*
2201                          * If the user is attempting to spawn into a subset of
2202                          * the known coalition types, then make sure they have
2203                          * _at_least_ specified a resource coalition. If not,
2204                          * the following fork1() call will implicitly force an
2205                          * inheritance from 'p' and won't actually spawn the
2206                          * new task into the coalitions the user specified.
2207                          * (also the call to coalitions_set_roles will panic)
2208                          */
2209                         if (coal[COALITION_TYPE_RESOURCE] == COALITION_NULL) {
2210                                 spawn_coalitions_release_all(coal);
2211                                 error = EINVAL;
2212                                 goto bad;
2213                         }
2214                 }
2215 do_fork1:
2216 #endif /* CONFIG_COALITIONS */
2217
2218                 error = fork1(p, &imgp->ip_new_thread, PROC_CREATE_SPAWN, coal);
2219
2220 #if CONFIG_COALITIONS
2221                 /* set the roles of this task within each given coalition */
2222                 if (error == 0) {
2223                         kr = coalitions_set_roles(coal, get_threadtask(imgp->ip_new_thread), coal_role);
2224                         if (kr != KERN_SUCCESS)
2225                                 error = EINVAL;
2226                 }
2227
2228                 /* drop our references and activations - fork1() now holds them */
2229                 spawn_coalitions_release_all(coal);
2230 #endif /* CONFIG_COALITIONS */
2231                 if (error != 0) {
2232                         goto bad;
2233                 }
2234                 imgp->ip_flags |= IMGPF_SPAWN;  /* spawn w/o exec */
2235                 spawn_no_exec = TRUE;           /* used in later tests */
2236
2237         }
2238
2239         if (spawn_no_exec) {
2240                 p = (proc_t)get_bsdthreadtask_info(imgp->ip_new_thread);
2241
2242                 /*
2243                  * We had to wait until this point before firing the
2244                  * proc:::create probe, otherwise p would not point to the
2245                  * child process.
2246                  */
2247                 DTRACE_PROC1(create, proc_t, p);
2248         }
2249         assert(p != NULL);
2250
2251         /* By default, the thread everyone plays with is the parent */
2252         context.vc_thread = current_thread();
2253         context.vc_ucred = p->p_ucred;  /* XXX must NOT be kauth_cred_get() */
2254
2255         /*
2256          * However, if we're not in the setexec case, redirect the context
2257          * to the newly created process instead
2258          */
2259         if (spawn_no_exec)
2260                 context.vc_thread = imgp->ip_new_thread;
2261
2262         /*
2263          * Post fdcopy(), pre exec_handle_sugid() - this is where we want
2264          * to handle the file_actions.  Since vfork() also ends up setting
2265          * us into the parent process group, and saved off the signal flags,
2266          * this is also where we want to handle the spawn flags.
2267          */
2268
2269         /* Has spawn file actions? */
2270         if (imgp->ip_px_sfa != NULL) {
2271                 /*
2272                  * The POSIX_SPAWN_CLOEXEC_DEFAULT flag
2273                  * is handled in exec_handle_file_actions().
2274                  */
2275                 if ((error = exec_handle_file_actions(imgp,
2276                     imgp->ip_px_sa != NULL ? px_sa.psa_flags : 0)) != 0)
2277                         goto bad;
2278         }
2279
2280         /* Has spawn port actions? */
2281         if (imgp->ip_px_spa != NULL) {
2282                 boolean_t is_adaptive = FALSE;
2283                 boolean_t portwatch_present = FALSE;
2284
2285                 /* Will this process become adaptive? The apptype isn't ready yet, so we can't look there. */
2286                 if (imgp->ip_px_sa != NULL && px_sa.psa_apptype == POSIX_SPAWN_PROC_TYPE_DAEMON_ADAPTIVE)
2287                         is_adaptive = TRUE;
2288
2289                 /*
2290                  * portwatch only:
2291                  * Allocate a place to store the ports we want to bind to the new task
2292                  * We can't bind them until after the apptype is set.
2293                  */
2294                 if (px_spap->pspa_count != 0 && is_adaptive) {
2295                         portwatch_count = px_spap->pspa_count;
2296                         MALLOC(portwatch_ports, ipc_port_t *, (sizeof(ipc_port_t) * portwatch_count), M_TEMP, M_WAITOK | M_ZERO);
2297                 } else {
2298                         portwatch_ports = NULL;
2299                 }
2300
2301                 if ((error = exec_handle_port_actions(imgp,
2302                     imgp->ip_px_sa != NULL ? px_sa.psa_flags : 0, &portwatch_present, portwatch_ports)) != 0)
2303                         goto bad;
2304
2305                 if (portwatch_present == FALSE && portwatch_ports != NULL) {
2306                         FREE(portwatch_ports, M_TEMP);
2307                         portwatch_ports = NULL;
2308                         portwatch_count = 0;
2309                 }
2310         }
2311
2312         /* Has spawn attr? */
2313         if (imgp->ip_px_sa != NULL) {
2314                 /*
2315                  * Set the process group ID of the child process; this has
2316                  * to happen before the image activation.
2317                  */
2318                 if (px_sa.psa_flags & POSIX_SPAWN_SETPGROUP) {
2319                         struct setpgid_args spga;
2320                         spga.pid = p->p_pid;
2321                         spga.pgid = px_sa.psa_pgroup;
2322                         /*
2323                          * Effectively, call setpgid() system call; works
2324                          * because there are no pointer arguments.
2325                          */
2326                         if((error = setpgid(p, &spga, ival)) != 0)
2327                                 goto bad;
2328                 }
2329
2330                 /*
2331                  * Reset UID/GID to parent's RUID/RGID; This works only
2332                  * because the operation occurs *after* the vfork() and
2333                  * before the call to exec_handle_sugid() by the image
2334                  * activator called from exec_activate_image().  POSIX
2335                  * requires that any setuid/setgid bits on the process
2336                  * image will take precedence over the spawn attributes
2337                  * (re)setting them.
2338                  *
2339                  * The use of p_ucred is safe, since we are acting on the
2340                  * new process, and it has no threads other than the one
2341                  * we are creating for it.
2342                  */
2343                 if (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) {
2344                         kauth_cred_t my_cred = p->p_ucred;
2345                         kauth_cred_t my_new_cred = kauth_cred_setuidgid(my_cred, kauth_cred_getruid(my_cred), kauth_cred_getrgid(my_cred));
2346                         if (my_new_cred != my_cred) {
2347                                 p->p_ucred = my_new_cred;
2348                                 /* update cred on proc */
2349                                 PROC_UPDATE_CREDS_ONPROC(p);
2350                         }
2351                 }
2352
2353 #if !SECURE_KERNEL
2354                 /*
2355                  * Disable ASLR for the spawned process.
2356                  *
2357                  * But only do so if we are not embedded + RELEASE.
2358                  * While embedded allows for a boot-arg (-disable_aslr)
2359                  * to deal with this (which itself is only honored on
2360                  * DEVELOPMENT or DEBUG builds of xnu), it is often
2361                  * useful or necessary to disable ASLR on a per-process
2362                  * basis for unit testing and debugging.
2363                  */
2364                 if (px_sa.psa_flags & _POSIX_SPAWN_DISABLE_ASLR)
2365                         OSBitOrAtomic(P_DISABLE_ASLR, &p->p_flag);
2366 #endif /* !SECURE_KERNEL */
2367
2368                 /*
2369                  * Forcibly disallow execution from data pages for the spawned process
2370                  * even if it would otherwise be permitted by the architecture default.
2371                  */
2372                 if (px_sa.psa_flags & _POSIX_SPAWN_ALLOW_DATA_EXEC)
2373                         imgp->ip_flags |= IMGPF_ALLOW_DATA_EXEC;
2374         }
2375
2376         /*
2377          * Disable ASLR during image activation.  This occurs either if the
2378          * _POSIX_SPAWN_DISABLE_ASLR attribute was found above or if
2379          * P_DISABLE_ASLR was inherited from the parent process.
2380          */
2381         if (p->p_flag & P_DISABLE_ASLR)
2382                 imgp->ip_flags |= IMGPF_DISABLE_ASLR;
2383
2384         /*
2385          * Clear transition flag so we won't hang if exec_activate_image() causes
2386          * an automount (and launchd does a proc sysctl to service it).
2387          *
2388          * <rdar://problem/6848672>, <rdar://problem/5959568>.
2389          */
2390         if (spawn_no_exec) {
2391                 proc_transend(p, 0);
2392                 proc_transit_set = 0;
2393         }
2394
2395 #if MAC_SPAWN   /* XXX */
2396         if (uap->mac_p != USER_ADDR_NULL) {
2397                 error = mac_execve_enter(uap->mac_p, imgp);
2398                 if (error)
2399                         goto bad;
2400         }
2401 #endif
2402
2403         /*
2404          * Activate the image
2405          */
2406         error = exec_activate_image(imgp);
2407
2408         if (error == 0) {
2409                 /* process completed the exec */
2410                 exec_done = TRUE;
2411         } else if (error == -1) {
2412                 /* Image not claimed by any activator? */
2413                 error = ENOEXEC;
2414         }
2415
2416         /*
2417          * If we have a spawn attr, and it contains signal related flags,
2418          * the we need to process them in the "context" of the new child
2419          * process, so we have to process it following image activation,
2420          * prior to making the thread runnable in user space.  This is
2421          * necessitated by some signal information being per-thread rather
2422          * than per-process, and we don't have the new allocation in hand
2423          * until after the image is activated.
2424          */
2425         if (!error && imgp->ip_px_sa != NULL) {
2426                 thread_t child_thread = current_thread();
2427                 uthread_t child_uthread = uthread;
2428
2429                 /*
2430                  * If we created a new child thread, then the thread and
2431                  * uthread are different than the current ones; otherwise,
2432                  * we leave them, since we are in the exec case instead.
2433                  */
2434                 if (spawn_no_exec) {
2435                         child_thread = imgp->ip_new_thread;
2436                         child_uthread = get_bsdthread_info(child_thread);
2437                 }
2438
2439                 /*
2440                  * Mask a list of signals, instead of them being unmasked, if
2441                  * they were unmasked in the parent; note that some signals
2442                  * are not maskable.
2443                  */
2444                 if (px_sa.psa_flags & POSIX_SPAWN_SETSIGMASK)
2445                         child_uthread->uu_sigmask = (px_sa.psa_sigmask & ~sigcantmask);
2446                 /*
2447                  * Default a list of signals instead of ignoring them, if
2448                  * they were ignored in the parent.  Note that we pass
2449                  * spawn_no_exec to setsigvec() to indicate that we called
2450                  * fork1() and therefore do not need to call proc_signalstart()
2451                  * internally.
2452                  */
2453                 if (px_sa.psa_flags & POSIX_SPAWN_SETSIGDEF) {
2454                         vec.sa_handler = SIG_DFL;
2455                         vec.sa_tramp = 0;
2456                         vec.sa_mask = 0;
2457                         vec.sa_flags = 0;
2458                         for (sig = 0; sig < NSIG; sig++)
2459                                 if (px_sa.psa_sigdefault & (1 << sig)) {
2460                                         error = setsigvec(p, child_thread, sig + 1, &vec, spawn_no_exec);
2461                         }
2462                 }
2463
2464                 /*
2465                  * Activate the CPU usage monitor, if requested. This is done via a task-wide, per-thread CPU
2466                  * usage limit, which will generate a resource exceeded exception if any one thread exceeds the
2467                  * limit.
2468                  *
2469                  * Userland gives us interval in seconds, and the kernel SPI expects nanoseconds.
2470                  */
2471                 if (px_sa.psa_cpumonitor_percent != 0) {
2472                         /*
2473                          * Always treat a CPU monitor activation coming from spawn as entitled. Requiring
2474                          * an entitlement to configure the monitor a certain way seems silly, since
2475                          * whomever is turning it on could just as easily choose not to do so.
2476                          */
2477                         error = proc_set_task_ruse_cpu(p->task,
2478                                         TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC,
2479                                         px_sa.psa_cpumonitor_percent,
2480                                         px_sa.psa_cpumonitor_interval * NSEC_PER_SEC,
2481                                         0, TRUE);
2482                 }
2483         }
2484
2485 bad:
2486
2487         if (error == 0) {
2488                 /* reset delay idle sleep status if set */
2489                 if ((p->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP)
2490                         OSBitAndAtomic(~((uint32_t)P_DELAYIDLESLEEP), &p->p_flag);
2491                 /* upon  successful spawn, re/set the proc control state */
2492                 if (imgp->ip_px_sa != NULL) {
2493                         switch (px_sa.psa_pcontrol) {
2494                                 case POSIX_SPAWN_PCONTROL_THROTTLE:
2495                                         p->p_pcaction = P_PCTHROTTLE;
2496                                         break;
2497                                 case POSIX_SPAWN_PCONTROL_SUSPEND:
2498                                         p->p_pcaction = P_PCSUSP;
2499                                         break;
2500                                 case POSIX_SPAWN_PCONTROL_KILL:
2501                                         p->p_pcaction = P_PCKILL;
2502                                         break;
2503                                 case POSIX_SPAWN_PCONTROL_NONE:
2504                                 default:
2505                                         p->p_pcaction = 0;
2506                                         break;
2507                         };
2508                 }
2509                 exec_resettextvp(p, imgp);
2510
2511 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2512                 /* Has jetsam attributes? */
2513                 if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_SET)) {
2514                         /*
2515                          * With 2-level high-water-mark support, POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND is no
2516                          * longer relevant, as background limits are described via the inactive limit slots.
2517                          * At the kernel layer, the flag is ignored.
2518                          *
2519                          * That said, however, if the POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND is passed in,
2520                          * we attempt to mimic previous behavior by forcing the BG limit data into the
2521                          * inactive/non-fatal mode and force the active slots to hold system_wide/fatal mode.
2522                          * The kernel layer will flag this mapping.
2523                          */
2524                         if (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND) {
2525                                 memorystatus_update(p, px_sa.psa_priority, 0,
2526                                             (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY),
2527                                             TRUE,
2528                                             -1, TRUE,
2529                                             px_sa.psa_memlimit_inactive, FALSE,
2530                                             (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND));
2531                         } else {
2532                                 memorystatus_update(p, px_sa.psa_priority, 0,
2533                                             (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY),
2534                                             TRUE,
2535                                             px_sa.psa_memlimit_active,
2536                                             (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_MEMLIMIT_ACTIVE_FATAL),
2537                                             px_sa.psa_memlimit_inactive,
2538                                             (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_MEMLIMIT_INACTIVE_FATAL),
2539                                             (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND));
2540                         }
2541
2542                 }
2543 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM*/
2544         }
2545
2546         /*
2547          * If we successfully called fork1(), we always need to do this;
2548          * we identify this case by noting the IMGPF_SPAWN flag.  This is
2549          * because we come back from that call with signals blocked in the
2550          * child, and we have to unblock them, but we want to wait until
2551          * after we've performed any spawn actions.  This has to happen
2552          * before check_for_signature(), which uses psignal.
2553          */
2554         if (spawn_no_exec) {
2555                 if (proc_transit_set)
2556                         proc_transend(p, 0);
2557
2558                 /*
2559                  * Drop the signal lock on the child which was taken on our
2560                  * behalf by forkproc()/cloneproc() to prevent signals being
2561                  * received by the child in a partially constructed state.
2562                  */
2563                 proc_signalend(p, 0);
2564
2565                 /* flag the 'fork' has occurred */
2566                 proc_knote(p->p_pptr, NOTE_FORK | p->p_pid);
2567                 /* then flag exec has occurred */
2568                 /* notify only if it has not failed due to FP Key error */
2569                 if ((p->p_lflag & P_LTERM_DECRYPTFAIL) == 0)
2570                         proc_knote(p, NOTE_EXEC);
2571         } else if (error == 0) {
2572                 /* reset the importance attribute from our previous life */
2573                 task_importance_reset(p->task);
2574
2575                 /* reset atm context from task */
2576                 task_atm_reset(p->task);
2577         }
2578
2579         /*
2580          * Apply the spawnattr policy, apptype (which primes the task for importance donation),
2581          * and bind any portwatch ports to the new task.
2582          * This must be done after the exec so that the child's thread is ready,
2583          * and after the in transit state has been released, because priority is
2584          * dropped here so we need to be prepared for a potentially long preemption interval
2585          *
2586          * TODO: Consider splitting this up into separate phases
2587          */
2588         if (error == 0 && imgp->ip_px_sa != NULL) {
2589                 struct _posix_spawnattr *psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
2590
2591                 exec_handle_spawnattr_policy(p, psa->psa_apptype, psa->psa_qos_clamp, psa->psa_darwin_role,
2592                                               portwatch_ports, portwatch_count);
2593         }
2594
2595         /* Apply the main thread qos */
2596         if (error == 0) {
2597                 thread_t main_thread = (imgp->ip_new_thread != NULL) ? imgp->ip_new_thread : current_thread();
2598
2599                 task_set_main_thread_qos(p->task, main_thread);
2600         }
2601
2602         /*
2603          * Release any ports we kept around for binding to the new task
2604          * We need to release the rights even if the posix_spawn has failed.
2605          */
2606         if (portwatch_ports != NULL) {
2607                 for (int i = 0; i < portwatch_count; i++) {
2608                         ipc_port_t port = NULL;
2609                         if ((port = portwatch_ports[i]) != NULL) {
2610                                 ipc_port_release_send(port);
2611                         }
2612                 }
2613                 FREE(portwatch_ports, M_TEMP);
2614                 portwatch_ports = NULL;
2615                 portwatch_count = 0;
2616         }
2617
2618         /*
2619          * We have to delay operations which might throw a signal until after
2620          * the signals have been unblocked; however, we want that to happen
2621          * after exec_resettextvp() so that the textvp is correct when they
2622          * fire.
2623          */
2624         if (error == 0) {
2625                 error = check_for_signature(p, imgp);
2626
2627                 /*
2628                  * Pay for our earlier safety; deliver the delayed signals from
2629                  * the incomplete spawn process now that it's complete.
2630                  */
2631                 if (imgp != NULL && spawn_no_exec && (p->p_lflag & P_LTRACED)) {
2632                         psignal_vfork(p, p->task, imgp->ip_new_thread, SIGTRAP);
2633                 }
2634         }
2635
2636
2637         if (imgp != NULL) {
2638                 if (imgp->ip_vp)
2639                         vnode_put(imgp->ip_vp);
2640                 if (imgp->ip_scriptvp)
2641                         vnode_put(imgp->ip_scriptvp);
2642                 if (imgp->ip_strings)
2643                         execargs_free(imgp);
2644                 if (imgp->ip_px_sfa != NULL)
2645                         FREE(imgp->ip_px_sfa, M_TEMP);
2646                 if (imgp->ip_px_spa != NULL)
2647                         FREE(imgp->ip_px_spa, M_TEMP);
2648 #if CONFIG_MACF
2649                 if (imgp->ip_px_smpx != NULL)
2650                         spawn_free_macpolicyinfo(imgp->ip_px_smpx);
2651                 if (imgp->ip_execlabelp)
2652                         mac_cred_label_free(imgp->ip_execlabelp);
2653                 if (imgp->ip_scriptlabelp)
2654                         mac_vnode_label_free(imgp->ip_scriptlabelp);
2655 #endif
2656         }
2657
2658 #if CONFIG_DTRACE
2659         if (spawn_no_exec) {
2660                 /*
2661                  * In the original DTrace reference implementation,
2662                  * posix_spawn() was a libc routine that just
2663                  * did vfork(2) then exec(2).  Thus the proc::: probes
2664                  * are very fork/exec oriented.  The details of this
2665                  * in-kernel implementation of posix_spawn() is different
2666                  * (while producing the same process-observable effects)
2667                  * particularly w.r.t. errors, and which thread/process
2668                  * is constructing what on behalf of whom.
2669                  */
2670                 if (error) {
2671                         DTRACE_PROC1(spawn__failure, int, error);
2672                 } else {
2673                         DTRACE_PROC(spawn__success);
2674                         /*
2675                          * Some DTrace scripts, e.g. newproc.d in
2676                          * /usr/bin, rely on the the 'exec-success'
2677                          * probe being fired in the child after the
2678                          * new process image has been constructed
2679                          * in order to determine the associated pid.
2680                          *
2681                          * So, even though the parent built the image
2682                          * here, for compatibility, mark the new thread
2683                          * so 'exec-success' fires on it as it leaves
2684                          * the kernel.
2685                          */
2686                         dtrace_thread_didexec(imgp->ip_new_thread);
2687                 }
2688         } else {
2689                 if (error) {
2690                         DTRACE_PROC1(exec__failure, int, error);
2691                 } else {
2692                         DTRACE_PROC(exec__success);
2693                 }
2694         }
2695
2696         if ((dtrace_proc_waitfor_hook = dtrace_proc_waitfor_exec_ptr) != NULL)
2697                 (*dtrace_proc_waitfor_hook)(p);
2698 #endif
2699
2700         /* Return to both the parent and the child? */
2701         if (imgp != NULL && spawn_no_exec) {
2702                 /*
2703                  * If the parent wants the pid, copy it out
2704                  */
2705                 if (pid != USER_ADDR_NULL)
2706                         (void)suword(pid, p->p_pid);
2707                 retval[0] = error;
2708
2709                 /*
2710                  * If we had an error, perform an internal reap ; this is
2711                  * entirely safe, as we have a real process backing us.
2712                  */
2713                 if (error) {
2714                         proc_list_lock();
2715                         p->p_listflag |= P_LIST_DEADPARENT;
2716                         proc_list_unlock();
2717                         proc_lock(p);
2718                         /* make sure no one else has killed it off... */
2719                         if (p->p_stat != SZOMB && p->exit_thread == NULL) {
2720                                 p->exit_thread = current_thread();
2721                                 proc_unlock(p);
2722                                 exit1(p, 1, (int *)NULL);
2723                                 proc_clear_return_wait(p, imgp->ip_new_thread);
2724                                 if (exec_done == FALSE) {
2725                                         task_deallocate(get_threadtask(imgp->ip_new_thread));
2726                                         thread_deallocate(imgp->ip_new_thread);
2727                                 }
2728                         } else {
2729                                 /* someone is doing it for us; just skip it */
2730                                 proc_unlock(p);
2731                                 proc_clear_return_wait(p, imgp->ip_new_thread);
2732                         }
2733                 } else {
2734
2735                         /*
2736                          * Return to the child
2737                          *
2738                          * Note: the image activator earlier dropped the
2739                          * task/thread references to the newly spawned
2740                          * process; this is OK, since we still have suspended
2741                          * queue references on them, so we should be fine
2742                          * with the delayed resume of the thread here.
2743                          */
2744                         proc_clear_return_wait(p, imgp->ip_new_thread);
2745                 }
2746         }
2747         if (bufp != NULL) {
2748                 FREE(bufp, M_TEMP);
2749         }
2750
2751         return(error);
2752 }
2753
2754
2755 /*
2756  * execve
2757  *
2758  * Parameters:  uap->fname              File name to exec
2759  *              uap->argp               Argument list
2760  *              uap->envp               Environment list
2761  *
2762  * Returns:     0                       Success
2763  *      __mac_execve:EINVAL             Invalid argument
2764  *      __mac_execve:ENOTSUP            Invalid argument
2765  *      __mac_execve:EACCES             Permission denied
2766  *      __mac_execve:EINTR              Interrupted function
2767  *      __mac_execve:ENOMEM             Not enough space
2768  *      __mac_execve:EFAULT             Bad address
2769  *      __mac_execve:ENAMETOOLONG       Filename too long
2770  *      __mac_execve:ENOEXEC            Executable file format error
2771  *      __mac_execve:ETXTBSY            Text file busy [misuse of error code]
2772  *      __mac_execve:???
2773  *
2774  * TODO:        Dynamic linker header address on stack is copied via suword()
2775  */
2776 /* ARGSUSED */
2777 int
2778 execve(proc_t p, struct execve_args *uap, int32_t *retval)
2779 {
2780         struct __mac_execve_args muap;
2781         int err;
2782
2783         memoryshot(VM_EXECVE, DBG_FUNC_NONE);
2784
2785         muap.fname = uap->fname;
2786         muap.argp = uap->argp;
2787         muap.envp = uap->envp;
2788         muap.mac_p = USER_ADDR_NULL;
2789         err = __mac_execve(p, &muap, retval);
2790
2791         return(err);
2792 }
2793
2794 /*
2795  * __mac_execve
2796  *
2797  * Parameters:  uap->fname              File name to exec
2798  *              uap->argp               Argument list
2799  *              uap->envp               Environment list
2800  *              uap->mac_p              MAC label supplied by caller
2801  *
2802  * Returns:     0                       Success
2803  *              EINVAL                  Invalid argument
2804  *              ENOTSUP                 Not supported
2805  *              ENOEXEC                 Executable file format error
2806  *      exec_activate_image:EINVAL      Invalid argument
2807  *      exec_activate_image:EACCES      Permission denied
2808  *      exec_activate_image:EINTR       Interrupted function
2809  *      exec_activate_image:ENOMEM      Not enough space
2810  *      exec_activate_image:EFAULT      Bad address
2811  *      exec_activate_image:ENAMETOOLONG        Filename too long
2812  *      exec_activate_image:ENOEXEC     Executable file format error
2813  *      exec_activate_image:ETXTBSY     Text file busy [misuse of error code]
2814  *      exec_activate_image:EBADEXEC    The executable is corrupt/unknown
2815  *      exec_activate_image:???
2816  *      mac_execve_enter:???
2817  *
2818  * TODO:        Dynamic linker header address on stack is copied via suword()
2819  */
2820 int
2821 __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
2822 {
2823         char *bufp = NULL;
2824         struct image_params *imgp;
2825         struct vnode_attr *vap;
2826         struct vnode_attr *origvap;
2827         int error;
2828         int is_64 = IS_64BIT_PROCESS(p);
2829         struct vfs_context context;
2830         struct uthread  *uthread;
2831
2832         context.vc_thread = current_thread();
2833         context.vc_ucred = kauth_cred_proc_ref(p);      /* XXX must NOT be kauth_cred_get() */
2834
2835         /* Allocate a big chunk for locals instead of using stack since these
2836          * structures a pretty big.
2837          */
2838         MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO);
2839         imgp = (struct image_params *) bufp;
2840         if (bufp == NULL) {
2841                 error = ENOMEM;
2842                 goto exit_with_error;
2843         }
2844         vap = (struct vnode_attr *) (bufp + sizeof(*imgp));
2845         origvap = (struct vnode_attr *) (bufp + sizeof(*imgp) + sizeof(*vap));
2846
2847         /* Initialize the common data in the image_params structure */
2848         imgp->ip_user_fname = uap->fname;
2849         imgp->ip_user_argv = uap->argp;
2850         imgp->ip_user_envv = uap->envp;
2851         imgp->ip_vattr = vap;
2852         imgp->ip_origvattr = origvap;
2853         imgp->ip_vfs_context = &context;
2854         imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE) | ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE);
2855         imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
2856         imgp->ip_mac_return = 0;
2857
2858         uthread = get_bsdthread_info(current_thread());
2859         if (uthread->uu_flag & UT_VFORK) {
2860                 imgp->ip_flags |= IMGPF_VFORK_EXEC;
2861         }
2862
2863 #if CONFIG_MACF
2864         if (uap->mac_p != USER_ADDR_NULL) {
2865                 error = mac_execve_enter(uap->mac_p, imgp);
2866                 if (error) {
2867                         kauth_cred_unref(&context.vc_ucred);
2868                         goto exit_with_error;
2869                 }
2870         }
2871 #endif
2872
2873         error = exec_activate_image(imgp);
2874
2875         kauth_cred_unref(&context.vc_ucred);
2876
2877         /* Image not claimed by any activator? */
2878         if (error == -1)
2879                 error = ENOEXEC;
2880
2881         if (error == 0) {
2882                 exec_resettextvp(p, imgp);
2883                 error = check_for_signature(p, imgp);
2884         }
2885         if (imgp->ip_vp != NULLVP)
2886                 vnode_put(imgp->ip_vp);
2887         if (imgp->ip_scriptvp != NULLVP)
2888                 vnode_put(imgp->ip_scriptvp);
2889         if (imgp->ip_strings)
2890                 execargs_free(imgp);
2891 #if CONFIG_MACF
2892         if (imgp->ip_execlabelp)
2893                 mac_cred_label_free(imgp->ip_execlabelp);
2894         if (imgp->ip_scriptlabelp)
2895                 mac_vnode_label_free(imgp->ip_scriptlabelp);
2896 #endif
2897         if (!error) {
2898                 /* Sever any extant thread affinity */
2899                 thread_affinity_exec(current_thread());
2900
2901                 thread_t main_thread = (imgp->ip_new_thread != NULL) ? imgp->ip_new_thread : current_thread();
2902
2903                 task_set_main_thread_qos(p->task, main_thread);
2904
2905                 /* reset task importance */
2906                 task_importance_reset(p->task);
2907
2908                 /* reset atm context from task */
2909                 task_atm_reset(p->task);
2910
2911                 DTRACE_PROC(exec__success);
2912
2913 #if CONFIG_DTRACE
2914                 if ((dtrace_proc_waitfor_hook = dtrace_proc_waitfor_exec_ptr) != NULL)
2915                         (*dtrace_proc_waitfor_hook)(p);
2916 #endif
2917
2918                 if (imgp->ip_flags & IMGPF_VFORK_EXEC) {
2919                         vfork_return(p, retval, p->p_pid);
2920                         proc_clear_return_wait(p, imgp->ip_new_thread);
2921                 }
2922         } else {
2923                 DTRACE_PROC1(exec__failure, int, error);
2924         }
2925
2926 exit_with_error:
2927         if (bufp != NULL) {
2928                 FREE(bufp, M_TEMP);
2929         }
2930
2931         return(error);
2932 }
2933
2934
2935 /*
2936  * copyinptr
2937  *
2938  * Description: Copy a pointer in from user space to a user_addr_t in kernel
2939  *              space, based on 32/64 bitness of the user space
2940  *
2941  * Parameters:  froma                   User space address
2942  *              toptr                   Address of kernel space user_addr_t
2943  *              ptr_size                4/8, based on 'froma' address space
2944  *
2945  * Returns:     0                       Success
2946  *              EFAULT                  Bad 'froma'
2947  *
2948  * Implicit returns:
2949  *              *ptr_size               Modified
2950  */
2951 static int
2952 copyinptr(user_addr_t froma, user_addr_t *toptr, int ptr_size)
2953 {
2954         int error;
2955
2956         if (ptr_size == 4) {
2957                 /* 64 bit value containing 32 bit address */
2958                 unsigned int i;
2959
2960                 error = copyin(froma, &i, 4);
2961                 *toptr = CAST_USER_ADDR_T(i);   /* SAFE */
2962         } else {
2963                 error = copyin(froma, toptr, 8);
2964         }
2965         return (error);
2966 }
2967
2968
2969 /*
2970  * copyoutptr
2971  *
2972  * Description: Copy a pointer out from a user_addr_t in kernel space to
2973  *              user space, based on 32/64 bitness of the user space
2974  *
2975  * Parameters:  ua                      User space address to copy to
2976  *              ptr                     Address of kernel space user_addr_t
2977  *              ptr_size                4/8, based on 'ua' address space
2978  *
2979  * Returns:     0                       Success
2980  *              EFAULT                  Bad 'ua'
2981  *
2982  */
2983 static int
2984 copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size)
2985 {
2986         int error;
2987
2988         if (ptr_size == 4) {
2989                 /* 64 bit value containing 32 bit address */
2990                 unsigned int i = CAST_DOWN_EXPLICIT(unsigned int,ua);   /* SAFE */
2991
2992                 error = copyout(&i, ptr, 4);
2993         } else {
2994                 error = copyout(&ua, ptr, 8);
2995         }
2996         return (error);
2997 }
2998
2999
3000 /*
3001  * exec_copyout_strings
3002  *
3003  * Copy out the strings segment to user space.  The strings segment is put
3004  * on a preinitialized stack frame.
3005  *
3006  * Parameters:  struct image_params *   the image parameter block
3007  *              int *                   a pointer to the stack offset variable
3008  *
3009  * Returns:     0                       Success
3010  *              !0                      Faiure: errno
3011  *
3012  * Implicit returns:
3013  *              (*stackp)               The stack offset, modified
3014  *
3015  * Note:        The strings segment layout is backward, from the beginning
3016  *              of the top of the stack to consume the minimal amount of
3017  *              space possible; the returned stack pointer points to the
3018  *              end of the area consumed (stacks grow downward).
3019  *
3020  *              argc is an int; arg[i] are pointers; env[i] are pointers;
3021  *              the 0's are (void *)NULL's
3022  *
3023  * The stack frame layout is:
3024  *
3025  *      +-------------+ <- p->user_stack
3026  *      |     16b     |
3027  *      +-------------+
3028  *      | STRING AREA |
3029  *      |      :      |
3030  *      |      :      |
3031  *      |      :      |
3032  *      +- -- -- -- --+
3033  *      |  PATH AREA  |
3034  *      +-------------+
3035  *      |      0      |
3036  *      +-------------+
3037  *      |  applev[n]  |
3038  *      +-------------+
3039  *             :
3040  *             :
3041  *      +-------------+
3042  *      |  applev[1]  |
3043  *      +-------------+
3044  *      | exec_path / |
3045  *      |  applev[0]  |
3046  *      +-------------+
3047  *      |      0      |
3048  *      +-------------+
3049  *      |    env[n]   |
3050  *      +-------------+
3051  *             :
3052  *             :
3053  *      +-------------+
3054  *      |    env[0]   |
3055  *      +-------------+
3056  *      |      0      |
3057  *      +-------------+
3058  *      | arg[argc-1] |
3059  *      +-------------+
3060  *             :
3061  *             :
3062  *      +-------------+
3063  *      |    arg[0]   |
3064  *      +-------------+
3065  *      |     argc    |
3066  * sp-> +-------------+
3067  *
3068  * Although technically a part of the STRING AREA, we treat the PATH AREA as
3069  * a separate entity.  This allows us to align the beginning of the PATH AREA
3070  * to a pointer boundary so that the exec_path, env[i], and argv[i] pointers
3071  * which preceed it on the stack are properly aligned.
3072  */
3073
3074 static int
3075 exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp)
3076 {
3077         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
3078         int     ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
3079         int     ptr_area_size;
3080         void *ptr_buffer_start, *ptr_buffer;
3081         int string_size;
3082
3083         user_addr_t     string_area;    /* *argv[], *env[] */
3084         user_addr_t     ptr_area;       /* argv[], env[], applev[] */
3085         user_addr_t argc_area;  /* argc */
3086         user_addr_t     stack;
3087         int error;
3088
3089         unsigned i;
3090         struct copyout_desc {
3091                 char    *start_string;
3092                 int             count;
3093 #if CONFIG_DTRACE
3094                 user_addr_t     *dtrace_cookie;
3095 #endif
3096                 boolean_t       null_term;
3097         } descriptors[] = {
3098                 {
3099                         .start_string = imgp->ip_startargv,
3100                         .count = imgp->ip_argc,
3101 #if CONFIG_DTRACE
3102                         .dtrace_cookie = &p->p_dtrace_argv,
3103 #endif
3104                         .null_term = TRUE
3105                 },
3106                 {
3107                         .start_string = imgp->ip_endargv,
3108                         .count = imgp->ip_envc,
3109 #if CONFIG_DTRACE
3110                         .dtrace_cookie = &p->p_dtrace_envp,
3111 #endif
3112                         .null_term = TRUE
3113                 },
3114                 {
3115                         .start_string = imgp->ip_strings,
3116                         .count = 1,
3117 #if CONFIG_DTRACE
3118                         .dtrace_cookie = NULL,
3119 #endif
3120                         .null_term = FALSE
3121                 },
3122                 {
3123                         .start_string = imgp->ip_endenvv,
3124                         .count = imgp->ip_applec - 1, /* exec_path handled above */
3125 #if CONFIG_DTRACE
3126                         .dtrace_cookie = NULL,
3127 #endif
3128                         .null_term = TRUE
3129                 }
3130         };
3131
3132         stack = *stackp;
3133
3134         /*
3135          * All previous contributors to the string area
3136          * should have aligned their sub-area
3137          */
3138         if (imgp->ip_strspace % ptr_size != 0) {
3139                 error = EINVAL;
3140                 goto bad;
3141         }
3142
3143         /* Grow the stack down for the strings we've been building up */
3144         string_size = imgp->ip_strendp - imgp->ip_strings;
3145         stack -= string_size;
3146         string_area = stack;
3147
3148         /*
3149          * Need room for one pointer for each string, plus
3150          * one for the NULLs terminating the argv, envv, and apple areas.
3151          */
3152         ptr_area_size = (imgp->ip_argc + imgp->ip_envc + imgp->ip_applec + 3) *
3153             ptr_size;
3154         stack -= ptr_area_size;
3155         ptr_area = stack;
3156
3157         /* We'll construct all the pointer arrays in our string buffer,
3158          * which we already know is aligned properly, and ip_argspace
3159          * was used to verify we have enough space.
3160          */
3161         ptr_buffer_start = ptr_buffer = (void *)imgp->ip_strendp;
3162
3163         /*
3164          * Need room for pointer-aligned argc slot.
3165          */
3166         stack -= ptr_size;
3167         argc_area = stack;
3168
3169         /*
3170          * Record the size of the arguments area so that sysctl_procargs()
3171          * can return the argument area without having to parse the arguments.
3172          */
3173         proc_lock(p);
3174         p->p_argc = imgp->ip_argc;
3175         p->p_argslen = (int)(*stackp - string_area);
3176         proc_unlock(p);
3177
3178         /* Return the initial stack address: the location of argc */
3179         *stackp = stack;
3180
3181         /*
3182          * Copy out the entire strings area.
3183          */
3184         error = copyout(imgp->ip_strings, string_area,
3185                                                    string_size);
3186         if (error)
3187                 goto bad;
3188
3189         for (i = 0; i < sizeof(descriptors)/sizeof(descriptors[0]); i++) {
3190                 char *cur_string = descriptors[i].start_string;
3191                 int j;
3192
3193 #if CONFIG_DTRACE
3194                 if (descriptors[i].dtrace_cookie) {
3195                         proc_lock(p);
3196                         *descriptors[i].dtrace_cookie = ptr_area + ((uintptr_t)ptr_buffer - (uintptr_t)ptr_buffer_start); /* dtrace convenience */
3197                         proc_unlock(p);
3198                 }
3199 #endif /* CONFIG_DTRACE */
3200
3201                 /*
3202                  * For each segment (argv, envv, applev), copy as many pointers as requested
3203                  * to our pointer buffer.
3204                  */
3205                 for (j = 0; j < descriptors[i].count; j++) {
3206                         user_addr_t cur_address = string_area + (cur_string - imgp->ip_strings);
3207
3208                         /* Copy out the pointer to the current string. Alignment has been verified  */
3209                         if (ptr_size == 8) {
3210                                 *(uint64_t *)ptr_buffer = (uint64_t)cur_address;
3211                         } else {
3212                                 *(uint32_t *)ptr_buffer = (uint32_t)cur_address;
3213                         }
3214
3215                         ptr_buffer = (void *)((uintptr_t)ptr_buffer + ptr_size);
3216                         cur_string += strlen(cur_string) + 1; /* Only a NUL between strings in the same area */
3217                 }
3218
3219                 if (descriptors[i].null_term) {
3220                         if (ptr_size == 8) {
3221                                 *(uint64_t *)ptr_buffer = 0ULL;
3222                         } else {
3223                                 *(uint32_t *)ptr_buffer = 0;
3224                         }
3225
3226                         ptr_buffer = (void *)((uintptr_t)ptr_buffer + ptr_size);
3227                 }
3228         }
3229
3230         /*
3231          * Copy out all our pointer arrays in bulk.
3232          */
3233         error = copyout(ptr_buffer_start, ptr_area,
3234                                         ptr_area_size);
3235         if (error)
3236                 goto bad;
3237
3238         /* argc (int32, stored in a ptr_size area) */
3239         error = copyoutptr((user_addr_t)imgp->ip_argc, argc_area, ptr_size);
3240         if (error)
3241                 goto bad;
3242
3243 bad:
3244         return(error);
3245 }
3246
3247
3248 /*
3249  * exec_extract_strings
3250  *
3251  * Copy arguments and environment from user space into work area; we may
3252  * have already copied some early arguments into the work area, and if
3253  * so, any arguments opied in are appended to those already there.
3254  * This function is the primary manipulator of ip_argspace, since
3255  * these are the arguments the client of execve(2) knows about. After
3256  * each argv[]/envv[] string is copied, we charge the string length
3257  * and argv[]/envv[] pointer slot to ip_argspace, so that we can
3258  * full preflight the arg list size.
3259  *
3260  * Parameters:  struct image_params *   the image parameter block
3261  *
3262  * Returns:     0                       Success
3263  *              !0                      Failure: errno
3264  *
3265  * Implicit returns;
3266  *              (imgp->ip_argc)         Count of arguments, updated
3267  *              (imgp->ip_envc)         Count of environment strings, updated
3268  *              (imgp->ip_argspace)     Count of remaining of NCARGS
3269  *              (imgp->ip_interp_buffer)        Interpreter and args (mutated in place)
3270  *
3271  *
3272  * Note:        The argument and environment vectors are user space pointers
3273  *              to arrays of user space pointers.
3274  */
3275 static int
3276 exec_extract_strings(struct image_params *imgp)
3277 {
3278         int error = 0;
3279         int     ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT) ? 8 : 4;
3280         int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
3281         user_addr_t     argv = imgp->ip_user_argv;
3282         user_addr_t     envv = imgp->ip_user_envv;
3283
3284         /*
3285          * Adjust space reserved for the path name by however much padding it
3286          * needs. Doing this here since we didn't know if this would be a 32-
3287          * or 64-bit process back in exec_save_path.
3288          */
3289         while (imgp->ip_strspace % new_ptr_size != 0) {
3290                 *imgp->ip_strendp++ = '\0';
3291                 imgp->ip_strspace--;
3292                 /* imgp->ip_argspace--; not counted towards exec args total */
3293         }
3294
3295         /*
3296          * From now on, we start attributing string space to ip_argspace
3297          */
3298         imgp->ip_startargv = imgp->ip_strendp;
3299         imgp->ip_argc = 0;
3300
3301         if((imgp->ip_flags & IMGPF_INTERPRET) != 0) {
3302                 user_addr_t     arg;
3303                 char *argstart, *ch;
3304
3305                 /* First, the arguments in the "#!" string are tokenized and extracted. */
3306                 argstart = imgp->ip_interp_buffer;
3307                 while (argstart) {
3308                         ch = argstart;
3309                         while (*ch && !IS_WHITESPACE(*ch)) {
3310                                 ch++;
3311                         }
3312
3313                         if (*ch == '\0') {
3314                                 /* last argument, no need to NUL-terminate */
3315                                 error = exec_add_user_string(imgp, CAST_USER_ADDR_T(argstart), UIO_SYSSPACE, TRUE);
3316                                 argstart = NULL;
3317                         } else {
3318                                 /* NUL-terminate */
3319                                 *ch = '\0';
3320                                 error = exec_add_user_string(imgp, CAST_USER_ADDR_T(argstart), UIO_SYSSPACE, TRUE);
3321
3322                                 /*
3323                                  * Find the next string. We know spaces at the end of the string have already
3324                                  * been stripped.
3325                                  */
3326                                 argstart = ch + 1;
3327                                 while (IS_WHITESPACE(*argstart)) {
3328                                         argstart++;
3329                                 }
3330                         }
3331
3332                         /* Error-check, regardless of whether this is the last interpreter arg or not */
3333                         if (error)
3334                                 goto bad;
3335                         if (imgp->ip_argspace < new_ptr_size) {
3336                                 error = E2BIG;
3337                                 goto bad;
3338                         }
3339                         imgp->ip_argspace -= new_ptr_size; /* to hold argv[] entry */
3340                         imgp->ip_argc++;
3341                 }
3342
3343                 if (argv != 0LL) {
3344                         /*
3345                          * If we are running an interpreter, replace the av[0] that was
3346                          * passed to execve() with the path name that was
3347                          * passed to execve() for interpreters which do not use the PATH
3348                          * to locate their script arguments.
3349                          */
3350                         error = copyinptr(argv, &arg, ptr_size);
3351                         if (error)
3352                                 goto bad;
3353                         if (arg != 0LL) {
3354                                 argv += ptr_size; /* consume without using */
3355                         }
3356                 }
3357
3358                 if (imgp->ip_interp_sugid_fd != -1) {
3359                         char temp[19]; /* "/dev/fd/" + 10 digits + NUL */
3360                         snprintf(temp, sizeof(temp), "/dev/fd/%d", imgp->ip_interp_sugid_fd);
3361                         error = exec_add_user_string(imgp, CAST_USER_ADDR_T(temp), UIO_SYSSPACE, TRUE);
3362                 } else {
3363                         error = exec_add_user_string(imgp, imgp->ip_user_fname, imgp->ip_seg, TRUE);
3364                 }
3365
3366                 if (error)
3367                         goto bad;
3368                 if (imgp->ip_argspace < new_ptr_size) {
3369                         error = E2BIG;
3370                         goto bad;
3371                 }
3372                 imgp->ip_argspace -= new_ptr_size; /* to hold argv[] entry */
3373                 imgp->ip_argc++;
3374         }
3375
3376         while (argv != 0LL) {
3377                 user_addr_t     arg;
3378
3379                 error = copyinptr(argv, &arg, ptr_size);
3380                 if (error)
3381                         goto bad;
3382
3383                 if (arg == 0LL) {
3384                         break;
3385                 }
3386
3387                 argv += ptr_size;
3388
3389                 /*
3390                 * av[n...] = arg[n]
3391                 */
3392                 error = exec_add_user_string(imgp, arg, imgp->ip_seg, TRUE);
3393                 if (error)
3394                         goto bad;
3395                 if (imgp->ip_argspace < new_ptr_size) {
3396                         error = E2BIG;
3397                         goto bad;
3398                 }
3399                 imgp->ip_argspace -= new_ptr_size; /* to hold argv[] entry */
3400                 imgp->ip_argc++;
3401         }
3402
3403         /* Save space for argv[] NULL terminator */
3404         if (imgp->ip_argspace < new_ptr_size) {
3405                 error = E2BIG;
3406                 goto bad;
3407         }
3408         imgp->ip_argspace -= new_ptr_size;
3409
3410         /* Note where the args ends and env begins. */
3411         imgp->ip_endargv = imgp->ip_strendp;
3412         imgp->ip_envc = 0;
3413
3414         /* Now, get the environment */
3415         while (envv != 0LL) {
3416                 user_addr_t     env;
3417
3418                 error = copyinptr(envv, &env, ptr_size);
3419                 if (error)
3420                         goto bad;
3421
3422                 envv += ptr_size;
3423                 if (env == 0LL) {
3424                         break;
3425                 }
3426                 /*
3427                 * av[n...] = env[n]
3428                 */
3429                 error = exec_add_user_string(imgp, env, imgp->ip_seg, TRUE);
3430                 if (error)
3431                         goto bad;
3432                 if (imgp->ip_argspace < new_ptr_size) {
3433                         error = E2BIG;
3434                         goto bad;
3435                 }
3436                 imgp->ip_argspace -= new_ptr_size; /* to hold envv[] entry */
3437                 imgp->ip_envc++;
3438         }
3439
3440         /* Save space for envv[] NULL terminator */
3441         if (imgp->ip_argspace < new_ptr_size) {
3442                 error = E2BIG;
3443                 goto bad;
3444         }
3445         imgp->ip_argspace -= new_ptr_size;
3446
3447         /* Align the tail of the combined argv+envv area */
3448         while (imgp->ip_strspace % new_ptr_size != 0) {
3449                 if (imgp->ip_argspace < 1) {
3450                         error = E2BIG;
3451                         goto bad;
3452                 }
3453                 *imgp->ip_strendp++ = '\0';
3454                 imgp->ip_strspace--;
3455                 imgp->ip_argspace--;
3456         }
3457
3458         /* Note where the envv ends and applev begins. */
3459         imgp->ip_endenvv = imgp->ip_strendp;
3460
3461         /*
3462          * From now on, we are no longer charging argument
3463          * space to ip_argspace.
3464          */
3465
3466 bad:
3467         return error;
3468 }
3469
3470 static char *
3471 random_hex_str(char *str, int len, boolean_t embedNUL)
3472 {
3473         uint64_t low, high, value;
3474         int idx;
3475         char digit;
3476
3477         /* A 64-bit value will only take 16 characters, plus '0x' and NULL. */
3478         if (len > 19)
3479                 len = 19;
3480
3481         /* We need enough room for at least 1 digit */
3482         if (len < 4)
3483                 return (NULL);
3484
3485         low = random();
3486         high = random();
3487         value = high << 32 | low;
3488
3489         if (embedNUL) {
3490                 /*
3491                  * Zero a byte to protect against C string vulnerabilities
3492                  * e.g. for userland __stack_chk_guard.
3493                  */
3494                 value &= ~(0xffull << 8);
3495         }
3496
3497         str[0] = '0';
3498         str[1] = 'x';
3499         for (idx = 2; idx < len - 1; idx++) {
3500                 digit = value & 0xf;
3501                 value = value >> 4;
3502                 if (digit < 10)
3503                         str[idx] = '0' + digit;
3504                 else
3505                         str[idx] = 'a' + (digit - 10);
3506         }
3507         str[idx] = '\0';
3508         return (str);
3509 }
3510
3511 /*
3512  * Libc has an 8-element array set up for stack guard values.  It only fills
3513  * in one of those entries, and both gcc and llvm seem to use only a single
3514  * 8-byte guard.  Until somebody needs more than an 8-byte guard value, don't
3515  * do the work to construct them.
3516  */
3517 #define GUARD_VALUES 1
3518 #define GUARD_KEY "stack_guard="
3519
3520 /*
3521  * System malloc needs some entropy when it is initialized.
3522  */
3523 #define ENTROPY_VALUES 2
3524 #define ENTROPY_KEY "malloc_entropy="
3525
3526 /*
3527  * System malloc engages nanozone for UIAPP.
3528  */
3529 #define NANO_ENGAGE_KEY "MallocNanoZone=1"
3530
3531 #define PFZ_KEY "pfz="
3532 extern user32_addr_t commpage_text32_location;
3533 extern user64_addr_t commpage_text64_location;
3534 /*
3535  * Build up the contents of the apple[] string vector
3536  */
3537 static int
3538 exec_add_apple_strings(struct image_params *imgp)
3539 {
3540         int i, error;
3541         int new_ptr_size=4;
3542         char guard[19];
3543         char guard_vec[strlen(GUARD_KEY) + 19 * GUARD_VALUES + 1];
3544
3545         char entropy[19];
3546         char entropy_vec[strlen(ENTROPY_KEY) + 19 * ENTROPY_VALUES + 1];
3547
3548         char pfz_string[strlen(PFZ_KEY) + 16 + 4 +1];
3549
3550         if( imgp->ip_flags & IMGPF_IS_64BIT) {
3551                 new_ptr_size = 8;
3552                 snprintf(pfz_string, sizeof(pfz_string),PFZ_KEY "0x%llx",commpage_text64_location);
3553         } else {
3554                 snprintf(pfz_string, sizeof(pfz_string),PFZ_KEY "0x%x",commpage_text32_location);
3555         }
3556
3557         /* exec_save_path stored the first string */
3558         imgp->ip_applec = 1;
3559
3560         /* adding the pfz string */
3561         error = exec_add_user_string(imgp, CAST_USER_ADDR_T(pfz_string),UIO_SYSSPACE,FALSE);
3562         if(error)
3563                 goto bad;
3564         imgp->ip_applec++;
3565
3566         /* adding the NANO_ENGAGE_KEY key */
3567         if (imgp->ip_px_sa) {
3568                 int proc_flags = (((struct _posix_spawnattr *) imgp->ip_px_sa)->psa_flags);
3569
3570                 if ((proc_flags & _POSIX_SPAWN_NANO_ALLOCATOR) == _POSIX_SPAWN_NANO_ALLOCATOR) {
3571                         char uiapp_string[strlen(NANO_ENGAGE_KEY) + 1];
3572
3573                         snprintf(uiapp_string, sizeof(uiapp_string), NANO_ENGAGE_KEY);
3574                         error = exec_add_user_string(imgp, CAST_USER_ADDR_T(uiapp_string),UIO_SYSSPACE,FALSE);
3575                         if (error)
3576                                 goto bad;
3577                         imgp->ip_applec++;
3578                 }
3579         }
3580
3581         /*
3582          * Supply libc with a collection of random values to use when
3583          * implementing -fstack-protector.
3584          *
3585          * (The first random string always contains an embedded NUL so that
3586          * __stack_chk_guard also protects against C string vulnerabilities)
3587          */
3588         (void)strlcpy(guard_vec, GUARD_KEY, sizeof (guard_vec));
3589         for (i = 0; i < GUARD_VALUES; i++) {
3590                 random_hex_str(guard, sizeof (guard), i == 0);
3591                 if (i)
3592                         (void)strlcat(guard_vec, ",", sizeof (guard_vec));
3593                 (void)strlcat(guard_vec, guard, sizeof (guard_vec));
3594         }
3595
3596         error = exec_add_user_string(imgp, CAST_USER_ADDR_T(guard_vec), UIO_SYSSPACE, FALSE);
3597         if (error)
3598                 goto bad;
3599         imgp->ip_applec++;
3600
3601         /*
3602          * Supply libc with entropy for system malloc.
3603          */
3604         (void)strlcpy(entropy_vec, ENTROPY_KEY, sizeof(entropy_vec));
3605         for (i = 0; i < ENTROPY_VALUES; i++) {
3606                 random_hex_str(entropy, sizeof (entropy), FALSE);
3607                 if (i)
3608                         (void)strlcat(entropy_vec, ",", sizeof (entropy_vec));
3609                 (void)strlcat(entropy_vec, entropy, sizeof (entropy_vec));
3610         }
3611
3612         error = exec_add_user_string(imgp, CAST_USER_ADDR_T(entropy_vec), UIO_SYSSPACE, FALSE);
3613         if (error)
3614                 goto bad;
3615         imgp->ip_applec++;
3616
3617         /* Align the tail of the combined applev area */
3618         while (imgp->ip_strspace % new_ptr_size != 0) {
3619                 *imgp->ip_strendp++ = '\0';
3620                 imgp->ip_strspace--;
3621         }
3622
3623 bad:
3624         return error;
3625 }
3626
3627 #define unix_stack_size(p)      (p->p_rlimit[RLIMIT_STACK].rlim_cur)
3628
3629 /*
3630  * exec_check_permissions
3631  *
3632  * Description: Verify that the file that is being attempted to be executed
3633  *              is in fact allowed to be executed based on it POSIX file
3634  *              permissions and other access control criteria
3635  *
3636  * Parameters:  struct image_params *   the image parameter block
3637  *
3638  * Returns:     0                       Success
3639  *              EACCES                  Permission denied
3640  *              ENOEXEC                 Executable file format error
3641  *              ETXTBSY                 Text file busy [misuse of error code]
3642  *      vnode_getattr:???
3643  *      vnode_authorize:???
3644  */
3645 static int
3646 exec_check_permissions(struct image_params *imgp)
3647 {
3648         struct vnode *vp = imgp->ip_vp;
3649         struct vnode_attr *vap = imgp->ip_vattr;
3650         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
3651         int error;
3652         kauth_action_t action;
3653
3654         /* Only allow execution of regular files */
3655         if (!vnode_isreg(vp))
3656                 return (EACCES);
3657
3658         /* Get the file attributes that we will be using here and elsewhere */
3659         VATTR_INIT(vap);
3660         VATTR_WANTED(vap, va_uid);
3661         VATTR_WANTED(vap, va_gid);
3662         VATTR_WANTED(vap, va_mode);
3663         VATTR_WANTED(vap, va_fsid);
3664         VATTR_WANTED(vap, va_fileid);
3665         VATTR_WANTED(vap, va_data_size);
3666         if ((error = vnode_getattr(vp, vap, imgp->ip_vfs_context)) != 0)
3667                 return (error);
3668
3669         /*
3670          * Ensure that at least one execute bit is on - otherwise root
3671          * will always succeed, and we don't want to happen unless the
3672          * file really is executable.
3673          */
3674         if (!vfs_authopaque(vnode_mount(vp)) && ((vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0))
3675                 return (EACCES);
3676
3677         /* Disallow zero length files */
3678         if (vap->va_data_size == 0)
3679                 return (ENOEXEC);
3680
3681         imgp->ip_arch_offset = (user_size_t)0;
3682         imgp->ip_arch_size = vap->va_data_size;
3683
3684         /* Disable setuid-ness for traced programs or if MNT_NOSUID */
3685         if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_lflag & P_LTRACED))
3686                 vap->va_mode &= ~(VSUID | VSGID);
3687
3688         /*
3689          * Disable _POSIX_SPAWN_ALLOW_DATA_EXEC and _POSIX_SPAWN_DISABLE_ASLR
3690          * flags for setuid/setgid binaries.
3691          */
3692         if (vap->va_mode & (VSUID | VSGID))
3693                 imgp->ip_flags &= ~(IMGPF_ALLOW_DATA_EXEC | IMGPF_DISABLE_ASLR);
3694
3695 #if CONFIG_MACF
3696         error = mac_vnode_check_exec(imgp->ip_vfs_context, vp, imgp);
3697         if (error)
3698                 return (error);
3699 #endif
3700
3701         /* Check for execute permission */
3702         action = KAUTH_VNODE_EXECUTE;
3703         /* Traced images must also be readable */
3704         if (p->p_lflag & P_LTRACED)
3705                 action |= KAUTH_VNODE_READ_DATA;
3706         if ((error = vnode_authorize(vp, NULL, action, imgp->ip_vfs_context)) != 0)
3707                 return (error);
3708
3709 #if 0
3710         /* Don't let it run if anyone had it open for writing */
3711         vnode_lock(vp);
3712         if (vp->v_writecount) {
3713                 panic("going to return ETXTBSY %x", vp);
3714                 vnode_unlock(vp);
3715                 return (ETXTBSY);
3716         }
3717         vnode_unlock(vp);
3718 #endif
3719
3720
3721         /* XXX May want to indicate to underlying FS that vnode is open */
3722
3723         return (error);
3724 }
3725
3726
3727 /*
3728  * exec_handle_sugid
3729  *
3730  * Initially clear the P_SUGID in the process flags; if an SUGID process is
3731  * exec'ing a non-SUGID image, then  this is the point of no return.
3732  *
3733  * If the image being activated is SUGID, then replace the credential with a
3734  * copy, disable tracing (unless the tracing process is root), reset the
3735  * mach task port to revoke it, set the P_SUGID bit,
3736  *
3737  * If the saved user and group ID will be changing, then make sure it happens
3738  * to a new credential, rather than a shared one.
3739  *
3740  * Set the security token (this is probably obsolete, given that the token
3741  * should not technically be separate from the credential itself).
3742  *
3743  * Parameters:  struct image_params *   the image parameter block
3744  *
3745  * Returns:     void                    No failure indication
3746  *
3747  * Implicit returns:
3748  *              <process credential>    Potentially modified/replaced
3749  *              <task port>             Potentially revoked
3750  *              <process flags>         P_SUGID bit potentially modified
3751  *              <security token>        Potentially modified
3752  */
3753 static int
3754 exec_handle_sugid(struct image_params *imgp)
3755 {
3756         kauth_cred_t            cred = vfs_context_ucred(imgp->ip_vfs_context);
3757         proc_t                  p = vfs_context_proc(imgp->ip_vfs_context);
3758         int                     i;
3759         int                     leave_sugid_clear = 0;
3760         int                     mac_reset_ipc = 0;
3761         int                     error = 0;
3762 #if CONFIG_MACF
3763         int                     mac_transition, disjoint_cred = 0;
3764         int             label_update_return = 0;
3765
3766         /*
3767          * Determine whether a call to update the MAC label will result in the
3768          * credential changing.
3769          *
3770          * Note:        MAC policies which do not actually end up modifying
3771          *              the label subsequently are strongly encouraged to
3772          *              return 0 for this check, since a non-zero answer will
3773          *              slow down the exec fast path for normal binaries.
3774          */
3775         mac_transition = mac_cred_check_label_update_execve(
3776                                                         imgp->ip_vfs_context,
3777                                                         imgp->ip_vp,
3778                                                         imgp->ip_arch_offset,
3779                                                         imgp->ip_scriptvp,
3780                                                         imgp->ip_scriptlabelp,
3781                                                         imgp->ip_execlabelp,
3782                                                         p,
3783                                                         imgp->ip_px_smpx);
3784 #endif
3785
3786         OSBitAndAtomic(~((uint32_t)P_SUGID), &p->p_flag);
3787
3788         /*
3789          * Order of the following is important; group checks must go last,
3790          * as we use the success of the 'ismember' check combined with the
3791          * failure of the explicit match to indicate that we will be setting
3792          * the egid of the process even though the new process did not
3793          * require VSUID/VSGID bits in order for it to set the new group as
3794          * its egid.
3795          *
3796          * Note:        Technically, by this we are implying a call to
3797          *              setegid() in the new process, rather than implying
3798          *              it used its VSGID bit to set the effective group,
3799          *              even though there is no code in that process to make
3800          *              such a call.
3801          */
3802         if (((imgp->ip_origvattr->va_mode & VSUID) != 0 &&
3803              kauth_cred_getuid(cred) != imgp->ip_origvattr->va_uid) ||
3804             ((imgp->ip_origvattr->va_mode & VSGID) != 0 &&
3805                  ((kauth_cred_ismember_gid(cred, imgp->ip_origvattr->va_gid, &leave_sugid_clear) || !leave_sugid_clear) ||
3806                  (kauth_cred_getgid(cred) != imgp->ip_origvattr->va_gid)))) {
3807
3808 #if CONFIG_MACF
3809 /* label for MAC transition and neither VSUID nor VSGID */
3810 handle_mac_transition:
3811 #endif
3812
3813                 /*
3814                  * Replace the credential with a copy of itself if euid or
3815                  * egid change.
3816                  *
3817                  * Note:        setuid binaries will automatically opt out of
3818                  *              group resolver participation as a side effect
3819                  *              of this operation.  This is an intentional
3820                  *              part of the security model, which requires a
3821                  *              participating credential be established by
3822                  *              escalating privilege, setting up all other
3823                  *              aspects of the credential including whether
3824                  *              or not to participate in external group
3825                  *              membership resolution, then dropping their
3826                  *              effective privilege to that of the desired
3827                  *              final credential state.
3828                  */
3829                 if (imgp->ip_origvattr->va_mode & VSUID) {
3830                         p->p_ucred  = kauth_cred_setresuid(p->p_ucred, KAUTH_UID_NONE, imgp->ip_origvattr->va_uid, imgp->ip_origvattr->va_uid, KAUTH_UID_NONE);
3831                         /* update cred on proc */
3832                         PROC_UPDATE_CREDS_ONPROC(p);
3833                 }
3834                 if (imgp->ip_origvattr->va_mode & VSGID) {
3835                         p->p_ucred = kauth_cred_setresgid(p->p_ucred, KAUTH_GID_NONE, imgp->ip_origvattr->va_gid, imgp->ip_origvattr->va_gid);
3836                         /* update cred on proc */
3837                         PROC_UPDATE_CREDS_ONPROC(p);
3838                 }
3839
3840 #if CONFIG_MACF
3841                 /*
3842                  * If a policy has indicated that it will transition the label,
3843                  * before making the call into the MAC policies, get a new
3844                  * duplicate credential, so they can modify it without
3845                  * modifying any others sharing it.
3846                  */
3847                 if (mac_transition) {
3848                         kauth_proc_label_update_execve(p,
3849                                                 imgp->ip_vfs_context,
3850                                                 imgp->ip_vp,
3851                                                 imgp->ip_arch_offset,
3852                                                 imgp->ip_scriptvp,
3853                                                 imgp->ip_scriptlabelp,
3854                                                 imgp->ip_execlabelp,
3855                                                 &imgp->ip_csflags,
3856                                                 imgp->ip_px_smpx,
3857                                                 &disjoint_cred, /* will be non zero if disjoint */
3858                                                 &label_update_return);
3859
3860                         if (disjoint_cred) {
3861                                 /*
3862                                  * If updating the MAC label resulted in a
3863                                  * disjoint credential, flag that we need to
3864                                  * set the P_SUGID bit.  This protects
3865                                  * against debuggers being attached by an
3866                                  * insufficiently privileged process onto the
3867                                  * result of a transition to a more privileged
3868                                  * credential.
3869                                  */
3870                                 leave_sugid_clear = 0;
3871                         }
3872
3873                         imgp->ip_mac_return = label_update_return;
3874                 }
3875
3876                 mac_reset_ipc = mac_proc_check_inherit_ipc_ports(p, p->p_textvp, p->p_textoff, imgp->ip_vp, imgp->ip_arch_offset, imgp->ip_scriptvp);
3877
3878 #endif  /* CONFIG_MACF */
3879
3880                 /*
3881                  * If 'leave_sugid_clear' is non-zero, then we passed the
3882                  * VSUID and MACF checks, and successfully determined that
3883                  * the previous cred was a member of the VSGID group, but
3884                  * that it was not the default at the time of the execve,
3885                  * and that the post-labelling credential was not disjoint.
3886                  * So we don't set the P_SUGID or reset mach ports and fds
3887                  * on the basis of simply running this code.
3888                  */
3889                 if (mac_reset_ipc || !leave_sugid_clear) {
3890                         /*
3891                          * Have mach reset the task and thread ports.
3892                          * We don't want anyone who had the ports before
3893                          * a setuid exec to be able to access/control the
3894                          * task/thread after.
3895                          */
3896                         ipc_task_reset(p->task);
3897                         ipc_thread_reset((imgp->ip_new_thread != NULL) ?
3898                                          imgp->ip_new_thread : current_thread());
3899                 }
3900
3901                 if (!leave_sugid_clear) {
3902                         /*
3903                          * Flag the process as setuid.
3904                          */
3905                         OSBitOrAtomic(P_SUGID, &p->p_flag);
3906
3907                         /*
3908                          * Radar 2261856; setuid security hole fix
3909                          * XXX For setuid processes, attempt to ensure that
3910                          * stdin, stdout, and stderr are already allocated.
3911                          * We do not want userland to accidentally allocate
3912                          * descriptors in this range which has implied meaning
3913                          * to libc.
3914                          */
3915                         for (i = 0; i < 3; i++) {
3916
3917                                 if (p->p_fd->fd_ofiles[i] != NULL)
3918                                         continue;
3919
3920                                 /*
3921                                  * Do the kernel equivalent of
3922                                  *
3923                                  *      if i == 0
3924                                  *              (void) open("/dev/null", O_RDONLY);
3925                                  *      else
3926                                  *              (void) open("/dev/null", O_WRONLY);
3927                                  */
3928
3929                                 struct fileproc *fp;
3930                                 int indx;
3931                                 int flag;
3932                                 struct nameidata *ndp = NULL;
3933
3934                                 if (i == 0)
3935                                         flag = FREAD;
3936                                 else
3937                                         flag = FWRITE;
3938
3939                                 if ((error = falloc(p,
3940                                     &fp, &indx, imgp->ip_vfs_context)) != 0)
3941                                         continue;
3942
3943                                 MALLOC(ndp, struct nameidata *, sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO);
3944                                 if (ndp == NULL) {
3945                                         error = ENOMEM;
3946                                         break;
3947                                 }
3948
3949                                 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE,
3950                                     CAST_USER_ADDR_T("/dev/null"),
3951                                     imgp->ip_vfs_context);
3952
3953                                 if ((error = vn_open(ndp, flag, 0)) != 0) {
3954                                         fp_free(p, indx, fp);
3955                                         break;
3956                                 }
3957
3958                                 struct fileglob *fg = fp->f_fglob;
3959
3960                                 fg->fg_flag = flag;
3961                                 fg->fg_ops = &vnops;
3962                                 fg->fg_data = ndp->ni_vp;
3963
3964                                 vnode_put(ndp->ni_vp);
3965
3966                                 proc_fdlock(p);
3967                                 procfdtbl_releasefd(p, indx, NULL);
3968                                 fp_drop(p, indx, fp, 1);
3969                                 proc_fdunlock(p);
3970
3971                                 FREE(ndp, M_TEMP);
3972                         }
3973                 }
3974         }
3975 #if CONFIG_MACF
3976         else {
3977                 /*
3978                  * We are here because we were told that the MAC label will
3979                  * be transitioned, and the binary is not VSUID or VSGID; to
3980                  * deal with this case, we could either duplicate a lot of
3981                  * code, or we can indicate we want to default the P_SUGID
3982                  * bit clear and jump back up.
3983                  */
3984                 if (mac_transition) {
3985                         leave_sugid_clear = 1;
3986                         goto handle_mac_transition;
3987                 }
3988         }
3989
3990 #endif  /* CONFIG_MACF */
3991
3992         /*
3993          * Implement the semantic where the effective user and group become
3994          * the saved user and group in exec'ed programs.
3995          */
3996         p->p_ucred = kauth_cred_setsvuidgid(p->p_ucred, kauth_cred_getuid(p->p_ucred),  kauth_cred_getgid(p->p_ucred));
3997         /* update cred on proc */
3998         PROC_UPDATE_CREDS_ONPROC(p);
3999
4000         /* Update the process' identity version and set the security token */
4001         p->p_idversion++;
4002         set_security_token(p);
4003
4004         return(error);
4005 }
4006
4007
4008 /*
4009  * create_unix_stack
4010  *
4011  * Description: Set the user stack address for the process to the provided
4012  *              address.  If a custom stack was not set as a result of the
4013  *              load process (i.e. as specified by the image file for the
4014  *              executable), then allocate the stack in the provided map and
4015  *              set up appropriate guard pages for enforcing administrative
4016  *              limits on stack growth, if they end up being needed.
4017  *
4018  * Parameters:  p                       Process to set stack on
4019  *              load_result             Information from mach-o load commands
4020  *              map                     Address map in which to allocate the new stack
4021  *
4022  * Returns:     KERN_SUCCESS            Stack successfully created
4023  *              !KERN_SUCCESS           Mach failure code
4024  */
4025 static kern_return_t
4026 create_unix_stack(vm_map_t map, load_result_t* load_result,
4027                         proc_t p)
4028 {
4029         mach_vm_size_t          size, prot_size;
4030         mach_vm_offset_t        addr, prot_addr;
4031         kern_return_t           kr;
4032
4033         mach_vm_address_t       user_stack = load_result->user_stack;
4034
4035         proc_lock(p);
4036         p->user_stack = user_stack;
4037         proc_unlock(p);
4038
4039         if (!load_result->prog_allocated_stack) {
4040                 /*
4041                  * Allocate enough space for the maximum stack size we
4042                  * will ever authorize and an extra page to act as
4043                  * a guard page for stack overflows. For default stacks,
4044                  * vm_initial_limit_stack takes care of the extra guard page.
4045                  * Otherwise we must allocate it ourselves.
4046                  */
4047
4048                 size = mach_vm_round_page(load_result->user_stack_size);
4049                 if (load_result->prog_stack_size)
4050                         size += PAGE_SIZE;
4051                 addr = mach_vm_trunc_page(load_result->user_stack - size);
4052                 kr = mach_vm_allocate(map, &addr, size,
4053                                         VM_MAKE_TAG(VM_MEMORY_STACK) |
4054                                         VM_FLAGS_FIXED);
4055                 if (kr != KERN_SUCCESS) {
4056                         /* If can't allocate at default location, try anywhere */
4057                         addr = 0;
4058                         kr = mach_vm_allocate(map, &addr, size,
4059                                                                   VM_MAKE_TAG(VM_MEMORY_STACK) |
4060                                                                   VM_FLAGS_ANYWHERE);
4061                         if (kr != KERN_SUCCESS)
4062                                 return kr;
4063
4064                         user_stack = addr + size;
4065                         load_result->user_stack = user_stack;
4066
4067                         proc_lock(p);
4068                         p->user_stack = user_stack;
4069                         proc_unlock(p);
4070                 }
4071
4072                 /*
4073                  * And prevent access to what's above the current stack
4074                  * size limit for this process.
4075                  */
4076                 prot_addr = addr;
4077                 if (load_result->prog_stack_size)
4078                         prot_size = PAGE_SIZE;
4079                 else
4080                         prot_size = mach_vm_trunc_page(size - unix_stack_size(p));
4081                 kr = mach_vm_protect(map,
4082                                                          prot_addr,
4083                                                          prot_size,
4084                                                          FALSE,
4085                                                          VM_PROT_NONE);
4086                 if (kr != KERN_SUCCESS) {
4087                         (void) mach_vm_deallocate(map, addr, size);
4088                         return kr;
4089                 }
4090         }
4091
4092         return KERN_SUCCESS;
4093 }
4094
4095 #include <sys/reboot.h>
4096
4097 /*
4098  * load_init_program_at_path
4099  *
4100  * Description: Load the "init" program; in most cases, this will be "launchd"
4101  *
4102  * Parameters:  p                       Process to call execve() to create
4103  *                                      the "init" program
4104  *              scratch_addr            Page in p, scratch space
4105  *              path                    NULL terminated path
4106  *
4107  * Returns:     KERN_SUCCESS            Success
4108  *              !KERN_SUCCESS           See execve/mac_execve for error codes
4109  *
4110  * Notes:       The process that is passed in is the first manufactured
4111  *              process on the system, and gets here via bsd_ast() firing
4112  *              for the first time.  This is done to ensure that bsd_init()
4113  *              has run to completion.
4114  *
4115  *              The address map of the first manufactured process is 32 bit.
4116  *              WHEN this becomes 64b, this code will fail; it needs to be
4117  *              made 64b capable.
4118  */
4119 static int
4120 load_init_program_at_path(proc_t p, user_addr_t scratch_addr, const char* path)
4121 {
4122         uint32_t argv[3];
4123         uint32_t argc = 0;
4124         int retval[2];
4125         struct execve_args init_exec_args;
4126
4127         /*
4128          * Validate inputs and pre-conditions
4129          */
4130         assert(p);
4131         assert(scratch_addr);
4132         assert(path);
4133
4134         if (IS_64BIT_PROCESS(p)) {
4135                 panic("Init against 64b primordial proc not implemented");
4136         }
4137
4138         /*
4139          * Copy out program name.
4140          */
4141         size_t path_length = strlen(path) + 1;
4142         (void) copyout(path, scratch_addr, path_length);
4143
4144         argv[argc++] = (uint32_t)scratch_addr;
4145         scratch_addr = USER_ADDR_ALIGN(scratch_addr + path_length, 16);
4146
4147         /*
4148          * Put out first (and only) argument, similarly.
4149          * Assumes everything fits in a page as allocated above.
4150          */
4151         if (boothowto & RB_SINGLE) {
4152                 const char *init_args = "-s";
4153                 size_t init_args_length = strlen(init_args)+1;
4154
4155                 copyout(init_args, scratch_addr, init_args_length);
4156
4157                 argv[argc++] = (uint32_t)scratch_addr;
4158                 scratch_addr = USER_ADDR_ALIGN(scratch_addr + init_args_length, 16);
4159         }
4160
4161         /*
4162          * Null-end the argument list
4163          */
4164         argv[argc] = 0;
4165
4166         /*
4167          * Copy out the argument list.
4168          */
4169         (void) copyout(argv, scratch_addr, sizeof(argv));
4170
4171         /*
4172          * Set up argument block for fake call to execve.
4173          */
4174         init_exec_args.fname = CAST_USER_ADDR_T(argv[0]);
4175         init_exec_args.argp = scratch_addr;
4176         init_exec_args.envp = USER_ADDR_NULL;
4177
4178         /*
4179          * So that init task is set with uid,gid 0 token
4180          */
4181         set_security_token(p);
4182
4183         return execve(p, &init_exec_args, retval);
4184 }
4185
4186 static const char * init_programs[] = {
4187 #if DEBUG
4188         "/usr/local/sbin/launchd.debug",
4189 #endif
4190 #if DEVELOPMENT || DEBUG
4191         /* Remove DEBUG conditional when <rdar://problem/17931977> is fixed */
4192         "/usr/local/sbin/launchd.development",
4193 #endif
4194         "/sbin/launchd",
4195 };
4196
4197 /*
4198  * load_init_program
4199  *
4200  * Description: Load the "init" program; in most cases, this will be "launchd"
4201  *
4202  * Parameters:  p                       Process to call execve() to create
4203  *                                      the "init" program
4204  *
4205  * Returns:     (void)
4206  *
4207  * Notes:       The process that is passed in is the first manufactured
4208  *              process on the system, and gets here via bsd_ast() firing
4209  *              for the first time.  This is done to ensure that bsd_init()
4210  *              has run to completion.
4211  *
4212  *              In DEBUG & DEVELOPMENT builds, the launchdsuffix boot-arg
4213  *              may be used to select a specific launchd executable. As with
4214  *              the kcsuffix boot-arg, setting launchdsuffix to "" or "release"
4215  *              will force /sbin/launchd to be selected.
4216  *
4217  *              The DEBUG kernel will continue to check for a .development
4218  *              version until <rdar://problem/17931977> is fixed.
4219  *
4220  *              Search order by build:
4221  *
4222  * DEBUG        DEVELOPMENT     RELEASE         PATH
4223  * ----------------------------------------------------------------------------------
4224  * 1            1               NA              /usr/local/sbin/launchd.$LAUNCHDSUFFIX
4225  * 2            NA              NA              /usr/local/sbin/launchd.debug
4226  * 3            2               NA              /usr/local/sbin/launchd.development
4227  * 4            3               1               /sbin/launchd
4228  */
4229 void
4230 load_init_program(proc_t p)
4231 {
4232         uint32_t i;
4233         int error;
4234         vm_offset_t scratch_addr = VM_MIN_ADDRESS;
4235
4236         (void) vm_allocate(current_map(), &scratch_addr, PAGE_SIZE, VM_FLAGS_ANYWHERE);
4237 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
4238         (void) memorystatus_init_at_boot_snapshot();
4239 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
4240
4241 #if DEBUG || DEVELOPMENT
4242         /* Check for boot-arg suffix first */
4243         char launchd_suffix[64];
4244         if (PE_parse_boot_argn("launchdsuffix", launchd_suffix, sizeof(launchd_suffix))) {
4245                 char launchd_path[128];
4246                 boolean_t is_release_suffix = ((launchd_suffix[0] == 0) ||
4247                                                (strcmp(launchd_suffix, "release") == 0));
4248
4249                 if (is_release_suffix) {
4250                         error = load_init_program_at_path(p, CAST_USER_ADDR_T(scratch_addr), "/sbin/launchd");
4251                         if (!error)
4252                                 return;
4253
4254                         panic("Process 1 exec of launchd.release failed, errno %d", error);
4255                 } else {
4256                         strlcpy(launchd_path, "/usr/local/sbin/launchd.", sizeof(launchd_path));
4257                         strlcat(launchd_path, launchd_suffix, sizeof(launchd_path));
4258
4259                         /* All the error data is lost in the loop below, don't
4260                          * attempt to save it. */
4261                         if (!load_init_program_at_path(p, CAST_USER_ADDR_T(scratch_addr), launchd_path)) {
4262                                 return;
4263                         }
4264                 }
4265         }
4266 #endif
4267
4268         error = ENOENT;
4269         for (i = 0; i < sizeof(init_programs)/sizeof(init_programs[0]); i++) {
4270                 error = load_init_program_at_path(p, CAST_USER_ADDR_T(scratch_addr), init_programs[i]);
4271                 if (!error)
4272                         return;
4273         }
4274
4275         panic("Process 1 exec of %s failed, errno %d", ((i == 0) ? "<null>" : init_programs[i-1]), error);
4276 }
4277
4278 /*
4279  * load_return_to_errno
4280  *
4281  * Description: Convert a load_return_t (Mach error) to an errno (BSD error)
4282  *
4283  * Parameters:  lrtn                    Mach error number
4284  *
4285  * Returns:     (int)                   BSD error number
4286  *              0                       Success
4287  *              EBADARCH                Bad architecture
4288  *              EBADMACHO               Bad Mach object file
4289  *              ESHLIBVERS              Bad shared library version
4290  *              ENOMEM                  Out of memory/resource shortage
4291  *              EACCES                  Access denied
4292  *              ENOENT                  Entry not found (usually "file does
4293  *                                      does not exist")
4294  *              EIO                     An I/O error occurred
4295  *              EBADEXEC                The executable is corrupt/unknown
4296  */
4297 static int
4298 load_return_to_errno(load_return_t lrtn)
4299 {
4300         switch (lrtn) {
4301         case LOAD_SUCCESS:
4302                 return 0;
4303         case LOAD_BADARCH:
4304                 return EBADARCH;
4305         case LOAD_BADMACHO:
4306                 return EBADMACHO;
4307         case LOAD_SHLIB:
4308                 return ESHLIBVERS;
4309         case LOAD_NOSPACE:
4310         case LOAD_RESOURCE:
4311                 return ENOMEM;
4312         case LOAD_PROTECT:
4313                 return EACCES;
4314         case LOAD_ENOENT:
4315                 return ENOENT;
4316         case LOAD_IOERROR:
4317                 return EIO;
4318         case LOAD_FAILURE:
4319         case LOAD_DECRYPTFAIL:
4320         default:
4321                 return EBADEXEC;
4322         }
4323 }
4324
4325 #include <mach/mach_types.h>
4326 #include <mach/vm_prot.h>
4327 #include <mach/semaphore.h>
4328 #include <mach/sync_policy.h>
4329 #include <kern/clock.h>
4330 #include <mach/kern_return.h>
4331
4332 /*
4333  * execargs_alloc
4334  *
4335  * Description: Allocate the block of memory used by the execve arguments.
4336  *              At the same time, we allocate a page so that we can read in
4337  *              the first page of the image.
4338  *
4339  * Parameters:  struct image_params *   the image parameter block
4340  *
4341  * Returns:     0                       Success
4342  *              EINVAL                  Invalid argument
4343  *              EACCES                  Permission denied
4344  *              EINTR                   Interrupted function
4345  *              ENOMEM                  Not enough space
4346  *
4347  * Notes:       This is a temporary allocation into the kernel address space
4348  *              to enable us to copy arguments in from user space.  This is
4349  *              necessitated by not mapping the process calling execve() into
4350  *              the kernel address space during the execve() system call.
4351  *
4352  *              We assemble the argument and environment, etc., into this
4353  *              region before copying it as a single block into the child
4354  *              process address space (at the top or bottom of the stack,
4355  *              depending on which way the stack grows; see the function
4356  *              exec_copyout_strings() for details).
4357  *
4358  *              This ends up with a second (possibly unnecessary) copy compared
4359  *              with assembing the data directly into the child address space,
4360  *              instead, but since we cannot be guaranteed that the parent has
4361  *              not modified its environment, we can't really know that it's
4362  *              really a block there as well.
4363  */
4364
4365
4366 static int execargs_waiters = 0;
4367 lck_mtx_t *execargs_cache_lock;
4368
4369 static void
4370 execargs_lock_lock(void) {
4371         lck_mtx_lock_spin(execargs_cache_lock);
4372 }
4373
4374 static void
4375 execargs_lock_unlock(void) {
4376         lck_mtx_unlock(execargs_cache_lock);
4377 }
4378
4379 static wait_result_t
4380 execargs_lock_sleep(void) {
4381         return(lck_mtx_sleep(execargs_cache_lock, LCK_SLEEP_DEFAULT, &execargs_free_count, THREAD_INTERRUPTIBLE));
4382 }
4383
4384 static kern_return_t
4385 execargs_purgeable_allocate(char **execarg_address) {
4386         kern_return_t kr = vm_allocate(bsd_pageable_map, (vm_offset_t *)execarg_address, BSD_PAGEABLE_SIZE_PER_EXEC, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
4387         assert(kr == KERN_SUCCESS);
4388         return kr;
4389 }
4390
4391 static kern_return_t
4392 execargs_purgeable_reference(void *execarg_address) {
4393         int state = VM_PURGABLE_NONVOLATILE;
4394         kern_return_t kr = vm_purgable_control(bsd_pageable_map, (vm_offset_t) execarg_address, VM_PURGABLE_SET_STATE, &state);
4395
4396         assert(kr == KERN_SUCCESS);
4397         return kr;
4398 }
4399
4400 static kern_return_t
4401 execargs_purgeable_volatilize(void *execarg_address) {
4402         int state = VM_PURGABLE_VOLATILE | VM_PURGABLE_ORDERING_OBSOLETE;
4403         kern_return_t kr;
4404         kr = vm_purgable_control(bsd_pageable_map, (vm_offset_t) execarg_address, VM_PURGABLE_SET_STATE, &state);
4405
4406         assert(kr == KERN_SUCCESS);
4407
4408         return kr;
4409 }
4410
4411 static void
4412 execargs_wakeup_waiters(void) {
4413         thread_wakeup(&execargs_free_count);
4414 }
4415
4416 static int
4417 execargs_alloc(struct image_params *imgp)
4418 {
4419         kern_return_t kret;
4420         wait_result_t res;
4421         int i, cache_index = -1;
4422
4423         execargs_lock_lock();
4424
4425         while (execargs_free_count == 0) {
4426                 execargs_waiters++;
4427                 res = execargs_lock_sleep();
4428                 execargs_waiters--;
4429                 if (res != THREAD_AWAKENED) {
4430                         execargs_lock_unlock();
4431                         return (EINTR);
4432                 }
4433         }
4434
4435         execargs_free_count--;
4436
4437         for (i = 0; i < execargs_cache_size; i++) {
4438                 vm_offset_t element = execargs_cache[i];
4439                 if (element) {
4440                         cache_index = i;
4441                         imgp->ip_strings = (char *)(execargs_cache[i]);
4442                         execargs_cache[i] = 0;
4443                         break;
4444                 }
4445         }
4446
4447         assert(execargs_free_count >= 0);
4448
4449         execargs_lock_unlock();
4450
4451         if (cache_index == -1) {
4452                 kret = execargs_purgeable_allocate(&imgp->ip_strings);
4453         }
4454         else
4455                 kret = execargs_purgeable_reference(imgp->ip_strings);
4456
4457         assert(kret == KERN_SUCCESS);
4458         if (kret != KERN_SUCCESS) {
4459                 return (ENOMEM);
4460         }
4461
4462         /* last page used to read in file headers */
4463         imgp->ip_vdata = imgp->ip_strings + ( NCARGS + PAGE_SIZE );
4464         imgp->ip_strendp = imgp->ip_strings;
4465         imgp->ip_argspace = NCARGS;
4466         imgp->ip_strspace = ( NCARGS + PAGE_SIZE );
4467
4468         return (0);
4469 }
4470
4471 /*
4472  * execargs_free
4473  *
4474  * Description: Free the block of memory used by the execve arguments and the
4475  *              first page of the executable by a previous call to the function
4476  *              execargs_alloc().
4477  *
4478  * Parameters:  struct image_params *   the image parameter block
4479  *
4480  * Returns:     0                       Success
4481  *              EINVAL                  Invalid argument
4482  *              EINTR                   Oeration interrupted
4483  */
4484 static int
4485 execargs_free(struct image_params *imgp)
4486 {
4487         kern_return_t kret;
4488         int i;
4489         boolean_t needs_wakeup = FALSE;
4490
4491         kret = execargs_purgeable_volatilize(imgp->ip_strings);
4492
4493         execargs_lock_lock();
4494         execargs_free_count++;
4495
4496         for (i = 0; i < execargs_cache_size; i++) {
4497                 vm_offset_t element = execargs_cache[i];
4498                 if (element == 0) {
4499                         execargs_cache[i] = (vm_offset_t) imgp->ip_strings;
4500                         imgp->ip_strings = NULL;
4501                         break;
4502                 }
4503         }
4504
4505         assert(imgp->ip_strings == NULL);
4506
4507         if (execargs_waiters > 0)
4508                 needs_wakeup = TRUE;
4509
4510         execargs_lock_unlock();
4511
4512         if (needs_wakeup == TRUE)
4513                 execargs_wakeup_waiters();
4514
4515         return ((kret == KERN_SUCCESS ? 0 : EINVAL));
4516 }
4517
4518 static void
4519 exec_resettextvp(proc_t p, struct image_params *imgp)
4520 {
4521         vnode_t vp;
4522         off_t offset;
4523         vnode_t tvp  = p->p_textvp;
4524         int ret;
4525
4526         vp = imgp->ip_vp;
4527         offset = imgp->ip_arch_offset;
4528
4529         if (vp == NULLVP)
4530                 panic("exec_resettextvp: expected valid vp");
4531
4532         ret = vnode_ref(vp);
4533         proc_lock(p);
4534         if (ret == 0) {
4535                 p->p_textvp = vp;
4536                 p->p_textoff = offset;
4537         } else {
4538                 p->p_textvp = NULLVP;   /* this is paranoia */
4539                 p->p_textoff = 0;
4540         }
4541         proc_unlock(p);
4542
4543         if ( tvp != NULLVP) {
4544                 if (vnode_getwithref(tvp) == 0) {
4545                         vnode_rele(tvp);
4546                         vnode_put(tvp);
4547                 }
4548         }
4549
4550 }
4551
4552 /*
4553  * If the process is not signed or if it contains entitlements, we
4554  * need to communicate through the task_access_port to taskgated.
4555  *
4556  * taskgated will provide a detached code signature if present, and
4557  * will enforce any restrictions on entitlements.
4558  */
4559
4560 static boolean_t
4561 taskgated_required(proc_t p, boolean_t *require_success)
4562 {
4563         size_t length;
4564         void *blob;
4565         int error;
4566
4567         if (cs_debug > 2)
4568                 csvnode_print_debug(p->p_textvp);
4569
4570         const int can_skip_taskgated = csproc_get_platform_binary(p) && !csproc_get_platform_path(p);
4571         if (can_skip_taskgated) {
4572                 if (cs_debug) printf("taskgated not required for: %s\n", p->p_name);
4573                 *require_success = FALSE;
4574                 return FALSE;
4575         }
4576
4577         if ((p->p_csflags & CS_VALID) == 0) {
4578                 *require_success = FALSE;
4579                 return TRUE;
4580         }
4581
4582         error = cs_entitlements_blob_get(p, &blob, &length);
4583         if (error == 0 && blob != NULL) {
4584                 /*
4585                  * fatal on the desktop when entitlements are present,
4586                  * unless we started in single-user mode
4587                  */
4588                 if ((boothowto & RB_SINGLE) == 0)
4589                         *require_success = TRUE;
4590                 /*
4591                  * Allow initproc to run without causing taskgated to launch
4592                  */
4593                 if (p == initproc) {
4594                         *require_success = FALSE;
4595                         return FALSE;
4596                 }
4597
4598                 if (cs_debug) printf("taskgated required for: %s\n", p->p_name);
4599
4600                 return TRUE;
4601         }
4602
4603         *require_success = FALSE;
4604         return FALSE;
4605 }
4606
4607 /*
4608  * __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__
4609  *
4610  * Description: Waits for the userspace daemon to respond to the request
4611  *              we made. Function declared non inline to be visible in
4612  *              stackshots and spindumps as well as debugging.
4613  */
4614 __attribute__((noinline)) int
4615 __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__(mach_port_t task_access_port, int32_t new_pid)
4616 {
4617         return find_code_signature(task_access_port, new_pid);
4618 }
4619
4620 static int
4621 check_for_signature(proc_t p, struct image_params *imgp)
4622 {
4623         mach_port_t port = NULL;
4624         kern_return_t kr = KERN_FAILURE;
4625         int error = EACCES;
4626         boolean_t unexpected_failure = FALSE;
4627         unsigned char hash[SHA1_RESULTLEN];
4628         boolean_t require_success = FALSE;
4629         int spawn = (imgp->ip_flags & IMGPF_SPAWN);
4630         int vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC);
4631
4632         /*
4633          * Override inherited code signing flags with the
4634          * ones for the process that is being successfully
4635          * loaded
4636          */
4637         proc_lock(p);
4638         p->p_csflags = imgp->ip_csflags;
4639         proc_unlock(p);
4640
4641         /* Set the switch_protect flag on the map */
4642         if(p->p_csflags & (CS_HARD|CS_KILL)) {
4643                 vm_map_switch_protect(get_task_map(p->task), TRUE);
4644         }
4645
4646         /*
4647          * image activation may be failed due to policy
4648          * which is unexpected but security framework does not
4649          * approve of exec, kill and return immediately.
4650          */
4651         if (imgp->ip_mac_return != 0) {
4652                 error = imgp->ip_mac_return;
4653                 unexpected_failure = TRUE;
4654                 goto done;
4655         }
4656
4657         /* check if callout to taskgated is needed */
4658         if (!taskgated_required(p, &require_success)) {
4659                 error = 0;
4660                 goto done;
4661         }
4662
4663         kr = task_get_task_access_port(p->task, &port);
4664         if (KERN_SUCCESS != kr || !IPC_PORT_VALID(port)) {
4665                 error = 0;
4666                 if (require_success)
4667                         error = EACCES;
4668                 goto done;
4669         }
4670
4671         /*
4672          * taskgated returns KERN_SUCCESS if it has completed its work
4673          * and the exec should continue, KERN_FAILURE if the exec should
4674          * fail, or it may error out with different error code in an
4675          * event of mig failure (e.g. process was signalled during the
4676          * rpc call, taskgated died, mig server died etc.).
4677          */
4678
4679         kr = __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__(port, p->p_pid);
4680         switch (kr) {
4681         case KERN_SUCCESS:
4682                 error = 0;
4683                 break;
4684         case KERN_FAILURE:
4685                 error = EACCES;
4686                 goto done;
4687         default:
4688                 error = EACCES;
4689                 unexpected_failure = TRUE;
4690                 goto done;
4691         }
4692
4693         /* Only do this if exec_resettextvp() did not fail */
4694         if (p->p_textvp != NULLVP) {
4695                 /*
4696                  * If there's a new code directory, mark this process
4697                  * as signed.
4698                  */
4699                 if (0 == ubc_cs_getcdhash(p->p_textvp, p->p_textoff, hash)) {
4700                         proc_lock(p);
4701                         p->p_csflags |= CS_VALID;
4702                         proc_unlock(p);
4703                 }
4704         }
4705
4706 done:
4707         if (0 != error) {
4708                 if (!unexpected_failure)
4709                         p->p_csflags |= CS_KILLED;
4710                 /* make very sure execution fails */
4711                 if (vfexec || spawn) {
4712                         psignal_vfork(p, p->task, imgp->ip_new_thread, SIGKILL);
4713                         error = 0;
4714                 } else {
4715                         psignal(p, SIGKILL);
4716                 }
4717         }
4718         return error;
4719 }
4720
4721 /*
4722  * Typically as soon as we start executing this process, the
4723  * first instruction will trigger a VM fault to bring the text
4724  * pages (as executable) into the address space, followed soon
4725  * thereafter by dyld data structures (for dynamic executable).
4726  * To optimize this, as well as improve support for hardware
4727  * debuggers that can only access resident pages present
4728  * in the process' page tables, we prefault some pages if
4729  * possible. Errors are non-fatal.
4730  */
4731 static void exec_prefault_data(proc_t p __unused, struct image_params *imgp, load_result_t *load_result)
4732 {
4733         int ret;
4734         size_t expected_all_image_infos_size;
4735
4736         /*
4737          * Prefault executable or dyld entry point.
4738          */
4739         vm_fault(current_map(),
4740                  vm_map_trunc_page(load_result->entry_point,
4741                                    vm_map_page_mask(current_map())),
4742                  VM_PROT_READ | VM_PROT_EXECUTE,
4743                  FALSE,
4744                  THREAD_UNINT, NULL, 0);
4745
4746         if (imgp->ip_flags & IMGPF_IS_64BIT) {
4747                 expected_all_image_infos_size = sizeof(struct user64_dyld_all_image_infos);
4748         } else {
4749                 expected_all_image_infos_size = sizeof(struct user32_dyld_all_image_infos);
4750         }
4751
4752         /* Decode dyld anchor structure from <mach-o/dyld_images.h> */
4753         if (load_result->dynlinker &&
4754                 load_result->all_image_info_addr &&
4755                 load_result->all_image_info_size >= expected_all_image_infos_size) {
4756                 union {
4757                         struct user64_dyld_all_image_infos      infos64;
4758                         struct user32_dyld_all_image_infos      infos32;
4759                 } all_image_infos;
4760
4761                 /*
4762                  * Pre-fault to avoid copyin() going through the trap handler
4763                  * and recovery path.
4764                  */
4765                 vm_fault(current_map(),
4766                          vm_map_trunc_page(load_result->all_image_info_addr,
4767                                            vm_map_page_mask(current_map())),
4768                          VM_PROT_READ | VM_PROT_WRITE,
4769                          FALSE,
4770                          THREAD_UNINT, NULL, 0);
4771                 if ((load_result->all_image_info_addr & PAGE_MASK) + expected_all_image_infos_size > PAGE_SIZE) {
4772                         /* all_image_infos straddles a page */
4773                         vm_fault(current_map(),
4774                                  vm_map_trunc_page(load_result->all_image_info_addr + expected_all_image_infos_size - 1,
4775                                                    vm_map_page_mask(current_map())),
4776                                  VM_PROT_READ | VM_PROT_WRITE,
4777                                  FALSE,
4778                                  THREAD_UNINT, NULL, 0);
4779                 }
4780
4781                 ret = copyin(load_result->all_image_info_addr,
4782                                          &all_image_infos,
4783                                          expected_all_image_infos_size);
4784                 if (ret == 0 && all_image_infos.infos32.version >= 9) {
4785
4786                         user_addr_t notification_address;
4787                         user_addr_t dyld_image_address;
4788                         user_addr_t dyld_version_address;
4789                         user_addr_t dyld_all_image_infos_address;
4790                         user_addr_t dyld_slide_amount;
4791
4792                         if (imgp->ip_flags & IMGPF_IS_64BIT) {
4793                                 notification_address = all_image_infos.infos64.notification;
4794                                 dyld_image_address = all_image_infos.infos64.dyldImageLoadAddress;
4795                                 dyld_version_address = all_image_infos.infos64.dyldVersion;
4796                                 dyld_all_image_infos_address = all_image_infos.infos64.dyldAllImageInfosAddress;
4797                         } else {
4798                                 notification_address = all_image_infos.infos32.notification;
4799                                 dyld_image_address = all_image_infos.infos32.dyldImageLoadAddress;
4800                                 dyld_version_address = all_image_infos.infos32.dyldVersion;
4801                                 dyld_all_image_infos_address = all_image_infos.infos32.dyldAllImageInfosAddress;
4802                         }
4803
4804                         /*
4805                          * dyld statically sets up the all_image_infos in its Mach-O
4806                          * binary at static link time, with pointers relative to its default
4807                          * load address. Since ASLR might slide dyld before its first
4808                          * instruction is executed, "dyld_slide_amount" tells us how far
4809                          * dyld was loaded compared to its default expected load address.
4810                          * All other pointers into dyld's image should be adjusted by this
4811                          * amount. At some point later, dyld will fix up pointers to take
4812                          * into account the slide, at which point the all_image_infos_address
4813                          * field in the structure will match the runtime load address, and
4814                          * "dyld_slide_amount" will be 0, if we were to consult it again.
4815                          */
4816
4817                         dyld_slide_amount = load_result->all_image_info_addr - dyld_all_image_infos_address;
4818
4819 #if 0
4820                         kprintf("exec_prefault: 0x%016llx 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
4821                                         (uint64_t)load_result->all_image_info_addr,
4822                                         all_image_infos.infos32.version,
4823                                         (uint64_t)notification_address,
4824                                         (uint64_t)dyld_image_address,
4825                                         (uint64_t)dyld_version_address,
4826                                         (uint64_t)dyld_all_image_infos_address);
4827 #endif
4828
4829                         vm_fault(current_map(),
4830                                  vm_map_trunc_page(notification_address + dyld_slide_amount,
4831                                                    vm_map_page_mask(current_map())),
4832                                  VM_PROT_READ | VM_PROT_EXECUTE,
4833                                  FALSE,
4834                                  THREAD_UNINT, NULL, 0);
4835                         vm_fault(current_map(),
4836                                  vm_map_trunc_page(dyld_image_address + dyld_slide_amount,
4837                                                    vm_map_page_mask(current_map())),
4838                                  VM_PROT_READ | VM_PROT_EXECUTE,
4839                                  FALSE,
4840                                  THREAD_UNINT, NULL, 0);
4841                         vm_fault(current_map(),
4842                                  vm_map_trunc_page(dyld_version_address + dyld_slide_amount,
4843                                                    vm_map_page_mask(current_map())),
4844                                  VM_PROT_READ,
4845                                  FALSE,
4846                                  THREAD_UNINT, NULL, 0);
4847                         vm_fault(current_map(),
4848                                  vm_map_trunc_page(dyld_all_image_infos_address + dyld_slide_amount,
4849                                                    vm_map_page_mask(current_map())),
4850                                  VM_PROT_READ | VM_PROT_WRITE,
4851                                  FALSE,
4852                                  THREAD_UNINT, NULL, 0);
4853                 }
4854         }
4855 }