bsd/kern/kern_exec.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Mach Operating System
  31  * Copyright (c) 1987 Carnegie-Mellon University
  32  * All rights reserved.  The CMU software License Agreement specifies
  33  * the terms and conditions for use and redistribution.
  34  */
  35
  36 #include <cputypes.h>
  37
  38 /*-
  39  * Copyright (c) 1982, 1986, 1991, 1993
  40  *      The Regents of the University of California.  All rights reserved.
  41  * (c) UNIX System Laboratories, Inc.
  42  * All or some portions of this file are derived from material licensed
  43  * to the University of California by American Telephone and Telegraph
  44  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  45  * the permission of UNIX System Laboratories, Inc.
  46  *
  47  * Redistribution and use in source and binary forms, with or without
  48  * modification, are permitted provided that the following conditions
  49  * are met:
  50  * 1. Redistributions of source code must retain the above copyright
  51  *    notice, this list of conditions and the following disclaimer.
  52  * 2. Redistributions in binary form must reproduce the above copyright
  53  *    notice, this list of conditions and the following disclaimer in the
  54  *    documentation and/or other materials provided with the distribution.
  55  * 3. All advertising materials mentioning features or use of this software
  56  *    must display the following acknowledgement:
  57  *      This product includes software developed by the University of
  58  *      California, Berkeley and its contributors.
  59  * 4. Neither the name of the University nor the names of its contributors
  60  *    may be used to endorse or promote products derived from this software
  61  *    without specific prior written permission.
  62  *
  63  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  64  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  65  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  66  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  67  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  68  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  69  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  70  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  71  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  72  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  73  * SUCH DAMAGE.
  74  *
  75  *      from: @(#)kern_exec.c   8.1 (Berkeley) 6/10/93
  76  */
  77 /*
  78  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  79  * support for mandatory and extensible security protections.  This notice
  80  * is included in support of clause 2.2 (b) of the Apple Public License,
  81  * Version 2.0.
  82  */
  83 #include <machine/reg.h>
  84
  85 #include <sys/param.h>
  86 #include <sys/systm.h>
  87 #include <sys/filedesc.h>
  88 #include <sys/kernel.h>
  89 #include <sys/proc_internal.h>
  90 #include <sys/kauth.h>
  91 #include <sys/user.h>
  92 #include <sys/socketvar.h>
  93 #include <sys/malloc.h>
  94 #include <sys/namei.h>
  95 #include <sys/mount_internal.h>
  96 #include <sys/vnode_internal.h>
  97 #include <sys/file_internal.h>
  98 #include <sys/stat.h>
  99 #include <sys/uio_internal.h>
 100 #include <sys/acct.h>
 101 #include <sys/exec.h>
 102 #include <sys/kdebug.h>
 103 #include <sys/signal.h>
 104 #include <sys/aio_kern.h>
 105 #include <sys/sysproto.h>
 106 #if SYSV_SHM
 107 #include <sys/shm_internal.h>           /* shmexec() */
 108 #endif
 109 #include <sys/ubc_internal.h>           /* ubc_map() */
 110 #include <sys/spawn.h>
 111 #include <sys/spawn_internal.h>
 112 #include <sys/codesign.h>
 113 #include <crypto/sha1.h>
 114
 115 #include <security/audit/audit.h>
 116
 117 #include <ipc/ipc_types.h>
 118
 119 #include <mach/mach_types.h>
 120 #include <mach/port.h>
 121 #include <mach/task.h>
 122 #include <mach/task_access.h>
 123 #include <mach/thread_act.h>
 124 #include <mach/vm_map.h>
 125 #include <mach/mach_vm.h>
 126 #include <mach/vm_param.h>
 127
 128 #include <kern/sched_prim.h> /* thread_wakeup() */
 129 #include <kern/affinity.h>
 130 #include <kern/assert.h>
 131
 132 #if CONFIG_MACF
 133 #include <security/mac.h>
 134 #include <security/mac_mach_internal.h>
 135 #endif
 136
 137 #include <vm/vm_map.h>
 138 #include <vm/vm_kern.h>
 139 #include <vm/vm_protos.h>
 140 #include <vm/vm_kern.h>
 141
 142
 143 #if CONFIG_DTRACE
 144 /* Do not include dtrace.h, it redefines kmem_[alloc/free] */
 145 extern void (*dtrace_fasttrap_exec_ptr)(proc_t);
 146 extern void (*dtrace_helpers_cleanup)(proc_t);
 147 extern void dtrace_lazy_dofs_destroy(proc_t);
 148
 149 #include <sys/dtrace_ptss.h>
 150 #endif
 151
 152 /* support for child creation in exec after vfork */
 153 thread_t fork_create_child(task_t parent_task, proc_t child_proc, int inherit_memory, int is64bit);
 154 void vfork_exit(proc_t p, int rv);
 155 int setsigvec(proc_t, thread_t, int, struct __kern_sigaction *, boolean_t in_sigstart);
 156 void workqueue_exit(struct proc *);
 157
 158
 159 /*
 160  * Mach things for which prototypes are unavailable from Mach headers
 161  */
 162 void            ipc_task_reset(
 163                         task_t          task);
 164 void            ipc_thread_reset(
 165                         thread_t        thread);
 166 kern_return_t ipc_object_copyin(
 167         ipc_space_t             space,
 168         mach_port_name_t        name,
 169         mach_msg_type_name_t    msgt_name,
 170         ipc_object_t            *objectp);
 171 void ipc_port_release_send(ipc_port_t);
 172
 173 extern struct savearea *get_user_regs(thread_t);
 174
 175
 176 #include <kern/thread.h>
 177 #include <kern/task.h>
 178 #include <kern/ast.h>
 179 #include <kern/mach_loader.h>
 180 #include <kern/mach_fat.h>
 181 #include <mach-o/fat.h>
 182 #include <mach-o/loader.h>
 183 #include <machine/vmparam.h>
 184 #include <sys/imgact.h>
 185
 186 #include <sys/sdt.h>
 187
 188
 189 /*
 190  * SIZE_MAXPTR          The maximum size of a user space pointer, in bytes
 191  * SIZE_IMG_STRSPACE    The available string space, minus two pointers; we
 192  *                      define it interms of the maximum, since we don't
 193  *                      know the pointer size going in, until after we've
 194  *                      parsed the executable image.
 195  */
 196 #define SIZE_MAXPTR             8                               /* 64 bits */
 197 #define SIZE_IMG_STRSPACE       (NCARGS - 2 * SIZE_MAXPTR)
 198
 199 /*
 200  * EAI_ITERLIMIT        The maximum number of times to iterate an image
 201  *                      activator in exec_activate_image() before treating
 202  *                      it as malformed/corrupt.
 203  */
 204 #define EAI_ITERLIMIT           10
 205
 206 extern vm_map_t bsd_pageable_map;
 207 extern struct fileops vnops;
 208
 209 #define ROUND_PTR(type, addr)   \
 210         (type *)( ( (uintptr_t)(addr) + 16 - 1) \
 211                   & ~(16 - 1) )
 212
 213 struct image_params;    /* Forward */
 214 static int exec_activate_image(struct image_params *imgp);
 215 static int exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp);
 216 static int load_return_to_errno(load_return_t lrtn);
 217 static int execargs_alloc(struct image_params *imgp);
 218 static int execargs_free(struct image_params *imgp);
 219 static int exec_check_permissions(struct image_params *imgp);
 220 static int exec_extract_strings(struct image_params *imgp);
 221 static int exec_handle_sugid(struct image_params *imgp);
 222 static int sugid_scripts = 0;
 223 SYSCTL_INT (_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW, &sugid_scripts, 0, "");
 224 static kern_return_t create_unix_stack(vm_map_t map, user_addr_t user_stack,
 225                                         int customstack, proc_t p);
 226 static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size);
 227 static void exec_resettextvp(proc_t, struct image_params *);
 228 static int check_for_signature(proc_t, struct image_params *);
 229
 230 /* We don't want this one exported */
 231 __private_extern__
 232 int  open1(vfs_context_t, struct nameidata *, int, struct vnode_attr *, int32_t *);
 233
 234 /*
 235  * exec_add_string
 236  *
 237  * Add the requested string to the string space area.
 238  *
 239  * Parameters;  struct image_params *           image parameter block
 240  *              user_addr_t                     string to add to strings area
 241  *
 242  * Returns:     0                       Success
 243  *              !0                      Failure errno from copyinstr()
 244  *
 245  * Implicit returns:
 246  *              (imgp->ip_strendp)      updated location of next add, if any
 247  *              (imgp->ip_strspace)     updated byte count of space remaining
 248  */
 249 static int
 250 exec_add_string(struct image_params *imgp, user_addr_t str)
 251 {
 252         int error = 0;
 253
 254         do {
 255                 size_t len = 0;
 256                 if (imgp->ip_strspace <= 0) {
 257                         error = E2BIG;
 258                         break;
 259                 }
 260                 if (!UIO_SEG_IS_USER_SPACE(imgp->ip_seg)) {
 261                         char *kstr = CAST_DOWN(char *,str);     /* SAFE */
 262                         error = copystr(kstr, imgp->ip_strendp, imgp->ip_strspace, &len);
 263                 } else  {
 264                         error = copyinstr(str, imgp->ip_strendp, imgp->ip_strspace,
 265                             &len);
 266                 }
 267                 imgp->ip_strendp += len;
 268                 imgp->ip_strspace -= len;
 269         } while (error == ENAMETOOLONG);
 270
 271         return error;
 272 }
 273
 274 /*
 275  * exec_save_path
 276  *
 277  * To support new app package launching for Mac OS X, the dyld needs the
 278  * first argument to execve() stored on the user stack.
 279  *
 280  * Save the executable path name at the top of the strings area and set
 281  * the argument vector pointer to the location following that to indicate
 282  * the start of the argument and environment tuples, setting the remaining
 283  * string space count to the size of the string area minus the path length
 284  * and a reserve for two pointers.
 285  *
 286  * Parameters;  struct image_params *           image parameter block
 287  *              char *                          path used to invoke program
 288  *              int                             segment from which path comes
 289  *
 290  * Returns:     int                     0       Success
 291  *              EFAULT                          Bad address
 292  *      copy[in]str:EFAULT                      Bad address
 293  *      copy[in]str:ENAMETOOLONG                Filename too long
 294  *
 295  * Implicit returns:
 296  *              (imgp->ip_strings)              saved path
 297  *              (imgp->ip_strspace)             space remaining in ip_strings
 298  *              (imgp->ip_argv)                 beginning of argument list
 299  *              (imgp->ip_strendp)              start of remaining copy area
 300  *
 301  * Note:        We have to do this before the initial namei() since in the
 302  *              path contains symbolic links, namei() will overwrite the
 303  *              original path buffer contents.  If the last symbolic link
 304  *              resolved was a relative pathname, we would lose the original
 305  *              "path", which could be an absolute pathname. This might be
 306  *              unacceptable for dyld.
 307  */
 308 static int
 309 exec_save_path(struct image_params *imgp, user_addr_t path, int seg)
 310 {
 311         int error;
 312         size_t  len;
 313         char *kpath = CAST_DOWN(char *,path);   /* SAFE */
 314
 315         imgp->ip_strendp = imgp->ip_strings;
 316         imgp->ip_strspace = SIZE_IMG_STRSPACE;
 317
 318         len = MIN(MAXPATHLEN, imgp->ip_strspace);
 319
 320         switch(seg) {
 321         case UIO_USERSPACE32:
 322         case UIO_USERSPACE64:   /* Same for copyin()... */
 323                 error = copyinstr(path, imgp->ip_strings, len, &len);
 324                 break;
 325         case UIO_SYSSPACE:
 326                 error = copystr(kpath, imgp->ip_strings, len, &len);
 327                 break;
 328         default:
 329                 error = EFAULT;
 330                 break;
 331         }
 332
 333         if (!error) {
 334                 imgp->ip_strendp += len;
 335                 imgp->ip_strspace -= len;
 336                 imgp->ip_argv = imgp->ip_strendp;
 337         }
 338
 339         return(error);
 340 }
 341
 342 #ifdef IMGPF_POWERPC
 343 /*
 344  * exec_powerpc32_imgact
 345  *
 346  * Implicitly invoke the PowerPC handler for a byte-swapped image magic
 347  * number.  This may happen either as a result of an attempt to invoke a
 348  * PowerPC image directly, or indirectly as the interpreter used in an
 349  * interpreter script.
 350  *
 351  * Parameters;  struct image_params *   image parameter block
 352  *
 353  * Returns:     -1              not an PowerPC image (keep looking)
 354  *              -3              Success: exec_archhandler_ppc: relookup
 355  *              >0              Failure: exec_archhandler_ppc: error number
 356  *
 357  * Note:        This image activator does not handle the case of a direct
 358  *              invocation of the exec_archhandler_ppc, since in that case, the
 359  *              exec_archhandler_ppc itself is not a PowerPC binary; instead,
 360  *              binary image activators must recognize the exec_archhandler_ppc;
 361  *              This is managed in exec_check_permissions().
 362  *
 363  * Note:        This image activator is limited to 32 bit powerpc images;
 364  *              if support for 64 bit powerpc images is desired, it would
 365  *              be more in line with this design to write a separate 64 bit
 366  *              image activator.
 367  */
 368 static int
 369 exec_powerpc32_imgact(struct image_params *imgp)
 370 {
 371         struct mach_header *mach_header = (struct mach_header *)imgp->ip_vdata;
 372         int error;
 373         size_t len = 0;
 374
 375         /*
 376          * Make sure it's a PowerPC binary.  If we've already redirected
 377          * from an interpreted file once, don't do it again.
 378          */
 379         if (mach_header->magic != MH_CIGAM) {
 380                 /*
 381                  * If it's a cross-architecture 64 bit binary, then claim
 382                  * it, but refuse to run it.
 383                  */
 384                 if (mach_header->magic == MH_CIGAM_64)
 385                         return (EBADARCH);
 386                 return (-1);
 387         }
 388
 389         /* If there is no exec_archhandler_ppc, we can't run it */
 390         if (exec_archhandler_ppc.path[0] == 0)
 391                 return (EBADARCH);
 392
 393         /* Remember the type of the original file for later grading */
 394         if (!imgp->ip_origcputype) {
 395                 imgp->ip_origcputype =
 396                         OSSwapBigToHostInt32(mach_header->cputype);
 397                 imgp->ip_origcpusubtype =
 398                         OSSwapBigToHostInt32(mach_header->cpusubtype);
 399         }
 400
 401         /*
 402          * The PowerPC flag will be set by the exec_check_permissions()
 403          * call anyway; however, we set this flag here so that the relookup
 404          * in execve() does not follow symbolic links, as a side effect.
 405          */
 406         imgp->ip_flags |= IMGPF_POWERPC;
 407
 408         /* impute an interpreter */
 409         error = copystr(exec_archhandler_ppc.path, imgp->ip_interp_name,
 410                         IMG_SHSIZE, &len);
 411         if (error)
 412                 return (error);
 413
 414         /*
 415          * provide a replacement string for p->p_comm; we have to use an
 416          * alternate buffer for this, rather than replacing it directly,
 417          * since the exec may fail and return to the parent.  In that case,
 418          * we would have erroneously changed the parent p->p_comm instead.
 419          */
 420         strlcpy(imgp->ip_p_comm, imgp->ip_ndp->ni_cnd.cn_nameptr, MAXCOMLEN+1);
 421                                                 /* +1 to allow MAXCOMLEN characters to be copied */
 422
 423         return (-3);
 424 }
 425 #endif  /* IMGPF_POWERPC */
 426
 427
 428 /*
 429  * exec_shell_imgact
 430  *
 431  * Image activator for interpreter scripts.  If the image begins with the
 432  * characters "#!", then it is an interpreter script.  Verify that we are
 433  * not already executing in PowerPC mode, and that the length of the script
 434  * line indicating the interpreter is not in excess of the maximum allowed
 435  * size.  If this is the case, then break out the arguments, if any, which
 436  * are separated by white space, and copy them into the argument save area
 437  * as if they were provided on the command line before all other arguments.
 438  * The line ends when we encounter a comment character ('#') or newline.
 439  *
 440  * Parameters;  struct image_params *   image parameter block
 441  *
 442  * Returns:     -1                      not an interpreter (keep looking)
 443  *              -3                      Success: interpreter: relookup
 444  *              >0                      Failure: interpreter: error number
 445  *
 446  * A return value other than -1 indicates subsequent image activators should
 447  * not be given the opportunity to attempt to activate the image.
 448  */
 449 static int
 450 exec_shell_imgact(struct image_params *imgp)
 451 {
 452         char *vdata = imgp->ip_vdata;
 453         char *ihp;
 454         char *line_endp;
 455         char *interp;
 456         char temp[16];
 457         proc_t p;
 458         struct fileproc *fp;
 459         int fd;
 460         int error;
 461         size_t len;
 462
 463         /*
 464          * Make sure it's a shell script.  If we've already redirected
 465          * from an interpreted file once, don't do it again.
 466          *
 467          * Note: We disallow PowerPC, since the expectation is that we
 468          * may run a PowerPC interpreter, but not an interpret a PowerPC
 469          * image.  This is consistent with historical behaviour.
 470          */
 471         if (vdata[0] != '#' ||
 472             vdata[1] != '!' ||
 473             (imgp->ip_flags & IMGPF_INTERPRET) != 0) {
 474                 return (-1);
 475         }
 476
 477 #ifdef IMGPF_POWERPC
 478         if ((imgp->ip_flags & IMGPF_POWERPC) != 0)
 479                   return (EBADARCH);
 480 #endif  /* IMGPF_POWERPC */
 481
 482         imgp->ip_flags |= IMGPF_INTERPRET;
 483
 484         /* Check to see if SUGID scripts are permitted.  If they aren't then
 485          * clear the SUGID bits.
 486          * imgp->ip_vattr is known to be valid.
 487          */
 488         if (sugid_scripts == 0) {
 489            imgp->ip_origvattr->va_mode &= ~(VSUID | VSGID);
 490         }
 491
 492         /* Find the nominal end of the interpreter line */
 493         for( ihp = &vdata[2]; *ihp != '\n' && *ihp != '#'; ihp++) {
 494                 if (ihp >= &vdata[IMG_SHSIZE])
 495                         return (ENOEXEC);
 496         }
 497
 498         line_endp = ihp;
 499         ihp = &vdata[2];
 500         /* Skip over leading spaces - until the interpreter name */
 501         while ( ihp < line_endp && ((*ihp == ' ') || (*ihp == '\t')))
 502                 ihp++;
 503
 504         /*
 505          * Find the last non-whitespace character before the end of line or
 506          * the beginning of a comment; this is our new end of line.
 507          */
 508         for (;line_endp > ihp && ((*line_endp == ' ') || (*line_endp == '\t')); line_endp--)
 509                 continue;
 510
 511         /* Empty? */
 512         if (line_endp == ihp)
 513                 return (ENOEXEC);
 514
 515         /* copy the interpreter name */
 516         interp = imgp->ip_interp_name;
 517         while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t'))
 518                 *interp++ = *ihp++;
 519         *interp = '\0';
 520
 521         exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_name),
 522                                                         UIO_SYSSPACE);
 523
 524         ihp = &vdata[2];
 525         while (ihp < line_endp) {
 526                 /* Skip leading whitespace before each argument */
 527                 while ((*ihp == ' ') || (*ihp == '\t'))
 528                         ihp++;
 529
 530                 if (ihp >= line_endp)
 531                         break;
 532
 533                 /* We have an argument; copy it */
 534                 while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) {
 535                         *imgp->ip_strendp++ = *ihp++;
 536                         imgp->ip_strspace--;
 537                 }
 538                 *imgp->ip_strendp++ = 0;
 539                 imgp->ip_strspace--;
 540                 imgp->ip_argc++;
 541         }
 542
 543         /*
 544          * If we have a SUID oder SGID script, create a file descriptor
 545          * from the vnode and pass /dev/fd/%d instead of the actual
 546          * path name so that the script does not get opened twice
 547          */
 548         if (imgp->ip_origvattr->va_mode & (VSUID | VSGID)) {
 549                 p = vfs_context_proc(imgp->ip_vfs_context);
 550                 error = falloc(p, &fp, &fd, imgp->ip_vfs_context);
 551                 if (error)
 552                         return(error);
 553
 554                 fp->f_fglob->fg_flag = FREAD;
 555                 fp->f_fglob->fg_type = DTYPE_VNODE;
 556                 fp->f_fglob->fg_ops = &vnops;
 557                 fp->f_fglob->fg_data = (caddr_t)imgp->ip_vp;
 558
 559                 proc_fdlock(p);
 560                 procfdtbl_releasefd(p, fd, NULL);
 561                 fp_drop(p, fd, fp, 1);
 562                 proc_fdunlock(p);
 563                 vnode_ref(imgp->ip_vp);
 564
 565                 snprintf(temp, sizeof(temp), "/dev/fd/%d", fd);
 566                 error = copyoutstr(temp, imgp->ip_user_fname, sizeof(temp), &len);
 567                 if (error)
 568                         return(error);
 569         }
 570
 571         return (-3);
 572 }
 573
 574
 575
 576 /*
 577  * exec_fat_imgact
 578  *
 579  * Image activator for fat 1.0 binaries.  If the binary is fat, then we
 580  * need to select an image from it internally, and make that the image
 581  * we are going to attempt to execute.  At present, this consists of
 582  * reloading the first page for the image with a first page from the
 583  * offset location indicated by the fat header.
 584  *
 585  * Parameters;  struct image_params *   image parameter block
 586  *
 587  * Returns:     -1                      not a fat binary (keep looking)
 588  *              -2                      Success: encapsulated binary: reread
 589  *              >0                      Failure: error number
 590  *
 591  * Important:   This image activator is byte order neutral.
 592  *
 593  * Note:        A return value other than -1 indicates subsequent image
 594  *              activators should not be given the opportunity to attempt
 595  *              to activate the image.
 596  *
 597  *              If we find an encapsulated binary, we make no assertions
 598  *              about its  validity; instead, we leave that up to a rescan
 599  *              for an activator to claim it, and, if it is claimed by one,
 600  *              that activator is responsible for determining validity.
 601  */
 602 static int
 603 exec_fat_imgact(struct image_params *imgp)
 604 {
 605         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
 606         kauth_cred_t cred = kauth_cred_proc_ref(p);
 607         struct fat_header *fat_header = (struct fat_header *)imgp->ip_vdata;
 608         struct _posix_spawnattr *psa = NULL;
 609         struct fat_arch fat_arch;
 610         int resid, error;
 611         load_return_t lret;
 612
 613         /* Make sure it's a fat binary */
 614         if ((fat_header->magic != FAT_MAGIC) &&
 615             (fat_header->magic != FAT_CIGAM)) {
 616                 error = -1;
 617                 goto bad;
 618         }
 619
 620         /* If posix_spawn binprefs exist, respect those prefs. */
 621         psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
 622         if (psa != NULL && psa->psa_binprefs[0] != 0) {
 623                 struct fat_arch *arches = (struct fat_arch *) (fat_header + 1);
 624                 int nfat_arch = 0, pr = 0, f = 0;
 625
 626                 nfat_arch = OSSwapBigToHostInt32(fat_header->nfat_arch);
 627                 /* Check each preference listed against all arches in header */
 628                 for (pr = 0; pr < NBINPREFS; pr++) {
 629                         cpu_type_t pref = psa->psa_binprefs[pr];
 630                         if (pref == 0) {
 631                                 /* No suitable arch in the pref list */
 632                                 error = EBADARCH;
 633                                 goto bad;
 634                         }
 635
 636                         if (pref == CPU_TYPE_ANY) {
 637                                 /* Fall through to regular grading */
 638                                 break;
 639                         }
 640
 641                         for (f = 0; f < nfat_arch; f++) {
 642                                 cpu_type_t archtype = OSSwapBigToHostInt32(
 643                                                 arches[f].cputype);
 644                                 cpu_type_t archsubtype = OSSwapBigToHostInt32(
 645                                                 arches[f].cpusubtype) & ~CPU_SUBTYPE_MASK;
 646                                 if (pref == archtype &&
 647                                         grade_binary(archtype, archsubtype)) {
 648                                         /* We have a winner! */
 649                                         fat_arch.cputype = archtype;
 650                                         fat_arch.cpusubtype = archsubtype;
 651                                         fat_arch.offset = OSSwapBigToHostInt32(
 652                                                         arches[f].offset);
 653                                         fat_arch.size = OSSwapBigToHostInt32(
 654                                                         arches[f].size);
 655                                         fat_arch.align = OSSwapBigToHostInt32(
 656                                                         arches[f].align);
 657                                         goto use_arch;
 658                                 }
 659                         }
 660                 }
 661         }
 662
 663         /* Look up our preferred architecture in the fat file. */
 664         lret = fatfile_getarch_affinity(imgp->ip_vp,
 665                                         (vm_offset_t)fat_header,
 666                                         &fat_arch,
 667                                         (p->p_flag & P_AFFINITY));
 668         if (lret != LOAD_SUCCESS) {
 669                 error = load_return_to_errno(lret);
 670                 goto bad;
 671         }
 672
 673 use_arch:
 674         /* Read the Mach-O header out of fat_arch */
 675         error = vn_rdwr(UIO_READ, imgp->ip_vp, imgp->ip_vdata,
 676                         PAGE_SIZE, fat_arch.offset,
 677                         UIO_SYSSPACE, (IO_UNIT|IO_NODELOCKED),
 678                         cred, &resid, p);
 679         if (error) {
 680                 goto bad;
 681         }
 682
 683         /* Did we read a complete header? */
 684         if (resid) {
 685                 error = EBADEXEC;
 686                 goto bad;
 687         }
 688
 689         /* Success.  Indicate we have identified an encapsulated binary */
 690         error = -2;
 691         imgp->ip_arch_offset = (user_size_t)fat_arch.offset;
 692         imgp->ip_arch_size = (user_size_t)fat_arch.size;
 693
 694 bad:
 695         kauth_cred_unref(&cred);
 696         return (error);
 697 }
 698
 699 /*
 700  * exec_mach_imgact
 701  *
 702  * Image activator for mach-o 1.0 binaries.
 703  *
 704  * Parameters;  struct image_params *   image parameter block
 705  *
 706  *              -1                      not a Mach-O executable (keep looking)
 707  *              -2                      Success: encapsulated binary: reread
 708  *              >0                      Failure: error number
 709  *              EBADARCH                Mach-o binary, but with an unrecognized
 710  *                                      architecture
 711  *              ENOMEM                  No memory for child process after -
 712  *                                      can only happen after vfork()
 713  *
 714  * Important:   This image activator is NOT byte order neutral.
 715  *
 716  * Note:        A return value other than -1 indicates subsequent image
 717  *              activators should not be given the opportunity to attempt
 718  *              to activate the image.
 719  *
 720  * TODO:        More gracefully handle failures after vfork
 721  */
 722 static int
 723 exec_mach_imgact(struct image_params *imgp)
 724 {
 725         struct mach_header *mach_header = (struct mach_header *)imgp->ip_vdata;
 726         proc_t                  p = vfs_context_proc(imgp->ip_vfs_context);
 727         int                     error = 0;
 728         int                     vfexec = 0;
 729         task_t                  task;
 730         task_t                  new_task = NULL; /* protected by vfexec */
 731         thread_t                thread;
 732         struct uthread          *uthread;
 733         vm_map_t old_map = VM_MAP_NULL;
 734         vm_map_t map;
 735         load_return_t           lret;
 736         load_result_t           load_result;
 737         struct _posix_spawnattr *psa = NULL;
 738         int spawn = (imgp->ip_flags & IMGPF_SPAWN);
 739
 740         /*
 741          * make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference
 742          * is a reserved field on the end, so for the most part, we can
 743          * treat them as if they were identical.
 744          */
 745         if ((mach_header->magic != MH_MAGIC) &&
 746             (mach_header->magic != MH_MAGIC_64)) {
 747                 error = -1;
 748                 goto bad;
 749         }
 750
 751         switch (mach_header->filetype) {
 752         case MH_DYLIB:
 753         case MH_BUNDLE:
 754                 error = -1;
 755                 goto bad;
 756         }
 757
 758         if (!imgp->ip_origcputype) {
 759                 imgp->ip_origcputype = mach_header->cputype;
 760                 imgp->ip_origcpusubtype = mach_header->cpusubtype;
 761         }
 762
 763         task = current_task();
 764         thread = current_thread();
 765         uthread = get_bsdthread_info(thread);
 766
 767         /*
 768          * Save off the vfexec state up front; we have to do this, because
 769          * we need to know if we were in this state initally subsequent to
 770          * creating the backing task, thread, and uthread for the child
 771          * process (from the vfs_context_t from in img_parms).
 772          */
 773         if (uthread->uu_flag & UT_VFORK)
 774                 vfexec = 1;      /* Mark in exec */
 775
 776         if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64)
 777                 imgp->ip_flags |= IMGPF_IS_64BIT;
 778
 779         /* If posix_spawn binprefs exist, respect those prefs. */
 780         psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
 781         if (psa != NULL && psa->psa_binprefs[0] != 0) {
 782                 int pr = 0;
 783                 for (pr = 0; pr < NBINPREFS; pr++) {
 784                         cpu_type_t pref = psa->psa_binprefs[pr];
 785                         if (pref == 0) {
 786                                 /* No suitable arch in the pref list */
 787                                 error = EBADARCH;
 788                                 goto bad;
 789                         }
 790
 791                         if (pref == CPU_TYPE_ANY) {
 792                                 /* Jump to regular grading */
 793                                 goto grade;
 794                         }
 795
 796                         if (pref == imgp->ip_origcputype) {
 797                                 /* We have a match! */
 798                                 goto grade;
 799                         }
 800                 }
 801                 error = EBADARCH;
 802                 goto bad;
 803         }
 804 grade:
 805         if (!grade_binary(imgp->ip_origcputype & ~CPU_SUBTYPE_LIB64,
 806                                 imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK)) {
 807                 error = EBADARCH;
 808                 goto bad;
 809         }
 810
 811         /* Copy in arguments/environment from the old process */
 812         error = exec_extract_strings(imgp);
 813         if (error)
 814                 goto bad;
 815
 816         AUDIT_ARG(argv, imgp->ip_argv, imgp->ip_argc,
 817             imgp->ip_strendargvp - imgp->ip_argv);
 818         AUDIT_ARG(envv, imgp->ip_strendargvp, imgp->ip_envc,
 819             imgp->ip_strendp - imgp->ip_strendargvp);
 820
 821         /*
 822          * Hack for binary compatability; put three NULs on the end of the
 823          * string area, and round it up to the next word boundary.  This
 824          * ensures padding with NULs to the boundary.
 825          */
 826         imgp->ip_strendp[0] = 0;
 827         imgp->ip_strendp[1] = 0;
 828         imgp->ip_strendp[2] = 0;
 829         imgp->ip_strendp += (((imgp->ip_strendp - imgp->ip_strings) + NBPW-1) & ~(NBPW-1));
 830
 831 #ifdef IMGPF_POWERPC
 832         /*
 833          * XXX
 834          *
 835          * Should be factored out; this is here because we might be getting
 836          * invoked this way as the result of a shell script, and the check
 837          * in exec_check_permissions() is not interior to the jump back up
 838          * to the "encapsulated_binary:" label in exec_activate_image().
 839          */
 840         if (imgp->ip_vattr->va_fsid == exec_archhandler_ppc.fsid &&
 841                 imgp->ip_vattr->va_fileid == (uint64_t)((u_long)exec_archhandler_ppc.fileid)) {
 842                 imgp->ip_flags |= IMGPF_POWERPC;
 843         }
 844 #endif  /* IMGPF_POWERPC */
 845
 846         /*
 847          * We are being called to activate an image subsequent to a vfork()
 848          * operation; in this case, we know that our task, thread, and
 849          * uthread are actualy those of our parent, and our proc, which we
 850          * obtained indirectly from the image_params vfs_context_t, is the
 851          * new child process.
 852          */
 853         if (vfexec || spawn) {
 854                 if (vfexec) {
 855                         imgp->ip_new_thread = fork_create_child(task, p, FALSE, (imgp->ip_flags & IMGPF_IS_64BIT));
 856                         if (imgp->ip_new_thread == NULL) {
 857                                 error = ENOMEM;
 858                                 goto bad;
 859                         }
 860                 }
 861
 862                 /* reset local idea of thread, uthread, task */
 863                 thread = imgp->ip_new_thread;
 864                 uthread = get_bsdthread_info(thread);
 865                 task = new_task = get_threadtask(thread);
 866                 map = get_task_map(task);
 867         } else {
 868                 map = VM_MAP_NULL;
 869         }
 870
 871         /*
 872          * We set these flags here; this is OK, since if we fail after
 873          * this point, we have already destroyed the parent process anyway.
 874          */
 875         task_set_dyld_info(task, MACH_VM_MIN_ADDRESS, 0);
 876         if (imgp->ip_flags & IMGPF_IS_64BIT) {
 877                 task_set_64bit(task, TRUE);
 878                 OSBitOrAtomic(P_LP64, &p->p_flag);
 879         } else {
 880                 task_set_64bit(task, FALSE);
 881                 OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag);
 882         }
 883
 884         /*
 885          *      Load the Mach-O file.
 886          *
 887          * NOTE: An error after this point  indicates we have potentially
 888          * destroyed or overwrote some process state while attempting an
 889          * execve() following a vfork(), which is an unrecoverable condition.
 890          */
 891
 892         /*
 893          * Actually load the image file we previously decided to load.
 894          */
 895         lret = load_machfile(imgp, mach_header, thread, map, &load_result);
 896
 897         if (lret != LOAD_SUCCESS) {
 898                 error = load_return_to_errno(lret);
 899                 goto badtoolate;
 900         }
 901
 902         vm_map_set_user_wire_limit(get_task_map(task), p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
 903
 904         /*
 905          * Set code-signing flags if this binary is signed, or if parent has
 906          * requested them on exec.
 907          */
 908         if (load_result.csflags & CS_VALID) {
 909                 imgp->ip_csflags |= load_result.csflags &
 910                         (CS_VALID|
 911                          CS_HARD|CS_KILL|CS_EXEC_SET_HARD|CS_EXEC_SET_KILL);
 912         } else {
 913                 imgp->ip_csflags &= ~CS_VALID;
 914         }
 915
 916         if (p->p_csflags & CS_EXEC_SET_HARD)
 917                 imgp->ip_csflags |= CS_HARD;
 918         if (p->p_csflags & CS_EXEC_SET_KILL)
 919                 imgp->ip_csflags |= CS_KILL;
 920
 921
 922         /*
 923          * Set up the system reserved areas in the new address space.
 924          */
 925         vm_map_exec(get_task_map(task),
 926                     task,
 927                     (void *) p->p_fd->fd_rdir,
 928 #ifdef IMGPF_POWERPC
 929                     imgp->ip_flags & IMGPF_POWERPC ?
 930                     CPU_TYPE_POWERPC :
 931 #endif
 932                     cpu_type());
 933
 934         /*
 935          * Close file descriptors
 936          * which specify close-on-exec.
 937          */
 938         fdexec(p);
 939
 940         /*
 941          * deal with set[ug]id.
 942          */
 943         error = exec_handle_sugid(imgp);
 944
 945         /* Make sure we won't interrupt ourself signalling a partial process */
 946         if (!vfexec && !spawn && (p->p_lflag & P_LTRACED))
 947                 psignal(p, SIGTRAP);
 948
 949         if (error) {
 950                 goto badtoolate;
 951         }
 952
 953         if (load_result.unixproc &&
 954                 create_unix_stack(get_task_map(task),
 955                                   load_result.user_stack,
 956                                   load_result.customstack,
 957                                   p) != KERN_SUCCESS) {
 958                 error = load_return_to_errno(LOAD_NOSPACE);
 959                 goto badtoolate;
 960         }
 961
 962         /*
 963          * There is no  continuing workq context during
 964          * vfork exec. So no need to reset then. Otherwise
 965          * clear the workqueue context.
 966          */
 967         if (vfexec == 0 && spawn == 0) {
 968                 (void)workqueue_exit(p);
 969         }
 970         if (vfexec || spawn) {
 971                 old_map = vm_map_switch(get_task_map(task));
 972         }
 973
 974         if (load_result.unixproc) {
 975                 user_addr_t     ap;
 976
 977                 /*
 978                  * Copy the strings area out into the new process address
 979                  * space.
 980                  */
 981                 ap = p->user_stack;
 982                 error = exec_copyout_strings(imgp, &ap);
 983                 if (error) {
 984                         if (vfexec || spawn)
 985                                 vm_map_switch(old_map);
 986                         goto badtoolate;
 987                 }
 988                 /* Set the stack */
 989                 thread_setuserstack(thread, ap);
 990         }
 991
 992         if (load_result.dynlinker) {
 993                 uint64_t        ap;
 994
 995                 /* Adjust the stack */
 996                 if (imgp->ip_flags & IMGPF_IS_64BIT) {
 997                         ap = thread_adjuserstack(thread, -8);
 998                         error = copyoutptr(load_result.mach_header, ap, 8);
 999                 } else {
1000                         ap = thread_adjuserstack(thread, -4);
1001                         error = suword(ap, load_result.mach_header);
1002                 }
1003                 if (error) {
1004                         if (vfexec || spawn)
1005                                 vm_map_switch(old_map);
1006                         goto badtoolate;
1007                 }
1008                 task_set_dyld_info(task, load_result.all_image_info_addr,
1009                     load_result.all_image_info_size);
1010         }
1011
1012         if (vfexec || spawn) {
1013                 vm_map_switch(old_map);
1014         }
1015         /* Set the entry point */
1016         thread_setentrypoint(thread, load_result.entry_point);
1017
1018         /* Stop profiling */
1019         stopprofclock(p);
1020
1021         /*
1022          * Reset signal state.
1023          */
1024         execsigs(p, thread);
1025
1026         /*
1027          * need to cancel async IO requests that can be cancelled and wait for those
1028          * already active.  MAY BLOCK!
1029          */
1030         _aio_exec( p );
1031
1032 #if SYSV_SHM
1033         /* FIXME: Till vmspace inherit is fixed: */
1034         if (!vfexec && p->vm_shm)
1035                 shmexec(p);
1036 #endif
1037 #if SYSV_SEM
1038         /* Clean up the semaphores */
1039         semexit(p);
1040 #endif
1041
1042         /*
1043          * Remember file name for accounting.
1044          */
1045         p->p_acflag &= ~AFORK;
1046         /* If the translated name isn't NULL, then we want to use
1047          * that translated name as the name we show as the "real" name.
1048          * Otherwise, use the name passed into exec.
1049          */
1050         if (0 != imgp->ip_p_comm[0]) {
1051                 bcopy((caddr_t)imgp->ip_p_comm, (caddr_t)p->p_comm,
1052                         sizeof(p->p_comm));
1053         } else {
1054                 if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN)
1055                         imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN;
1056                 bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm,
1057                         (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
1058                 p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';
1059         }
1060
1061         memcpy(&p->p_uuid[0], &load_result.uuid[0], sizeof(p->p_uuid));
1062
1063 // <rdar://6598155> dtrace code cleanup needed
1064 #if CONFIG_DTRACE
1065         /*
1066          * Invalidate any predicate evaluation already cached for this thread by DTrace.
1067          * That's because we've just stored to p_comm and DTrace refers to that when it
1068          * evaluates the "execname" special variable. uid and gid may have changed as well.
1069          */
1070         dtrace_set_thread_predcache(current_thread(), 0);
1071
1072         /*
1073          * Free any outstanding lazy dof entries. It is imperative we
1074          * always call dtrace_lazy_dofs_destroy, rather than null check
1075          * and call if !NULL. If we NULL test, during lazy dof faulting
1076          * we can race with the faulting code and proceed from here to
1077          * beyond the helpers cleanup. The lazy dof faulting will then
1078          * install new helpers which no longer belong to this process!
1079          */
1080         dtrace_lazy_dofs_destroy(p);
1081
1082
1083         /*
1084          * Clean up any DTrace helpers for the process.
1085          */
1086         if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
1087                 (*dtrace_helpers_cleanup)(p);
1088         }
1089
1090         /*
1091          * Cleanup the DTrace provider associated with this process.
1092          */
1093         proc_lock(p);
1094         if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
1095                 (*dtrace_fasttrap_exec_ptr)(p);
1096         }
1097         proc_unlock(p);
1098 #endif
1099
1100         if (kdebug_enable) {
1101                 long dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4;
1102
1103                 /*
1104                  * Collect the pathname for tracing
1105                  */
1106                 kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);
1107
1108                 if (vfexec || spawn) {
1109                         KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
1110                                         p->p_pid ,0,0,0, (uintptr_t)thread_tid(thread));
1111                         KERNEL_DEBUG_CONSTANT1((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
1112                                         dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (uintptr_t)thread_tid(thread));
1113                 } else {
1114                         KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_DATA, 2)) | DBG_FUNC_NONE,
1115                                         p->p_pid ,0,0,0,0);
1116                         KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_STRING, 2)) | DBG_FUNC_NONE,
1117                                         dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
1118                 }
1119         }
1120
1121 #ifdef IMGPF_POWERPC
1122         /*
1123          * Mark the process as powerpc or not.  If powerpc, set the affinity
1124          * flag, which will be used for grading binaries in future exec's
1125          * from the process.
1126          */
1127         if (((imgp->ip_flags & IMGPF_POWERPC) != 0))
1128                 OSBitOrAtomic(P_TRANSLATED, &p->p_flag);
1129         else
1130 #endif  /* IMGPF_POWERPC */
1131                 OSBitAndAtomic(~((uint32_t)P_TRANSLATED), &p->p_flag);
1132         OSBitAndAtomic(~((uint32_t)P_AFFINITY), &p->p_flag);
1133
1134         /*
1135          * If posix_spawned with the START_SUSPENDED flag, stop the
1136          * process before it runs.
1137          */
1138         if (imgp->ip_px_sa != NULL) {
1139                 psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
1140                 if (psa->psa_flags & POSIX_SPAWN_START_SUSPENDED) {
1141                         proc_lock(p);
1142                         p->p_stat = SSTOP;
1143                         proc_unlock(p);
1144                         (void) task_suspend(p->task);
1145                 }
1146         }
1147
1148         /*
1149          * mark as execed, wakeup the process that vforked (if any) and tell
1150          * it that it now has its own resources back
1151          */
1152         OSBitOrAtomic(P_EXEC, &p->p_flag);
1153         proc_resetregister(p);
1154         if (p->p_pptr && (p->p_lflag & P_LPPWAIT)) {
1155                 proc_lock(p);
1156                 p->p_lflag &= ~P_LPPWAIT;
1157                 proc_unlock(p);
1158                 wakeup((caddr_t)p->p_pptr);
1159         }
1160
1161         /*
1162          * Pay for our earlier safety; deliver the delayed signals from
1163          * the incomplete vfexec process now that it's complete.
1164          */
1165         if (vfexec && (p->p_lflag & P_LTRACED)) {
1166                 psignal_vfork(p, new_task, thread, SIGTRAP);
1167         }
1168
1169 badtoolate:
1170 if (!spawn)
1171         proc_knote(p, NOTE_EXEC);
1172
1173         if (vfexec || spawn) {
1174                 task_deallocate(new_task);
1175                 thread_deallocate(thread);
1176                 if (error)
1177                         error = 0;
1178         }
1179
1180 bad:
1181         return(error);
1182 }
1183
1184
1185
1186
1187 /*
1188  * Our image activator table; this is the table of the image types we are
1189  * capable of loading.  We list them in order of preference to ensure the
1190  * fastest image load speed.
1191  *
1192  * XXX hardcoded, for now; should use linker sets
1193  */
1194 struct execsw {
1195         int (*ex_imgact)(struct image_params *);
1196         const char *ex_name;
1197 } execsw[] = {
1198         { exec_mach_imgact,             "Mach-o Binary" },
1199         { exec_fat_imgact,              "Fat Binary" },
1200 #ifdef IMGPF_POWERPC
1201         { exec_powerpc32_imgact,        "PowerPC binary" },
1202 #endif  /* IMGPF_POWERPC */
1203         { exec_shell_imgact,            "Interpreter Script" },
1204         { NULL, NULL}
1205 };
1206
1207
1208 /*
1209  * exec_activate_image
1210  *
1211  * Description: Iterate through the available image activators, and activate
1212  *              the image associated with the imgp structure.  Activators
1213  *              are tried in execsw[] order until one returns something
      *              other than -1.  A return of -2 (encapsulated binary)
      *              restarts the activator scan; a return of -3 (interpreter)
      *              looks up imgp->ip_interp_name and starts over with it.
1214  *
1215  * Parameters:  struct image_params *   Image parameter block
1216  *
1217  * Returns:     0                       Success
      *              -1                      No activator claimed the image
      *                                      (returned unchanged to the caller)
1218  *              EBADEXEC                The executable is corrupt/unknown, or
      *                                      the EAI_ITERLIMIT pass limit was hit
      *              ENOMEM                  Script label allocation failed
      *                                      (CONFIG_MACF only)
1219  *      execargs_alloc:EINVAL           Invalid argument
1220  *      execargs_alloc:EACCES           Permission denied
1221  *      execargs_alloc:EINTR            Interrupted function
1222  *      execargs_alloc:ENOMEM           Not enough space
1223  *      exec_save_path:EFAULT           Bad address
1224  *      exec_save_path:ENAMETOOLONG     Filename too long
1225  *      exec_check_permissions:EACCES   Permission denied
1226  *      exec_check_permissions:ENOEXEC  Executable file format error
1227  *      exec_check_permissions:ETXTBSY  Text file busy [misuse of error code]
1228  *      exec_check_permissions:???
1229  *      namei:???
      *      proc_transstart:???
1230  *      vn_rdwr:???                     [anything vn_rdwr can return]
1231  *      <ex_imgact>:???                 [anything an imgact can return]
1232  */
1233 static int
1234 exec_activate_image(struct image_params *imgp)
1235 {
1236         struct nameidata nd;
1237         int error;
1238         int resid;
1239         int once = 1;   /* save SGUID-ness for interpreted files */
1240         int i;
1241         int iterlimit = EAI_ITERLIMIT;
1242         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
1243
1244         error = execargs_alloc(imgp);
1245         if (error)
1246                 goto bad;
1247
1248         /*
1249          * XXXAUDIT: Note: the double copyin introduces an audit
1250          * race.  To correct this race, we must use a single
1251          * copyin(), e.g. by passing a flag to namei to indicate an
1252          * external path buffer is being used.
1253          */
1254         error = exec_save_path(imgp, imgp->ip_user_fname, imgp->ip_seg);
1255         if (error) {
1256                 goto bad_notrans;
1257         }
1258
1259         DTRACE_PROC1(exec, uintptr_t, imgp->ip_strings);
1260
1261         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1,
1262                 imgp->ip_seg, imgp->ip_user_fname, imgp->ip_vfs_context);
1263
             /*
              * Re-entered from the interpreter case below with nd
              * re-initialized to point at the interpreter's path.
              */
1264 again:
1265         error = namei(&nd);
1266         if (error)
1267                 goto bad_notrans;
1268         imgp->ip_ndp = &nd;     /* successful namei(); call nameidone() later */
1269         imgp->ip_vp = nd.ni_vp; /* if set, need to vnode_put() at some point */
1270
             /* Paired with proc_transend() at "bad" or in the interpreter case. */
1271         error = proc_transstart(p, 0);
1272         if (error)
1273                 goto bad_notrans;
1274
1275         error = exec_check_permissions(imgp);
1276         if (error)
1277                 goto bad;
1278
1279         /* Copy; avoid invocation of an interpreter overwriting the original */
1280         if (once) {
1281                 once = 0;
1282                 *imgp->ip_origvattr = *imgp->ip_vattr;
1283         }
1284
             /* Read the first PAGE_SIZE bytes of the file into ip_vdata. */
1285         error = vn_rdwr(UIO_READ, imgp->ip_vp, imgp->ip_vdata, PAGE_SIZE, 0,
1286                         UIO_SYSSPACE, IO_NODELOCKED,
1287                         vfs_context_ucred(imgp->ip_vfs_context),
1288                         &resid, vfs_context_proc(imgp->ip_vfs_context));
1289         if (error)
1290                 goto bad;
1291
1292 encapsulated_binary:
1293         /*
              * Limit the number of iterations we will attempt on each binary.
              * The counter is shared by the -2 and -3 restart paths, since
              * both pass through this label.
              */
1294         if (--iterlimit == 0) {
1295                 error = EBADEXEC;
1296                 goto bad;
1297         }
             /* Try each activator in turn while the result is "not claimed" (-1). */
1298         error = -1;
1299         for(i = 0; error == -1 && execsw[i].ex_imgact != NULL; i++) {
1300
1301                 error = (*execsw[i].ex_imgact)(imgp);
1302
1303                 switch (error) {
1304                 /* case -1: not claimed: continue */
1305                 case -2:                /* Encapsulated binary */
1306                         goto encapsulated_binary;
1307
1308                 case -3:                /* Interpreter */
1309 #if CONFIG_MACF
1310                         /*
1311                          * Copy the script label for later use. Note that
1312                          * the label can be different when the script is
1313                          * actually read by the interpreter.
1314                          */
1315                         if (imgp->ip_scriptlabelp)
1316                                 mac_vnode_label_free(imgp->ip_scriptlabelp);
1317                         imgp->ip_scriptlabelp = mac_vnode_label_alloc();
1318                         if (imgp->ip_scriptlabelp == NULL) {
                                     /*
                                      * Leaves the switch; the loop then ends
                                      * because error is no longer -1.
                                      */
1319                                 error = ENOMEM;
1320                                 break;
1321                         }
1322                         mac_vnode_label_copy(imgp->ip_vp->v_label,
1323                                              imgp->ip_scriptlabelp);
1324 #endif
                             /* Done with the script's vnode; look up the interpreter next. */
1325                         vnode_put(imgp->ip_vp);
1326                         imgp->ip_vp = NULL;     /* already put */
1327
1328                         NDINIT(&nd, LOOKUP, (nd.ni_cnd.cn_flags & HASBUF) | (FOLLOW | LOCKLEAF),
1329                                 UIO_SYSSPACE, CAST_USER_ADDR_T(imgp->ip_interp_name), imgp->ip_vfs_context);
1330
1331 #ifdef IMGPF_POWERPC
1332                         /*
1333                          * PowerPC does not follow symlinks because the
1334                          * code which sets exec_archhandler_ppc.fsid and
1335                          * exec_archhandler_ppc.fileid doesn't follow them.
1336                          */
1337                         if (imgp->ip_flags & IMGPF_POWERPC)
1338                                 nd.ni_cnd.cn_flags &= ~FOLLOW;
1339 #endif  /* IMGPF_POWERPC */
1340
                             /* End this transition; "again" starts a new one. */
1341                         proc_transend(p, 0);
1342                         goto again;
1343
1344                 default:
1345                         break;
1346                 }
1347         }
1348
1349         /*
1350          * Call out to allow 3rd party notification of exec.
1351          * Ignore result of kauth_authorize_fileop call.
1352          */
1353         if (error == 0 && kauth_authorize_fileop_has_listeners()) {
1354                 kauth_authorize_fileop(vfs_context_ucred(imgp->ip_vfs_context),
1355                                         KAUTH_FILEOP_EXEC,
1356                                         (uintptr_t)nd.ni_vp, 0);
1357         }
1358
             /* Reached on success as well as on failure. */
1359 bad:
1360         proc_transend(p, 0);
1361
             /* Exit path that skips proc_transend(). */
1362 bad_notrans:
1363         if (imgp->ip_strings)
1364                 execargs_free(imgp);
1365         if (imgp->ip_ndp)
1366                 nameidone(imgp->ip_ndp);
1367
1368         return (error);
1369 }
1370
1371 /*
1372  * exec_handle_port_actions
1373  *
1374  * Description: Go through the _posix_port_actions_t contents,
1375  *              calling task_set_special_port, task_set_exception_ports
1376  *              and/or audit_session_spawnjoin for the current task.
1377  *
1378  * Parameters:  struct image_params *   Image parameter block
1379  *              short psa_flags         posix spawn attribute flags
1380  *
1381  * Returns:     0                       Success
1382  *              KERN_FAILURE            Failure
1383  *              ENOTSUP                 Illegal posix_spawn attr flag was set
1384  */
1385 static int
1386 exec_handle_port_actions(struct image_params *imgp, short psa_flags)
1387 {
1388         _posix_spawn_port_actions_t pacts = imgp->ip_px_spa;
1389         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
1390         _ps_port_action_t *act = NULL;
1391         task_t task = p->task;
1392         ipc_port_t port = NULL;
1393         kern_return_t ret = KERN_SUCCESS;
1394         int i;
1395
1396         for (i = 0; i < pacts->pspa_count; i++) {
1397                 act = &pacts->pspa_actions[i];
1398
1399                 ret = ipc_object_copyin(get_task_ipcspace(current_task()),
1400                                 CAST_MACH_PORT_TO_NAME(act->new_port),
1401                                 MACH_MSG_TYPE_COPY_SEND,
1402                                 (ipc_object_t *) &port);
1403
1404                 if (ret)
1405                         return ret;
1406
1407                 switch (act->port_type) {
1408                         case PSPA_SPECIAL:
1409                                 /* Only allowed when not under vfork */
1410                                 if (!(psa_flags & POSIX_SPAWN_SETEXEC))
1411                                         return ENOTSUP;
1412                                 ret = task_set_special_port(task,
1413                                                 act->which,
1414                                                 port);
1415                                 break;
1416                         case PSPA_EXCEPTION:
1417                                 /* Only allowed when not under vfork */
1418                                 if (!(psa_flags & POSIX_SPAWN_SETEXEC))
1419                                         return ENOTSUP;
1420                                 ret = task_set_exception_ports(task,
1421                                                 act->mask,
1422                                                 port,
1423                                                 act->behavior,
1424                                                 act->flavor);
1425                                 break;
1426 #if CONFIG_AUDIT
1427                         case PSPA_AU_SESSION:
1428                                 ret = audit_session_spawnjoin(p,
1429                                                 port);
1430                                 break;
1431 #endif
1432                         default:
1433                                 ret = KERN_FAILURE;
1434                 }
1435                 /* action failed, so release port resources */
1436                 if (ret) {
1437                         ipc_port_release_send(port);
1438                         return ret;
1439                 }
1440         }
1441
1442         return ret;
1443 }
1444
1445 /*
1446  * exec_handle_file_actions
1447  *
1448  * Description: Go through the _posix_file_actions_t contents applying the
1449  *              open, close, and dup2 operations to the open file table for
1450  *              the current process.
1451  *
1452  * Parameters:  struct image_params *   Image parameter block
1453  *
1454  * Returns:     0                       Success
1455  *              ???
1456  *
1457  * Note:        Actions are applied in the order specified, with the credential
1458  *              of the parent process.  This is done to permit the parent
1459  *              process to utilize POSIX_SPAWN_RESETIDS to drop privilege in
1460  *              the child following operations the child may in fact not be
1461  *              normally permitted to perform.
1462  */
1463 static int
1464 exec_handle_file_actions(struct image_params *imgp)
1465 {
1466         int error = 0;
1467         int action;
1468         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
1469         _posix_spawn_file_actions_t px_sfap = imgp->ip_px_sfa;
1470         int ival[2];            /* dummy retval for system calls) */
1471
1472         for (action = 0; action < px_sfap->psfa_act_count; action++) {
1473                 _psfa_action_t *psfa = &px_sfap->psfa_act_acts[ action];
1474
1475                 switch(psfa->psfaa_type) {
1476                 case PSFA_OPEN: {
1477                         /*
1478                          * Open is different, in that it requires the use of
1479                          * a path argument, which is normally copied in from
1480                          * user space; because of this, we have to support an
1481                          * open from kernel space that passes an address space
1482                          * context oof UIO_SYSSPACE, and casts the address
1483                          * argument to a user_addr_t.
1484                          */
1485                         struct vnode_attr va;
1486                         struct nameidata nd;
1487                         int mode = psfa->psfaa_openargs.psfao_mode;
1488                         struct dup2_args dup2a;
1489                         struct close_nocancel_args ca;
1490                         int origfd;
1491
1492                         VATTR_INIT(&va);
1493                         /* Mask off all but regular access permissions */
1494                         mode = ((mode &~ p->p_fd->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1495                         VATTR_SET(&va, va_mode, mode & ACCESSPERMS);
1496
1497                         NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, UIO_SYSSPACE,
1498                                CAST_USER_ADDR_T(psfa->psfaa_openargs.psfao_path),
1499                                imgp->ip_vfs_context);
1500
1501                         error = open1(imgp->ip_vfs_context,
1502                                         &nd,
1503                                         psfa->psfaa_openargs.psfao_oflag,
1504                                         &va,
1505                                         ival);
1506
1507                         /*
1508                          * If there's an error, or we get the right fd by
1509                          * accident, then drop out here.  This is easier than
1510                          * rearchitecting all the open code to preallocate fd
1511                          * slots, and internally taking one as an argument.
1512                          */
1513                         if (error || ival[0] == psfa->psfaa_filedes)
1514                                 break;
1515
1516                         origfd = ival[0];
1517                         /*
1518                          * If we didn't fall out from an error, we ended up
1519                          * with the wrong fd; so now we've got to try to dup2
1520                          * it to the right one.
1521                          */
1522                         dup2a.from = origfd;
1523                         dup2a.to = psfa->psfaa_filedes;
1524
1525                         /*
1526                          * The dup2() system call implementation sets
1527                          * ival to newfd in the success case, but we
1528                          * can ignore that, since if we didn't get the
1529                          * fd we wanted, the error will stop us.
1530                          */
1531                         error = dup2(p, &dup2a, ival);
1532                         if (error)
1533                                 break;
1534
1535                         /*
1536                          * Finally, close the original fd.
1537                          */
1538                         ca.fd = origfd;
1539
1540                         error = close_nocancel(p, &ca, ival);
1541                         }
1542                         break;
1543
1544                 case PSFA_DUP2: {
1545                         struct dup2_args dup2a;
1546
1547                         dup2a.from = psfa->psfaa_filedes;
1548                         dup2a.to = psfa->psfaa_openargs.psfao_oflag;
1549
1550                         /*
1551                          * The dup2() system call implementation sets
1552                          * ival to newfd in the success case, but we
1553                          * can ignore that, since if we didn't get the
1554                          * fd we wanted, the error will stop us.
1555                          */
1556                         error = dup2(p, &dup2a, ival);
1557                         }
1558                         break;
1559
1560                 case PSFA_CLOSE: {
1561                         struct close_nocancel_args ca;
1562
1563                         ca.fd = psfa->psfaa_filedes;
1564
1565                         error = close_nocancel(p, &ca, ival);
1566                         }
1567                         break;
1568
1569                 default:
1570                         error = EINVAL;
1571                         break;
1572                 }
1573                 /* All file actions failures are considered fatal, per POSIX */
1574                 if (error)
1575                         break;
1576         }
1577
1578         return (error);
1579 }
1580
1581
1582 /*
1583  * posix_spawn
1584  *
1585  * Parameters:  uap->pid                Pointer to pid return area
1586  *              uap->fname              File name to exec
1587  *              uap->argp               Argument list
1588  *              uap->envp               Environment list
1589  *
1590  * Returns:     0                       Success
1591  *              EINVAL                  Invalid argument
1592  *              ENOTSUP                 Not supported
1593  *              ENOEXEC                 Executable file format error
1594  *      exec_activate_image:EINVAL      Invalid argument
1595  *      exec_activate_image:EACCES      Permission denied
1596  *      exec_activate_image:EINTR       Interrupted function
1597  *      exec_activate_image:ENOMEM      Not enough space
1598  *      exec_activate_image:EFAULT      Bad address
1599  *      exec_activate_image:ENAMETOOLONG        Filename too long
1600  *      exec_activate_image:ENOEXEC     Executable file format error
1601  *      exec_activate_image:ETXTBSY     Text file busy [misuse of error code]
1602  *      exec_activate_image:EBADEXEC    The executable is corrupt/unknown
1603  *      exec_activate_image:???
1604  *      mac_execve_enter:???
1605  *
1606  * TODO:        Expect to need __mac_posix_spawn() at some point...
1607  *              Handle posix_spawnattr_t
1608  *              Handle posix_spawn_file_actions_t
1609  */
1610 int
1611 posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
1612 {
1613         proc_t p = ap;          /* quiet bogus GCC vfork() warning */
1614         user_addr_t pid = uap->pid;
1615         int ival[2];            /* dummy retval for setpgid() */
1616         char *bufp = NULL;
1617         struct image_params *imgp;
1618         struct vnode_attr *vap;
1619         struct vnode_attr *origvap;
1620         struct uthread  *uthread = 0;   /* compiler complains if not set to 0*/
1621         int error, sig;
1622         char alt_p_comm[sizeof(p->p_comm)] = {0};       /* for PowerPC */
1623         int is_64 = IS_64BIT_PROCESS(p);
1624         struct vfs_context context;
1625         struct user__posix_spawn_args_desc px_args;
1626         struct _posix_spawnattr px_sa;
1627         _posix_spawn_file_actions_t px_sfap = NULL;
1628         _posix_spawn_port_actions_t px_spap = NULL;
1629         struct __kern_sigaction vec;
1630         boolean_t spawn_no_exec = FALSE;
1631
1632         /*
1633          * Allocate a big chunk for locals instead of using stack since these
1634          * structures a pretty big.
1635          */
1636         MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO);
1637         imgp = (struct image_params *) bufp;
1638         if (bufp == NULL) {
1639                 error = ENOMEM;
1640                 goto bad;
1641         }
1642         vap = (struct vnode_attr *) (bufp + sizeof(*imgp));
1643         origvap = (struct vnode_attr *) (bufp + sizeof(*imgp) + sizeof(*vap));
1644
1645         /* Initialize the common data in the image_params structure */
1646         imgp->ip_user_fname = uap->path;
1647         imgp->ip_user_argv = uap->argv;
1648         imgp->ip_user_envv = uap->envp;
1649         imgp->ip_vattr = vap;
1650         imgp->ip_origvattr = origvap;
1651         imgp->ip_vfs_context = &context;
1652         imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE);
1653         imgp->ip_p_comm = alt_p_comm;           /* for PowerPC */
1654         imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
1655
1656         if (uap->adesc != USER_ADDR_NULL) {
1657                 if(is_64) {
1658                         error = copyin(uap->adesc, &px_args, sizeof(px_args));
1659                 } else {
1660                         struct user32__posix_spawn_args_desc px_args32;
1661
1662                         error = copyin(uap->adesc, &px_args32, sizeof(px_args32));
1663
1664                         /*
1665                          * Convert arguments descriptor from external 32 bit
1666                          * representation to internal 64 bit representation
1667                          */
1668                         px_args.attr_size = px_args32.attr_size;
1669                         px_args.attrp = CAST_USER_ADDR_T(px_args32.attrp);
1670                         px_args.file_actions_size = px_args32.file_actions_size;
1671                         px_args.file_actions = CAST_USER_ADDR_T(px_args32.file_actions);
1672                         px_args.port_actions_size = px_args32.port_actions_size;
1673                         px_args.port_actions = CAST_USER_ADDR_T(px_args32.port_actions);
1674                 }
1675                 if (error)
1676                         goto bad;
1677
1678                 if (px_args.attr_size != 0) {
1679                         /*
1680                          * This could lose some of the port_actions pointer,
1681                          * but we already have it from px_args.
1682                          */
1683                         if ((error = copyin(px_args.attrp, &px_sa, sizeof(px_sa))) != 0)
1684                         goto bad;
1685
1686                         imgp->ip_px_sa = &px_sa;
1687                 }
1688                 if (px_args.file_actions_size != 0) {
1689                         /* Limit file_actions to allowed number of open files */
1690                         int maxfa = (p->p_limit ? p->p_rlimit[RLIMIT_NOFILE].rlim_cur : NOFILE);
1691                         if (px_args.file_actions_size < PSF_ACTIONS_SIZE(1) ||
1692                                 px_args.file_actions_size > PSF_ACTIONS_SIZE(maxfa)) {
1693                                 error = EINVAL;
1694                                 goto bad;
1695                         }
1696                         MALLOC(px_sfap, _posix_spawn_file_actions_t, px_args.file_actions_size, M_TEMP, M_WAITOK);
1697                         if (px_sfap == NULL) {
1698                                 error = ENOMEM;
1699                                 goto bad;
1700                         }
1701                         imgp->ip_px_sfa = px_sfap;
1702
1703                         if ((error = copyin(px_args.file_actions, px_sfap,
1704                                                         px_args.file_actions_size)) != 0)
1705                                 goto bad;
1706                 }
1707                 if (px_args.port_actions_size != 0) {
1708                         /* Limit port_actions to one page of data */
1709                         if (px_args.port_actions_size < PS_PORT_ACTIONS_SIZE(1) ||
1710                                 px_args.port_actions_size > PAGE_SIZE) {
1711                                 error = EINVAL;
1712                                 goto bad;
1713                         }
1714
1715                         MALLOC(px_spap, _posix_spawn_port_actions_t,
1716                                         px_args.port_actions_size, M_TEMP, M_WAITOK);
1717                         if (px_spap == NULL) {
1718                                 error = ENOMEM;
1719                                 goto bad;
1720                         }
1721                         imgp->ip_px_spa = px_spap;
1722
1723                         if ((error = copyin(px_args.port_actions, px_spap,
1724                                                         px_args.port_actions_size)) != 0)
1725                                 goto bad;
1726                 }
1727         }
1728
1729         /* set uthread to parent */
1730         uthread = get_bsdthread_info(current_thread());
1731
1732         /*
1733          * <rdar://6640530>; this does not result in a behaviour change
1734          * relative to Leopard, so there should not be any existing code
1735          * which depends on it.
1736          */
1737         if (uthread->uu_flag & UT_VFORK) {
1738             error = EINVAL;
1739             goto bad;
1740         }
1741
1742         /*
1743          * If we don't have the extention flag that turns "posix_spawn()"
1744          * into "execve() with options", then we will be creating a new
1745          * process which does not inherit memory from the parent process,
1746          * which is one of the most expensive things about using fork()
1747          * and execve().
1748          */
1749         if (imgp->ip_px_sa == NULL || !(px_sa.psa_flags & POSIX_SPAWN_SETEXEC)){
1750                 if ((error = fork1(p, &imgp->ip_new_thread, PROC_CREATE_SPAWN)) != 0)
1751                         goto bad;
1752                 imgp->ip_flags |= IMGPF_SPAWN;  /* spawn w/o exec */
1753                 spawn_no_exec = TRUE;           /* used in later tests */
1754         }
1755
1756         if (spawn_no_exec)
1757                 p = (proc_t)get_bsdthreadtask_info(imgp->ip_new_thread);
1758
1759
1760         /* By default, the thread everyone plays with is the parent */
1761         context.vc_thread = current_thread();
1762         context.vc_ucred = p->p_ucred;  /* XXX must NOT be kauth_cred_get() */
1763
1764         /*
1765          * However, if we're not in the setexec case, redirect the context
1766          * to the newly created process instead
1767          */
1768         if (spawn_no_exec)
1769                 context.vc_thread = imgp->ip_new_thread;
1770
1771
1772         /*
1773          * Post fdcopy(), pre exec_handle_sugid() - this is where we want
1774          * to handle the file_actions.  Since vfork() also ends up setting
1775          * us into the parent process group, and saved off the signal flags,
1776          * this is also where we want to handle the spawn flags.
1777          */
1778         /* Has spawn file actions? */
1779         if (imgp->ip_px_sfa != NULL &&
1780             (error = exec_handle_file_actions(imgp)) != 0) {
1781                 goto bad;
1782         }
1783
1784         /* Has spawn port actions? */
1785         if (imgp->ip_px_spa != NULL) {
1786                 /*
1787                  * The check for the POSIX_SPAWN_SETEXEC flag is done in
1788                  * exec_handle_port_actions().
1789                  */
1790                 if((error = exec_handle_port_actions(imgp, px_sa.psa_flags)) != 0)
1791                         goto bad;
1792         }
1793
1794         /* Has spawn attr? */
1795         if (imgp->ip_px_sa != NULL) {
1796                 /*
1797                  * Set the process group ID of the child process; this has
1798                  * to happen before the image activation.
1799                  */
1800                 if (px_sa.psa_flags & POSIX_SPAWN_SETPGROUP) {
1801                         struct setpgid_args spga;
1802                         spga.pid = p->p_pid;
1803                         spga.pgid = px_sa.psa_pgroup;
1804                         /*
1805                          * Effectively, call setpgid() system call; works
1806                          * because there are no pointer arguments.
1807                          */
1808                         if((error = setpgid(p, &spga, ival)) != 0)
1809                                 goto bad;
1810                 }
1811
1812                 /*
1813                  * Reset UID/GID to parent's RUID/RGID; This works only
1814                  * because the operation occurs *after* the vfork() and
1815                  * before the call to exec_handle_sugid() by the image
1816                  * activator called from exec_activate_image().  POSIX
1817                  * requires that any setuid/setgid bits on the process
1818                  * image will take precedence over the spawn attributes
1819                  * (re)setting them.
1820                  *
1821                  * The use of p_ucred is safe, since we are acting on the
1822                  * new process, and it has no threads other than the one
1823                  * we are creating for it.
1824                  */
1825                 if (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) {
1826                         kauth_cred_t my_cred = p->p_ucred;
1827                         kauth_cred_t my_new_cred = kauth_cred_setuidgid(my_cred, my_cred->cr_ruid, my_cred->cr_rgid);
1828                         if (my_new_cred != my_cred)
1829                                 p->p_ucred = my_new_cred;
1830                 }
1831         }
1832
1833         /*
1834          * Clear transition flag so we won't hang if exec_activate_image() causes
1835          * an automount (and launchd does a proc sysctl to service it).
1836          *
1837          * <rdar://problem/6848672>, <rdar://problem/5959568>.
1838          */
1839         if (spawn_no_exec) {
1840                 proc_transend(p, 0);
1841         }
1842
1843 #if MAC_SPAWN   /* XXX */
1844         if (uap->mac_p != USER_ADDR_NULL) {
1845                 error = mac_execve_enter(uap->mac_p, imgp);
1846                 if (error)
1847                         goto bad;
1848         }
1849 #endif
1850
1851         /*
1852          * Activate the image
1853          */
1854         error = exec_activate_image(imgp);
1855
1856         /* Image not claimed by any activator? */
1857         if (error == -1)
1858                 error = ENOEXEC;
1859
1860         /*
1861          * If we have a spawn attr, and it contains signal related flags,
1862          * the we need to process them in the "context" of the new child
1863          * process, so we have to process it following image activation,
1864          * prior to making the thread runnable in user space.  This is
1865          * necessitated by some signal information being per-thread rather
1866          * than per-process, and we don't have the new allocation in hand
1867          * until after the image is activated.
1868          */
1869         if (!error && imgp->ip_px_sa != NULL) {
1870                 thread_t child_thread = current_thread();
1871                 uthread_t child_uthread = uthread;
1872
1873                 /*
1874                  * If we created a new child thread, then the thread and
1875                  * uthread are different than the current ones; otherwise,
1876                  * we leave them, since we are in the exec case instead.
1877                  */
1878                 if (spawn_no_exec) {
1879                         child_thread = imgp->ip_new_thread;
1880                         child_uthread = get_bsdthread_info(child_thread);
1881                 }
1882
1883                 /*
1884                  * Mask a list of signals, instead of them being unmasked, if
1885                  * they were unmasked in the parent; note that some signals
1886                  * are not maskable.
1887                  */
1888                 if (px_sa.psa_flags & POSIX_SPAWN_SETSIGMASK)
1889                         child_uthread->uu_sigmask = (px_sa.psa_sigmask & ~sigcantmask);
1890                 /*
1891                  * Default a list of signals instead of ignoring them, if
1892                  * they were ignored in the parent.  Note that we pass
1893                  * spawn_no_exec to setsigvec() to indicate that we called
1894                  * fork1() and therefore do not need to call proc_signalstart()
1895                  * internally.
1896                  */
1897                 if (px_sa.psa_flags & POSIX_SPAWN_SETSIGDEF) {
1898                         vec.sa_handler = SIG_DFL;
1899                         vec.sa_tramp = 0;
1900                         vec.sa_mask = 0;
1901                         vec.sa_flags = 0;
1902                         for (sig = 0; sig < NSIG; sig++)
1903                                 if (px_sa.psa_sigdefault & (1 << sig)) {
1904                                         error = setsigvec(p, child_thread, sig + 1, &vec, spawn_no_exec);
1905                         }
1906                 }
1907         }
1908
1909 bad:
1910         if (error == 0) {
1911                 /* upon  successful spawn, re/set the proc control state */
1912                 if (imgp->ip_px_sa != NULL) {
1913                         switch (px_sa.psa_pcontrol) {
1914                                 case POSIX_SPAWN_PCONTROL_THROTTLE:
1915                                         p->p_pcaction = P_PCTHROTTLE;
1916                                         break;
1917                                 case POSIX_SPAWN_PCONTROL_SUSPEND:
1918                                         p->p_pcaction = P_PCSUSP;
1919                                         break;
1920                                 case POSIX_SPAWN_PCONTROL_KILL:
1921                                         p->p_pcaction = P_PCKILL;
1922                                         break;
1923                                 case POSIX_SPAWN_PCONTROL_NONE:
1924                                 default:
1925                                         p->p_pcaction = 0;
1926                                         break;
1927                         };
1928                 }
1929                 exec_resettextvp(p, imgp);
1930         }
1931
1932         /*
1933          * If we successfully called fork1(), we always need to do this;
1934          * we identify this case by noting the IMGPF_SPAWN flag.  This is
1935          * because we come back from that call with signals blocked in the
1936          * child, and we have to unblock them, but we want to wait until
1937          * after we've performed any spawn actions.  This has to happen
1938          * before check_for_signature(), which uses psignal.
1939          */
1940         if (spawn_no_exec) {
1941                 /*
1942                  * Drop the signal lock on the child which was taken on our
1943                  * behalf by forkproc()/cloneproc() to prevent signals being
1944                  * received by the child in a partially constructed state.
1945                  */
1946                 proc_signalend(p, 0);
1947
1948                 /* flag the 'fork' has occurred */
1949                 proc_knote(p->p_pptr, NOTE_FORK | p->p_pid);
1950                 /* then flag exec has occurred */
1951                 proc_knote(p, NOTE_EXEC);
1952                 DTRACE_PROC1(create, proc_t, p);
1953         }
1954
1955         /*
1956          * We have to delay operations which might throw a signal until after
1957          * the signals have been unblocked; however, we want that to happen
1958          * after exec_resettextvp() so that the textvp is correct when they
1959          * fire.
1960          */
1961         if (error == 0) {
1962                 error = check_for_signature(p, imgp);
1963
1964                 /*
1965                  * Pay for our earlier safety; deliver the delayed signals from
1966                  * the incomplete spawn process now that it's complete.
1967                  */
1968                 if (imgp != NULL && spawn_no_exec && (p->p_lflag & P_LTRACED)) {
1969                         psignal_vfork(p, p->task, imgp->ip_new_thread, SIGTRAP);
1970                 }
1971         }
1972
1973
1974         if (imgp != NULL) {
1975                 if (imgp->ip_vp)
1976                         vnode_put(imgp->ip_vp);
1977                 if (imgp->ip_strings)
1978                         execargs_free(imgp);
1979                 if (imgp->ip_px_sfa != NULL)
1980                         FREE(imgp->ip_px_sfa, M_TEMP);
1981                 if (imgp->ip_px_spa != NULL)
1982                         FREE(imgp->ip_px_spa, M_TEMP);
1983
1984 #if CONFIG_MACF
1985                 if (imgp->ip_execlabelp)
1986                         mac_cred_label_free(imgp->ip_execlabelp);
1987                 if (imgp->ip_scriptlabelp)
1988                         mac_vnode_label_free(imgp->ip_scriptlabelp);
1989 #endif
1990         }
1991
1992         if (error) {
1993                 DTRACE_PROC1(exec__failure, int, error);
1994         } else {
1995             /*
1996              * <rdar://6609474> temporary - so dtrace call to current_proc()
1997              * returns the child process instead of the parent.
1998              */
1999             if (imgp != NULL && imgp->ip_flags & IMGPF_SPAWN) {
2000                 p->p_lflag |= P_LINVFORK;
2001                 p->p_vforkact = current_thread();
2002                 uthread->uu_proc = p;
2003                 uthread->uu_flag |= UT_VFORK;
2004             }
2005
2006             DTRACE_PROC(exec__success);
2007
2008             /*
2009              * <rdar://6609474> temporary - so dtrace call to current_proc()
2010              * returns the child process instead of the parent.
2011              */
2012             if (imgp != NULL && imgp->ip_flags & IMGPF_SPAWN) {
2013                 p->p_lflag &= ~P_LINVFORK;
2014                 p->p_vforkact = NULL;
2015                 uthread->uu_proc = PROC_NULL;
2016                 uthread->uu_flag &= ~UT_VFORK;
2017             }
2018         }
2019
2020         /* Return to both the parent and the child? */
2021         if (imgp != NULL && spawn_no_exec) {
2022                 /*
2023                  * If the parent wants the pid, copy it out
2024                  */
2025                 if (pid != USER_ADDR_NULL)
2026                         (void)suword(pid, p->p_pid);
2027                 retval[0] = error;
2028
2029                 /*
2030                  * If we had an error, perform an internal reap ; this is
2031                  * entirely safe, as we have a real process backing us.
2032                  */
2033                 if (error) {
2034                         proc_list_lock();
2035                         p->p_listflag |= P_LIST_DEADPARENT;
2036                         proc_list_unlock();
2037                         proc_lock(p);
2038                         /* make sure no one else has killed it off... */
2039                         if (p->p_stat != SZOMB && p->exit_thread == NULL) {
2040                                 p->exit_thread = current_thread();
2041                                 proc_unlock(p);
2042                                 exit1(p, 1, (int *)NULL);
2043                                 task_deallocate(get_threadtask(imgp->ip_new_thread));
2044                                 thread_deallocate(imgp->ip_new_thread);
2045                         } else {
2046                                 /* someone is doing it for us; just skip it */
2047                                 proc_unlock(p);
2048                         }
2049                 } else {
2050
2051                         /*
2052                          * Return" to the child
2053                          *
2054                          * Note: the image activator earlier dropped the
2055                          * task/thread references to the newly spawned
2056                          * process; this is OK, since we still have suspended
2057                          * queue references on them, so we should be fine
2058                          * with the delayed resume of the thread here.
2059                          */
2060                         (void)thread_resume(imgp->ip_new_thread);
2061                 }
2062         }
2063         if (bufp != NULL) {
2064                 FREE(bufp, M_TEMP);
2065         }
2066
2067         return(error);
2068 }
2069
2070
2071 /*
2072  * execve
2073  *
2074  * Parameters:  uap->fname              File name to exec
2075  *              uap->argp               Argument list
2076  *              uap->envp               Environment list
2077  *
2078  * Returns:     0                       Success
2079  *      __mac_execve:EINVAL             Invalid argument
2080  *      __mac_execve:ENOTSUP            Invalid argument
2081  *      __mac_execve:EACCES             Permission denied
2082  *      __mac_execve:EINTR              Interrupted function
2083  *      __mac_execve:ENOMEM             Not enough space
2084  *      __mac_execve:EFAULT             Bad address
2085  *      __mac_execve:ENAMETOOLONG       Filename too long
2086  *      __mac_execve:ENOEXEC            Executable file format error
2087  *      __mac_execve:ETXTBSY            Text file busy [misuse of error code]
2088  *      __mac_execve:???
2089  *
2090  * TODO:        Dynamic linker header address on stack is copied via suword()
2091  */
2092 /* ARGSUSED */
2093 int
2094 execve(proc_t p, struct execve_args *uap, int32_t *retval)
2095 {
2096         struct __mac_execve_args muap;
2097         int err;
2098
2099         muap.fname = uap->fname;
2100         muap.argp = uap->argp;
2101         muap.envp = uap->envp;
2102         muap.mac_p = USER_ADDR_NULL;
2103         err = __mac_execve(p, &muap, retval);
2104
2105         return(err);
2106 }
2107
2108 /*
2109  * __mac_execve
2110  *
2111  * Parameters:  uap->fname              File name to exec
2112  *              uap->argp               Argument list
2113  *              uap->envp               Environment list
2114  *              uap->mac_p              MAC label supplied by caller
2115  *
2116  * Returns:     0                       Success
2117  *              EINVAL                  Invalid argument
2118  *              ENOTSUP                 Not supported
2119  *              ENOEXEC                 Executable file format error
2120  *      exec_activate_image:EINVAL      Invalid argument
2121  *      exec_activate_image:EACCES      Permission denied
2122  *      exec_activate_image:EINTR       Interrupted function
2123  *      exec_activate_image:ENOMEM      Not enough space
2124  *      exec_activate_image:EFAULT      Bad address
2125  *      exec_activate_image:ENAMETOOLONG        Filename too long
2126  *      exec_activate_image:ENOEXEC     Executable file format error
2127  *      exec_activate_image:ETXTBSY     Text file busy [misuse of error code]
2128  *      exec_activate_image:EBADEXEC    The executable is corrupt/unknown
2129  *      exec_activate_image:???
2130  *      mac_execve_enter:???
2131  *
2132  * TODO:        Dynamic linker header address on stack is copied via suword()
2133  */
2134 int
2135 __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
2136 {
2137         char *bufp = NULL;
2138         struct image_params *imgp;
2139         struct vnode_attr *vap;
2140         struct vnode_attr *origvap;
2141         int error;
2142         char alt_p_comm[sizeof(p->p_comm)] = {0};       /* for PowerPC */
2143         int is_64 = IS_64BIT_PROCESS(p);
2144         struct vfs_context context;
2145
2146         context.vc_thread = current_thread();
2147         context.vc_ucred = kauth_cred_proc_ref(p);      /* XXX must NOT be kauth_cred_get() */
2148
2149         /* Allocate a big chunk for locals instead of using stack since these
2150          * structures a pretty big.
2151          */
2152         MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO);
2153         imgp = (struct image_params *) bufp;
2154         if (bufp == NULL) {
2155                 error = ENOMEM;
2156                 goto exit_with_error;
2157         }
2158         vap = (struct vnode_attr *) (bufp + sizeof(*imgp));
2159         origvap = (struct vnode_attr *) (bufp + sizeof(*imgp) + sizeof(*vap));
2160
2161         /* Initialize the common data in the image_params structure */
2162         imgp->ip_user_fname = uap->fname;
2163         imgp->ip_user_argv = uap->argp;
2164         imgp->ip_user_envv = uap->envp;
2165         imgp->ip_vattr = vap;
2166         imgp->ip_origvattr = origvap;
2167         imgp->ip_vfs_context = &context;
2168         imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE);
2169         imgp->ip_p_comm = alt_p_comm;           /* for PowerPC */
2170         imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
2171
2172 #if CONFIG_MACF
2173         if (uap->mac_p != USER_ADDR_NULL) {
2174                 error = mac_execve_enter(uap->mac_p, imgp);
2175                 if (error) {
2176                         kauth_cred_unref(&context.vc_ucred);
2177                         goto exit_with_error;
2178                 }
2179         }
2180 #endif
2181
2182         error = exec_activate_image(imgp);
2183
2184         kauth_cred_unref(&context.vc_ucred);
2185
2186         /* Image not claimed by any activator? */
2187         if (error == -1)
2188                 error = ENOEXEC;
2189
2190         if (error == 0) {
2191                 exec_resettextvp(p, imgp);
2192                 error = check_for_signature(p, imgp);
2193         }
2194         if (imgp->ip_vp != NULLVP)
2195                 vnode_put(imgp->ip_vp);
2196         if (imgp->ip_strings)
2197                 execargs_free(imgp);
2198 #if CONFIG_MACF
2199         if (imgp->ip_execlabelp)
2200                 mac_cred_label_free(imgp->ip_execlabelp);
2201         if (imgp->ip_scriptlabelp)
2202                 mac_vnode_label_free(imgp->ip_scriptlabelp);
2203 #endif
2204         if (!error) {
2205                 struct uthread  *uthread;
2206
2207                 /* Sever any extant thread affinity */
2208                 thread_affinity_exec(current_thread());
2209
2210                 DTRACE_PROC(exec__success);
2211                 uthread = get_bsdthread_info(current_thread());
2212                 if (uthread->uu_flag & UT_VFORK) {
2213                         vfork_return(p, retval, p->p_pid);
2214                         (void)thread_resume(imgp->ip_new_thread);
2215                 }
2216         } else {
2217                 DTRACE_PROC1(exec__failure, int, error);
2218         }
2219
2220 exit_with_error:
2221         if (bufp != NULL) {
2222                 FREE(bufp, M_TEMP);
2223         }
2224
2225         return(error);
2226 }
2227
2228
2229 /*
2230  * copyinptr
2231  *
2232  * Description: Copy a pointer in from user space to a user_addr_t in kernel
2233  *              space, based on 32/64 bitness of the user space
2234  *
2235  * Parameters:  froma                   User space address
2236  *              toptr                   Address of kernel space user_addr_t
2237  *              ptr_size                4/8, based on 'froma' address space
2238  *
2239  * Returns:     0                       Success
2240  *              EFAULT                  Bad 'froma'
2241  *
2242  * Implicit returns:
2243  *              *ptr_size               Modified
2244  */
2245 static int
2246 copyinptr(user_addr_t froma, user_addr_t *toptr, int ptr_size)
2247 {
2248         int error;
2249
2250         if (ptr_size == 4) {
2251                 /* 64 bit value containing 32 bit address */
2252                 unsigned int i;
2253
2254                 error = copyin(froma, &i, 4);
2255                 *toptr = CAST_USER_ADDR_T(i);   /* SAFE */
2256         } else {
2257                 error = copyin(froma, toptr, 8);
2258         }
2259         return (error);
2260 }
2261
2262
2263 /*
2264  * copyoutptr
2265  *
2266  * Description: Copy a pointer out from a user_addr_t in kernel space to
2267  *              user space, based on 32/64 bitness of the user space
2268  *
2269  * Parameters:  ua                      User space address to copy to
2270  *              ptr                     Address of kernel space user_addr_t
2271  *              ptr_size                4/8, based on 'ua' address space
2272  *
2273  * Returns:     0                       Success
2274  *              EFAULT                  Bad 'ua'
2275  *
2276  * Implicit returns:
2277  *              *ptr_size               Modified
2278  */
2279 static int
2280 copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size)
2281 {
2282         int error;
2283
2284         if (ptr_size == 4) {
2285                 /* 64 bit value containing 32 bit address */
2286                 unsigned int i = CAST_DOWN_EXPLICIT(unsigned int,ua);   /* SAFE */
2287
2288                 error = copyout(&i, ptr, 4);
2289         } else {
2290                 error = copyout(&ua, ptr, 8);
2291         }
2292         return (error);
2293 }
2294
2295
2296 /*
2297  * exec_copyout_strings
2298  *
2299  * Copy out the strings segment to user space.  The strings segment is put
2300  * on a preinitialized stack frame.
2301  *
2302  * Parameters:  struct image_params *   the image parameter block
2303  *              int *                   a pointer to the stack offset variable
2304  *
2305  * Returns:     0                       Success
2306  *              !0                      Faiure: errno
2307  *
2308  * Implicit returns:
2309  *              (*stackp)               The stack offset, modified
2310  *
2311  * Note:        The strings segment layout is backward, from the beginning
2312  *              of the top of the stack to consume the minimal amount of
2313  *              space possible; the returned stack pointer points to the
2314  *              end of the area consumed (stacks grow upward).
2315  *
2316  *              argc is an int; arg[i] are pointers; env[i] are pointers;
2317  *              exec_path is a pointer; the 0's are (void *)NULL's
2318  *
2319  * The stack frame layout is:
2320  *
2321  *      +-------------+
2322  * sp-> |     argc    |
2323  *      +-------------+
2324  *      |    arg[0]   |
2325  *      +-------------+
2326  *             :
2327  *             :
2328  *      +-------------+
2329  *      | arg[argc-1] |
2330  *      +-------------+
2331  *      |      0      |
2332  *      +-------------+
2333  *      |    env[0]   |
2334  *      +-------------+
2335  *             :
2336  *             :
2337  *      +-------------+
2338  *      |    env[n]   |
2339  *      +-------------+
2340  *      |      0      |
2341  *      +-------------+
2342  *      |  exec_path  | In MacOS X PR2 Beaker2E the path passed to exec() is
2343  *      +-------------+ passed on the stack just after the trailing 0 of the
2344  *      |      0      | the envp[] array as a pointer to a string.
2345  *      +-------------+
2346  *      |  PATH AREA  |
2347  *      +-------------+
2348  *      | STRING AREA |
2349  *             :
2350  *             :
2351  *      |             | <- p->user_stack
2352  *      +-------------+
2353  *
2354  * Although technically a part of the STRING AREA, we treat the PATH AREA as
2355  * a separate entity.  This allows us to align the beginning of the PATH AREA
2356  * to a pointer boundary so that the exec_path, env[i], and argv[i] pointers
2357  * which preceed it on the stack are properly aligned.
2358  *
2359  * TODO:        argc copied with suword(), which takes a 64 bit address
2360  */
2361 static int
2362 exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp)
2363 {
2364         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
2365         int     ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
2366         char    *argv = imgp->ip_argv;  /* modifiable copy of argv */
2367         user_addr_t     string_area;    /* *argv[], *env[] */
2368         user_addr_t     path_area;      /* package launch path */
2369         user_addr_t     ptr_area;       /* argv[], env[], exec_path */
2370         user_addr_t     stack;
2371         int     stringc = imgp->ip_argc + imgp->ip_envc;
2372         size_t len;
2373         int error;
2374         ssize_t strspace;
2375
2376         stack = *stackp;
2377
2378         size_t patharea_len = imgp->ip_argv - imgp->ip_strings;
2379         int envc_add = 0;
2380
2381         /*
2382          * Set up pointers to the beginning of the string area, the beginning
2383          * of the path area, and the beginning of the pointer area (actually,
2384          * the location of argc, an int, which may be smaller than a pointer,
2385          * but we use ptr_size worth of space for it, for alignment).
2386          */
2387         string_area = stack - (((imgp->ip_strendp - imgp->ip_strings) + ptr_size-1) & ~(ptr_size-1)) - ptr_size;
2388         path_area = string_area - ((patharea_len + ptr_size-1) & ~(ptr_size-1));
2389         ptr_area = path_area - ((imgp->ip_argc + imgp->ip_envc + 4 + envc_add) * ptr_size) - ptr_size /*argc*/;
2390
2391         /* Return the initial stack address: the location of argc */
2392         *stackp = ptr_area;
2393
2394         /*
2395          * Record the size of the arguments area so that sysctl_procargs()
2396          * can return the argument area without having to parse the arguments.
2397          */
2398         proc_lock(p);
2399         p->p_argc = imgp->ip_argc;
2400         p->p_argslen = (int)(stack - path_area);
2401         proc_unlock(p);
2402
2403
2404         /*
2405          * Support for new app package launching for Mac OS X allocates
2406          * the "path" at the begining of the imgp->ip_strings buffer.
2407          * copy it just before the string area.
2408          */
2409         len = 0;
2410         error = copyoutstr(imgp->ip_strings, path_area,
2411                                                    patharea_len,
2412                                                    &len);
2413         if (error)
2414                 goto bad;
2415
2416
2417         /* Save a NULL pointer below it */
2418         (void)copyoutptr(0LL, path_area - ptr_size, ptr_size);
2419
2420         /* Save the pointer to "path" just below it */
2421         (void)copyoutptr(path_area, path_area - 2*ptr_size, ptr_size);
2422
2423         /*
2424          * ptr_size for 2 NULL one each ofter arg[argc -1] and env[n]
2425          * ptr_size for argc
2426          * skip over saved path, ptr_size for pointer to path,
2427          * and ptr_size for the NULL after pointer to path.
2428          */
2429
2430         /* argc (int32, stored in a ptr_size area) */
2431         (void)suword(ptr_area, imgp->ip_argc);
2432         ptr_area += sizeof(int);
2433         /* pad to ptr_size, if 64 bit image, to ensure user stack alignment */
2434         if (imgp->ip_flags & IMGPF_IS_64BIT) {
2435                 (void)suword(ptr_area, 0);      /* int, not long: ignored */
2436                 ptr_area += sizeof(int);
2437         }
2438
2439 #if CONFIG_DTRACE
2440         p->p_dtrace_argv = ptr_area; /* user_addr_t &argv[0] for dtrace convenience */
2441 #endif /* CONFIG_DTRACE */
2442
2443         /*
2444          * We use (string_area - path_area) here rather than the more
2445          * intuitive (imgp->ip_argv - imgp->ip_strings) because we are
2446          * interested in the length of the PATH_AREA in user space,
2447          * rather than the actual length of the execution path, since
2448          * it includes alignment padding of the PATH_AREA + STRING_AREA
2449          * to a ptr_size boundary.
2450          */
2451         strspace = SIZE_IMG_STRSPACE - (string_area - path_area);
2452         for (;;) {
2453                 if (stringc == imgp->ip_envc) {
2454                         /* argv[n] = NULL */
2455                         (void)copyoutptr(0LL, ptr_area, ptr_size);
2456                         ptr_area += ptr_size;
2457 #if CONFIG_DTRACE
2458                         p->p_dtrace_envp = ptr_area; /* user_addr_t &env[0] for dtrace convenience */
2459 #endif /* CONFIG_DTRACE */
2460                 }
2461                 if (--stringc < 0)
2462                         break;
2463
2464                 /* pointer: argv[n]/env[n] */
2465                 (void)copyoutptr(string_area, ptr_area, ptr_size);
2466
2467                 /* string : argv[n][]/env[n][] */
2468                 do {
2469                         if (strspace <= 0) {
2470                                 error = E2BIG;
2471                                 break;
2472                         }
2473                         error = copyoutstr(argv, string_area,
2474                                                 strspace,
2475                                                 &len);
2476                         string_area += len;
2477                         argv += len;
2478                         strspace -= len;
2479                 } while (error == ENAMETOOLONG);
2480                 if (error == EFAULT || error == E2BIG)
2481                         break;  /* bad stack - user's problem */
2482                 ptr_area += ptr_size;
2483         }
2484         /* env[n] = NULL */
2485         (void)copyoutptr(0LL, ptr_area, ptr_size);
2486
2487 bad:
2488         return(error);
2489 }
2490
2491
2492 /*
2493  * exec_extract_strings
2494  *
2495  * Copy arguments and environment from user space into work area; we may
2496  * have already copied some early arguments into the work area, and if
2497  * so, any arguments opied in are appended to those already there.
2498  *
2499  * Parameters:  struct image_params *   the image parameter block
2500  *
2501  * Returns:     0                       Success
2502  *              !0                      Failure: errno
2503  *
2504  * Implicit returns;
2505  *              (imgp->ip_argc)         Count of arguments, updated
2506  *              (imgp->ip_envc)         Count of environment strings, updated
2507  *
2508  *
2509  * Note:        The argument and environment vectors are user space pointers
2510  *              to arrays of user space pointers.
2511  */
2512 static int
2513 exec_extract_strings(struct image_params *imgp)
2514 {
2515         int error = 0;
2516         int strsz = 0;
2517         int     ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT) ? 8 : 4;
2518         user_addr_t     argv = imgp->ip_user_argv;
2519         user_addr_t     envv = imgp->ip_user_envv;
2520
2521         /*
2522          * If the argument vector is NULL, this is the system startup
2523          * bootstrap from load_init_program(), and there's nothing to do
2524          */
2525         if (imgp->ip_user_argv == 0LL)
2526                 goto bad;
2527
2528         /* Now, get rest of arguments */
2529
2530         /*
2531          * Adjust space reserved for the path name by however much padding it
2532          * needs. Doing this here since we didn't know if this would be a 32-
2533          * or 64-bit process back in exec_save_path.
2534          */
2535         strsz = strlen(imgp->ip_strings) + 1;
2536         imgp->ip_strspace -= ((strsz + ptr_size-1) & ~(ptr_size-1)) - strsz;
2537
2538         /*
2539          * If we are running an interpreter, replace the av[0] that was
2540          * passed to execve() with the fully qualified path name that was
2541          * passed to execve() for interpreters which do not use the PATH
2542          * to locate their script arguments.
2543          */
2544         if((imgp->ip_flags & IMGPF_INTERPRET) != 0 && argv != 0LL) {
2545                 user_addr_t     arg;
2546
2547                 error = copyinptr(argv, &arg, ptr_size);
2548                 if (error)
2549                         goto bad;
2550                 if (arg != 0LL && arg != (user_addr_t)-1) {
2551                         argv += ptr_size;
2552                         error = exec_add_string(imgp, imgp->ip_user_fname);
2553                         if (error)
2554                                 goto bad;
2555                         imgp->ip_argc++;
2556                 }
2557         }
2558
2559         while (argv != 0LL) {
2560                 user_addr_t     arg;
2561
2562                 error = copyinptr(argv, &arg, ptr_size);
2563                 if (error)
2564                         goto bad;
2565
2566                 argv += ptr_size;
2567                 if (arg == 0LL) {
2568                         break;
2569                 } else if (arg == (user_addr_t)-1) {
2570                         /* Um... why would it be -1? */
2571                         error = EFAULT;
2572                         goto bad;
2573                 }
2574                 /*
2575                 * av[n...] = arg[n]
2576                 */
2577                 error = exec_add_string(imgp, arg);
2578                 if (error)
2579                         goto bad;
2580                 imgp->ip_argc++;
2581         }
2582
2583         /* Note where the args end and env begins. */
2584         imgp->ip_strendargvp = imgp->ip_strendp;
2585
2586         /* Now, get the environment */
2587         while (envv != 0LL) {
2588                 user_addr_t     env;
2589
2590                 error = copyinptr(envv, &env, ptr_size);
2591                 if (error)
2592                         goto bad;
2593
2594                 envv += ptr_size;
2595                 if (env == 0LL) {
2596                         break;
2597                 } else if (env == (user_addr_t)-1) {
2598                         error = EFAULT;
2599                         goto bad;
2600                 }
2601                 /*
2602                 * av[n...] = env[n]
2603                 */
2604                 error = exec_add_string(imgp, env);
2605                 if (error)
2606                         goto bad;
2607                 imgp->ip_envc++;
2608         }
2609 bad:
2610         return error;
2611 }
2612
2613
/*
 * rlim_cur of the RLIMIT_STACK entry of process 'p'.  The argument is
 * parenthesized so that any pointer-valued expression can be passed.
 */
#define unix_stack_size(p)	((p)->p_rlimit[RLIMIT_STACK].rlim_cur)
2615
2616 /*
2617  * exec_check_permissions
2618  *
2619  * Decription:  Verify that the file that is being attempted to be executed
2620  *              is in fact allowed to be executed based on it POSIX file
2621  *              permissions and other access control criteria
2622  *
2623  * Parameters:  struct image_params *   the image parameter block
2624  *
2625  * Returns:     0                       Success
2626  *              EACCES                  Permission denied
2627  *              ENOEXEC                 Executable file format error
2628  *              ETXTBSY                 Text file busy [misuse of error code]
2629  *      vnode_getattr:???
2630  *      vnode_authorize:???
2631  */
2632 static int
2633 exec_check_permissions(struct image_params *imgp)
2634 {
2635         struct vnode *vp = imgp->ip_vp;
2636         struct vnode_attr *vap = imgp->ip_vattr;
2637         proc_t p = vfs_context_proc(imgp->ip_vfs_context);
2638         int error;
2639         kauth_action_t action;
2640
2641         /* Only allow execution of regular files */
2642         if (!vnode_isreg(vp))
2643                 return (EACCES);
2644
2645         /* Get the file attributes that we will be using here and elsewhere */
2646         VATTR_INIT(vap);
2647         VATTR_WANTED(vap, va_uid);
2648         VATTR_WANTED(vap, va_gid);
2649         VATTR_WANTED(vap, va_mode);
2650         VATTR_WANTED(vap, va_fsid);
2651         VATTR_WANTED(vap, va_fileid);
2652         VATTR_WANTED(vap, va_data_size);
2653         if ((error = vnode_getattr(vp, vap, imgp->ip_vfs_context)) != 0)
2654                 return (error);
2655
2656         /*
2657          * Ensure that at least one execute bit is on - otherwise root
2658          * will always succeed, and we don't want to happen unless the
2659          * file really is executable.
2660          */
2661         if ((vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
2662                 return (EACCES);
2663
2664         /* Disallow zero length files */
2665         if (vap->va_data_size == 0)
2666                 return (ENOEXEC);
2667
2668         imgp->ip_arch_offset = (user_size_t)0;
2669         imgp->ip_arch_size = vap->va_data_size;
2670
2671         /* Disable setuid-ness for traced programs or if MNT_NOSUID */
2672         if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_lflag & P_LTRACED))
2673                 vap->va_mode &= ~(VSUID | VSGID);
2674
2675 #if CONFIG_MACF
2676         error = mac_vnode_check_exec(imgp->ip_vfs_context, vp, imgp);
2677         if (error)
2678                 return (error);
2679 #endif
2680
2681         /* Check for execute permission */
2682         action = KAUTH_VNODE_EXECUTE;
2683         /* Traced images must also be readable */
2684         if (p->p_lflag & P_LTRACED)
2685                 action |= KAUTH_VNODE_READ_DATA;
2686         if ((error = vnode_authorize(vp, NULL, action, imgp->ip_vfs_context)) != 0)
2687                 return (error);
2688
2689 #if 0
2690         /* Don't let it run if anyone had it open for writing */
2691         vnode_lock(vp);
2692         if (vp->v_writecount) {
2693                 panic("going to return ETXTBSY %x", vp);
2694                 vnode_unlock(vp);
2695                 return (ETXTBSY);
2696         }
2697         vnode_unlock(vp);
2698 #endif
2699
2700
2701 #ifdef IMGPF_POWERPC
2702         /*
2703          * If the file we are about to attempt to load is the exec_handler_ppc,
2704          * which is determined by matching the vattr fields against previously
2705          * cached values, then we set the PowerPC environment flag.
2706          */
2707         if (vap->va_fsid == exec_archhandler_ppc.fsid &&
2708                 vap->va_fileid == (uint64_t)((uint32_t)exec_archhandler_ppc.fileid)) {
2709                 imgp->ip_flags |= IMGPF_POWERPC;
2710         }
2711 #endif  /* IMGPF_POWERPC */
2712
2713         /* XXX May want to indicate to underlying FS that vnode is open */
2714
2715         return (error);
2716 }
2717
2718
2719 /*
2720  * exec_handle_sugid
2721  *
2722  * Initially clear the P_SUGID in the process flags; if an SUGID process is
2723  * exec'ing a non-SUGID image, then  this is the point of no return.
2724  *
2725  * If the image being activated is SUGID, then replace the credential with a
2726  * copy, disable tracing (unless the tracing process is root), reset the
2727  * mach task port to revoke it, set the P_SUGID bit,
2728  *
2729  * If the saved user and group ID will be changing, then make sure it happens
2730  * to a new credential, rather than a shared one.
2731  *
2732  * Set the security token (this is probably obsolete, given that the token
2733  * should not technically be separate from the credential itself).
2734  *
2735  * Parameters:  struct image_params *   the image parameter block
2736  *
2737  * Returns:     void                    No failure indication
2738  *
2739  * Implicit returns:
2740  *              <process credential>    Potentially modified/replaced
2741  *              <task port>             Potentially revoked
2742  *              <process flags>         P_SUGID bit potentially modified
2743  *              <security token>        Potentially modified
2744  */
2745 static int
2746 exec_handle_sugid(struct image_params *imgp)
2747 {
2748         kauth_cred_t            cred = vfs_context_ucred(imgp->ip_vfs_context);
2749         proc_t                  p = vfs_context_proc(imgp->ip_vfs_context);
2750         int                     i;
2751         int                     leave_sugid_clear = 0;
2752         int                     error = 0;
2753         struct vnode    *dev_null = NULLVP;
2754 #if CONFIG_MACF
2755         int                     mac_transition;
2756
2757         /*
2758          * Determine whether a call to update the MAC label will result in the
2759          * credential changing.
2760          *
2761          * Note:        MAC policies which do not actually end up modifying
2762          *              the label subsequently are strongly encouraged to
2763          *              return 0 for this check, since a non-zero answer will
2764          *              slow down the exec fast path for normal binaries.
2765          */
2766         mac_transition = mac_cred_check_label_update_execve(
2767                                                         imgp->ip_vfs_context,
2768                                                         imgp->ip_vp,
2769                                                         imgp->ip_scriptlabelp,
2770                                                         imgp->ip_execlabelp, p);
2771 #endif
2772
2773         OSBitAndAtomic(~((uint32_t)P_SUGID), &p->p_flag);
2774
2775         /*
2776          * Order of the following is important; group checks must go last,
2777          * as we use the success of the 'ismember' check combined with the
2778          * failure of the explicit match to indicate that we will be setting
2779          * the egid of the process even though the new process did not
2780          * require VSUID/VSGID bits in order for it to set the new group as
2781          * its egid.
2782          *
2783          * Note:        Technically, by this we are implying a call to
2784          *              setegid() in the new process, rather than implying
2785          *              it used its VSGID bit to set the effective group,
2786          *              even though there is no code in that process to make
2787          *              such a call.
2788          */
2789         if (((imgp->ip_origvattr->va_mode & VSUID) != 0 &&
2790              kauth_cred_getuid(cred) != imgp->ip_origvattr->va_uid) ||
2791             ((imgp->ip_origvattr->va_mode & VSGID) != 0 &&
2792                  ((kauth_cred_ismember_gid(cred, imgp->ip_origvattr->va_gid, &leave_sugid_clear) || !leave_sugid_clear) ||
2793                  (cred->cr_gid != imgp->ip_origvattr->va_gid)))) {
2794
2795 #if CONFIG_MACF
2796 /* label for MAC transition and neither VSUID nor VSGID */
2797 handle_mac_transition:
2798 #endif
2799
2800                 /*
2801                  * Replace the credential with a copy of itself if euid or
2802                  * egid change.
2803                  *
2804                  * Note:        setuid binaries will automatically opt out of
2805                  *              group resolver participation as a side effect
2806                  *              of this operation.  This is an intentional
2807                  *              part of the security model, which requires a
2808                  *              participating credential be established by
2809                  *              escalating privilege, setting up all other
2810                  *              aspects of the credential including whether
2811                  *              or not to participate in external group
2812                  *              membership resolution, then dropping their
2813                  *              effective privilege to that of the desired
2814                  *              final credential state.
2815                  */
2816                 if (imgp->ip_origvattr->va_mode & VSUID) {
2817                         p->p_ucred  = kauth_cred_setresuid(p->p_ucred, KAUTH_UID_NONE, imgp->ip_origvattr->va_uid, imgp->ip_origvattr->va_uid, KAUTH_UID_NONE);
2818                 }
2819                 if (imgp->ip_origvattr->va_mode & VSGID) {
2820                         p->p_ucred = kauth_cred_setresgid(p->p_ucred, KAUTH_GID_NONE, imgp->ip_origvattr->va_gid, imgp->ip_origvattr->va_gid);
2821                 }
2822
2823 #if CONFIG_MACF
2824                 /*
2825                  * If a policy has indicated that it will transition the label,
2826                  * before making the call into the MAC policies, get a new
2827                  * duplicate credential, so they can modify it without
2828                  * modifying any others sharing it.
2829                  */
2830                 if (mac_transition) {
2831                         kauth_cred_t    my_cred;
2832                         if (kauth_proc_label_update_execve(p,
2833                                                 imgp->ip_vfs_context,
2834                                                 imgp->ip_vp,
2835                                                 imgp->ip_scriptlabelp,
2836                                                 imgp->ip_execlabelp)) {
2837                                 /*
2838                                  * If updating the MAC label resulted in a
2839                                  * disjoint credential, flag that we need to
2840                                  * set the P_SUGID bit.  This protects
2841                                  * against debuggers being attached by an
2842                                  * insufficiently privileged process onto the
2843                                  * result of a transition to a more privileged
2844                                  * credential.
2845                                  */
2846                                 leave_sugid_clear = 0;
2847                         }
2848
2849                         my_cred = kauth_cred_proc_ref(p);
2850                         mac_task_label_update_cred(my_cred, p->task);
2851                         kauth_cred_unref(&my_cred);
2852                 }
2853 #endif  /* CONFIG_MACF */
2854
2855                 /*
2856                  * Have mach reset the task and thread ports.
2857                  * We don't want anyone who had the ports before
2858                  * a setuid exec to be able to access/control the
2859                  * task/thread after.
2860                  */
2861                 if (current_task() == p->task) {
2862                         ipc_task_reset(p->task);
2863                         ipc_thread_reset(current_thread());
2864                 }
2865
2866                 /*
2867                  * If 'leave_sugid_clear' is non-zero, then we passed the
2868                  * VSUID and MACF checks, and successfully determined that
2869                  * the previous cred was a member of the VSGID group, but
2870                  * that it was not the default at the time of the execve,
2871                  * and that the post-labelling credential was not disjoint.
2872                  * So we don't set the P_SUGID on the basis of simply
2873                  * running this code.
2874                  */
2875                 if (!leave_sugid_clear)
2876                         OSBitOrAtomic(P_SUGID, &p->p_flag);
2877
2878                 /* Cache the vnode for /dev/null the first time around */
2879                 if (dev_null == NULLVP) {
2880                         struct nameidata nd1;
2881
2882                         NDINIT(&nd1, LOOKUP, FOLLOW, UIO_SYSSPACE,
2883                             CAST_USER_ADDR_T("/dev/null"),
2884                             imgp->ip_vfs_context);
2885
2886                         if ((error = vn_open(&nd1, FREAD, 0)) == 0) {
2887                                 dev_null = nd1.ni_vp;
2888                                 /*
2889                                  * vn_open returns with both a use_count
2890                                  * and an io_count on the found vnode
2891                                  * drop the io_count, but keep the use_count
2892                                  */
2893                                 vnode_put(nd1.ni_vp);
2894                         }
2895                 }
2896
2897                 /* Radar 2261856; setuid security hole fix */
2898                 /* Patch from OpenBSD: A. Ramesh */
2899                 /*
2900                  * XXX For setuid processes, attempt to ensure that
2901                  * stdin, stdout, and stderr are already allocated.
2902                  * We do not want userland to accidentally allocate
2903                  * descriptors in this range which has implied meaning
2904                  * to libc.
2905                  */
2906                 if (dev_null != NULLVP) {
2907                         for (i = 0; i < 3; i++) {
2908                                 struct fileproc *fp;
2909                                 int indx;
2910
2911                                 if (p->p_fd->fd_ofiles[i] != NULL)
2912                                         continue;
2913
2914                                 if ((error = falloc(p, &fp, &indx, imgp->ip_vfs_context)) != 0)
2915                                         continue;
2916
2917                                 if ((error = vnode_ref_ext(dev_null, FREAD)) != 0) {
2918                                         fp_free(p, indx, fp);
2919                                         break;
2920                                 }
2921
2922                                 fp->f_fglob->fg_flag = FREAD;
2923                                 fp->f_fglob->fg_type = DTYPE_VNODE;
2924                                 fp->f_fglob->fg_ops = &vnops;
2925                                 fp->f_fglob->fg_data = (caddr_t)dev_null;
2926
2927                                 proc_fdlock(p);
2928                                 procfdtbl_releasefd(p, indx, NULL);
2929                                 fp_drop(p, indx, fp, 1);
2930                                 proc_fdunlock(p);
2931                         }
2932                         /*
2933                          * for now we need to drop the reference immediately
2934                          * since we don't have any mechanism in place to
2935                          * release it before starting to unmount "/dev"
2936                          * during a reboot/shutdown
2937                          */
2938                         vnode_rele(dev_null);
2939                         dev_null = NULLVP;
2940                 }
2941         }
2942 #if CONFIG_MACF
2943         else {
2944                 /*
2945                  * We are here because we were told that the MAC label will
2946                  * be transitioned, and the binary is not VSUID or VSGID; to
2947                  * deal with this case, we could either duplicate a lot of
2948                  * code, or we can indicate we want to default the P_SUGID
2949                  * bit clear and jump back up.
2950                  */
2951                 if (mac_transition) {
2952                         leave_sugid_clear = 1;
2953                         goto handle_mac_transition;
2954                 }
2955         }
2956 #endif  /* CONFIG_MACF */
2957
2958         /*
2959          * Implement the semantic where the effective user and group become
2960          * the saved user and group in exec'ed programs.
2961          */
2962         p->p_ucred = kauth_cred_setsvuidgid(p->p_ucred, kauth_cred_getuid(p->p_ucred),  p->p_ucred->cr_gid);
2963
2964         /* Update the process' identity version and set the security token */
2965         p->p_idversion++;
2966         set_security_token(p);
2967
2968         return(error);
2969 }
2970
2971
2972 /*
2973  * create_unix_stack
2974  *
2975  * Description: Set the user stack address for the process to the provided
2976  *              address.  If a custom stack was not set as a result of the
2977  *              load process (i.e. as specified by the image file for the
2978  *              executable), then allocate the stack in the provided map and
2979  *              set up appropriate guard pages for enforcing administrative
2980  *              limits on stack growth, if they end up being needed.
2981  *
2982  * Parameters:  p                       Process to set stack on
2983  *              user_stack              Address to set stack for process to
2984  *              customstack             FALSE if no custom stack in binary
2985  *              map                     Address map in which to allocate the
2986  *                                      new stack, if 'customstack' is FALSE
2987  *
2988  * Returns:     KERN_SUCCESS            Stack successfully created
2989  *              !KERN_SUCCESS           Mach failure code
2990  */
2991 static kern_return_t
2992 create_unix_stack(vm_map_t map, user_addr_t user_stack, int customstack,
2993                         proc_t p)
2994 {
2995         mach_vm_size_t          size, prot_size;
2996         mach_vm_offset_t        addr, prot_addr;
2997         kern_return_t           kr;
2998
2999         proc_lock(p);
3000         p->user_stack = user_stack;
3001         proc_unlock(p);
3002
3003         if (!customstack) {
3004                 /*
3005                  * Allocate enough space for the maximum stack size we
3006                  * will ever authorize and an extra page to act as
3007                  * a guard page for stack overflows.
3008                  */
3009                 size = mach_vm_round_page(MAXSSIZ);
3010 #if STACK_GROWTH_UP
3011                 addr = mach_vm_trunc_page(user_stack);
3012 #else   /* STACK_GROWTH_UP */
3013                 addr = mach_vm_trunc_page(user_stack - size);
3014 #endif  /* STACK_GROWTH_UP */
3015                 kr = mach_vm_allocate(map, &addr, size,
3016                                         VM_MAKE_TAG(VM_MEMORY_STACK) |
3017                                       VM_FLAGS_FIXED);
3018                 if (kr != KERN_SUCCESS) {
3019                         return kr;
3020                 }
3021                 /*
3022                  * And prevent access to what's above the current stack
3023                  * size limit for this process.
3024                  */
3025                 prot_addr = addr;
3026 #if STACK_GROWTH_UP
3027                 prot_addr += unix_stack_size(p);
3028 #endif /* STACK_GROWTH_UP */
3029                 prot_addr = mach_vm_round_page(prot_addr);
3030                 prot_size = mach_vm_trunc_page(size - unix_stack_size(p));
3031                 kr = mach_vm_protect(map,
3032                                      prot_addr,
3033                                      prot_size,
3034                                      FALSE,
3035                                      VM_PROT_NONE);
3036                 if (kr != KERN_SUCCESS) {
3037                         (void) mach_vm_deallocate(map, addr, size);
3038                         return kr;
3039                 }
3040         }
3041         return KERN_SUCCESS;
3042 }
3043
3044 #include <sys/reboot.h>
3045
3046 static char             init_program_name[128] = "/sbin/launchd";
3047
3048 struct execve_args      init_exec_args;
3049
3050 /*
3051  * load_init_program
3052  *
3053  * Description: Load the "init" program; in most cases, this will be "launchd"
3054  *
3055  * Parameters:  p                       Process to call execve() to create
3056  *                                      the "init" program
3057  *
3058  * Returns:     (void)
3059  *
3060  * Notes:       The process that is passed in is the first manufactured
3061  *              process on the system, and gets here via bsd_ast() firing
3062  *              for the first time.  This is done to ensure that bsd_init()
3063  *              has run to completion.
3064  */
3065 void
3066 load_init_program(proc_t p)
3067 {
3068         vm_offset_t     init_addr;
3069         int             argc = 0;
3070         uint32_t argv[3];
3071         int                     error;
3072         int             retval[2];
3073
3074         /*
3075          * Copy out program name.
3076          */
3077
3078         init_addr = VM_MIN_ADDRESS;
3079         (void) vm_allocate(current_map(), &init_addr, PAGE_SIZE,
3080                                 VM_FLAGS_ANYWHERE);
3081         if (init_addr == 0)
3082                 init_addr++;
3083
3084         (void) copyout((caddr_t) init_program_name, CAST_USER_ADDR_T(init_addr),
3085                         (unsigned) sizeof(init_program_name)+1);
3086
3087         argv[argc++] = (uint32_t)init_addr;
3088         init_addr += sizeof(init_program_name);
3089         init_addr = (vm_offset_t)ROUND_PTR(char, init_addr);
3090
3091         /*
3092          * Put out first (and only) argument, similarly.
3093          * Assumes everything fits in a page as allocated
3094          * above.
3095          */
3096         if (boothowto & RB_SINGLE) {
3097                 const char *init_args = "-s";
3098
3099                 copyout(init_args, CAST_USER_ADDR_T(init_addr),
3100                         strlen(init_args));
3101
3102                 argv[argc++] = (uint32_t)init_addr;
3103                 init_addr += strlen(init_args);
3104                 init_addr = (vm_offset_t)ROUND_PTR(char, init_addr);
3105
3106         }
3107
3108         /*
3109          * Null-end the argument list
3110          */
3111         argv[argc] = 0;
3112
3113         /*
3114          * Copy out the argument list.
3115          */
3116
3117         (void) copyout((caddr_t) argv, CAST_USER_ADDR_T(init_addr),
3118                         (unsigned) sizeof(argv));
3119
3120         /*
3121          * Set up argument block for fake call to execve.
3122          */
3123
3124         init_exec_args.fname = CAST_USER_ADDR_T(argv[0]);
3125         init_exec_args.argp = CAST_USER_ADDR_T((char **)init_addr);
3126         init_exec_args.envp = CAST_USER_ADDR_T(0);
3127
3128         /*
3129          * So that mach_init task is set with uid,gid 0 token
3130          */
3131         set_security_token(p);
3132
3133         error = execve(p,&init_exec_args,retval);
3134         if (error)
3135                 panic("Process 1 exec of %s failed, errno %d\n",
3136                       init_program_name, error);
3137 }
3138
3139 /*
3140  * load_return_to_errno
3141  *
3142  * Description: Convert a load_return_t (Mach error) to an errno (BSD error)
3143  *
3144  * Parameters:  lrtn                    Mach error number
3145  *
3146  * Returns:     (int)                   BSD error number
3147  *              0                       Success
3148  *              EBADARCH                Bad architecture
3149  *              EBADMACHO               Bad Mach object file
3150  *              ESHLIBVERS              Bad shared library version
3151  *              ENOMEM                  Out of memory/resource shortage
3152  *              EACCES                  Access denied
3153  *              ENOENT                  Entry not found (usually "file does
3154  *                                      does not exist")
3155  *              EIO                     An I/O error occurred
3156  *              EBADEXEC                The executable is corrupt/unknown
3157  */
3158 static int
3159 load_return_to_errno(load_return_t lrtn)
3160 {
3161         switch (lrtn) {
3162         case LOAD_SUCCESS:
3163                 return 0;
3164         case LOAD_BADARCH:
3165                 return EBADARCH;
3166         case LOAD_BADMACHO:
3167                 return EBADMACHO;
3168         case LOAD_SHLIB:
3169                 return ESHLIBVERS;
3170         case LOAD_NOSPACE:
3171         case LOAD_RESOURCE:
3172                 return ENOMEM;
3173         case LOAD_PROTECT:
3174                 return EACCES;
3175         case LOAD_ENOENT:
3176                 return ENOENT;
3177         case LOAD_IOERROR:
3178                 return EIO;
3179         case LOAD_FAILURE:
3180         default:
3181                 return EBADEXEC;
3182         }
3183 }
3184
3185 #include <mach/mach_types.h>
3186 #include <mach/vm_prot.h>
3187 #include <mach/semaphore.h>
3188 #include <mach/sync_policy.h>
3189 #include <kern/clock.h>
3190 #include <mach/kern_return.h>
3191
3192 extern semaphore_t execve_semaphore;
3193
3194 /*
3195  * execargs_alloc
3196  *
3197  * Description: Allocate the block of memory used by the execve arguments.
3198  *              At the same time, we allocate a page so that we can read in
3199  *              the first page of the image.
3200  *
3201  * Parameters:  struct image_params *   the image parameter block
3202  *
3203  * Returns:     0                       Success
3204  *              EINVAL                  Invalid argument
3205  *              EACCES                  Permission denied
3206  *              EINTR                   Interrupted function
3207  *              ENOMEM                  Not enough space
3208  *
3209  * Notes:       This is a temporary allocation into the kernel address space
3210  *              to enable us to copy arguments in from user space.  This is
3211  *              necessitated by not mapping the process calling execve() into
3212  *              the kernel address space during the execve() system call.
3213  *
3214  *              We assemble the argument and environment, etc., into this
3215  *              region before copying it as a single block into the child
3216  *              process address space (at the top or bottom of the stack,
3217  *              depending on which way the stack grows; see the function
3218  *              exec_copyout_strings() for details).
3219  *
3220  *              This ends up with a second (possibly unnecessary) copy compared
3221  *              with assembling the data directly into the child address space,
3222  *              instead, but since we cannot be guaranteed that the parent has
3223  *              not modified its environment, we can't really know that it's
3224  *              really a block there as well.
3225  */
3226
3227
3228 static int execargs_waiters = 0;
3229 lck_mtx_t *execargs_cache_lock;
3230
3231 static void
3232 execargs_lock_lock(void) {
3233         lck_mtx_lock_spin(execargs_cache_lock);
3234 }
3235
3236 static void
3237 execargs_lock_unlock(void) {
3238         lck_mtx_unlock(execargs_cache_lock);
3239 }
3240
3241 static void
3242 execargs_lock_sleep(void) {
3243         lck_mtx_sleep(execargs_cache_lock, LCK_SLEEP_DEFAULT, &execargs_free_count, THREAD_UNINT);
3244 }
3245
3246 static kern_return_t
3247 execargs_purgeable_allocate(char **execarg_address) {
3248         kern_return_t kr = vm_allocate(bsd_pageable_map, (vm_offset_t *)execarg_address, NCARGS + PAGE_SIZE, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
3249         assert(kr == KERN_SUCCESS);
3250         return kr;
3251 }
3252
3253 static kern_return_t
3254 execargs_purgeable_reference(void *execarg_address) {
3255         int state = VM_PURGABLE_NONVOLATILE;
3256         kern_return_t kr = vm_purgable_control(bsd_pageable_map, (vm_offset_t) execarg_address, VM_PURGABLE_SET_STATE, &state);
3257
3258         assert(kr == KERN_SUCCESS);
3259         return kr;
3260 }
3261
3262 static kern_return_t
3263 execargs_purgeable_volatilize(void *execarg_address) {
3264         int state = VM_PURGABLE_VOLATILE | VM_PURGABLE_ORDERING_OBSOLETE;
3265         kern_return_t kr;
3266         kr = vm_purgable_control(bsd_pageable_map, (vm_offset_t) execarg_address, VM_PURGABLE_SET_STATE, &state);
3267
3268         assert(kr == KERN_SUCCESS);
3269
3270         return kr;
3271 }
3272
3273 static void
3274 execargs_wakeup_waiters(void) {
3275         thread_wakeup(&execargs_free_count);
3276 }
3277
3278 static int
3279 execargs_alloc(struct image_params *imgp)
3280 {
3281         kern_return_t kret;
3282         int i, cache_index = -1;
3283
3284         execargs_lock_lock();
3285
3286         while (execargs_free_count == 0) {
3287                 execargs_waiters++;
3288                 execargs_lock_sleep();
3289                 execargs_waiters--;
3290         }
3291
3292         execargs_free_count--;
3293
3294         for (i = 0; i < execargs_cache_size; i++) {
3295                 vm_offset_t element = execargs_cache[i];
3296                 if (element) {
3297                         cache_index = i;
3298                         imgp->ip_strings = (char *)(execargs_cache[i]);
3299                         execargs_cache[i] = 0;
3300                         break;
3301                 }
3302         }
3303
3304         assert(execargs_free_count >= 0);
3305
3306         execargs_lock_unlock();
3307
3308         if (cache_index == -1) {
3309                 kret = execargs_purgeable_allocate(&imgp->ip_strings);
3310         }
3311         else
3312                 kret = execargs_purgeable_reference(imgp->ip_strings);
3313
3314         assert(kret == KERN_SUCCESS);
3315         if (kret != KERN_SUCCESS) {
3316                 return (ENOMEM);
3317         }
3318
3319         imgp->ip_vdata = imgp->ip_strings + NCARGS;
3320
3321         return (0);
3322 }
3323
3324 /*
3325  * execargs_free
3326  *
3327  * Description: Free the block of memory used by the execve arguments and the
3328  *              first page of the executable by a previous call to the function
3329  *              execargs_alloc().
3330  *
3331  * Parameters:  struct image_params *   the image parameter block
3332  *
3333  * Returns:     0                       Success
3334  *              EINVAL                  Invalid argument
3335  *              EINTR                   Oeration interrupted
3336  */
3337 static int
3338 execargs_free(struct image_params *imgp)
3339 {
3340         kern_return_t kret;
3341         int i;
3342         boolean_t needs_wakeup = FALSE;
3343
3344         kret = execargs_purgeable_volatilize(imgp->ip_strings);
3345
3346         execargs_lock_lock();
3347         execargs_free_count++;
3348
3349         for (i = 0; i < execargs_cache_size; i++) {
3350                 vm_offset_t element = execargs_cache[i];
3351                 if (element == 0) {
3352                         execargs_cache[i] = (vm_offset_t) imgp->ip_strings;
3353                         imgp->ip_strings = NULL;
3354                         break;
3355                 }
3356         }
3357
3358         assert(imgp->ip_strings == NULL);
3359
3360         if (execargs_waiters > 0)
3361                 needs_wakeup = TRUE;
3362
3363         execargs_lock_unlock();
3364
3365         if (needs_wakeup == TRUE)
3366                 execargs_wakeup_waiters();
3367
3368         return ((kret == KERN_SUCCESS ? 0 : EINVAL));
3369 }
3370
3371 static void
3372 exec_resettextvp(proc_t p, struct image_params *imgp)
3373 {
3374         vnode_t vp;
3375         off_t offset;
3376         vnode_t tvp  = p->p_textvp;
3377         int ret;
3378
3379         vp = imgp->ip_vp;
3380         offset = imgp->ip_arch_offset;
3381
3382         if (vp == NULLVP)
3383                 panic("exec_resettextvp: expected valid vp");
3384
3385         ret = vnode_ref(vp);
3386         proc_lock(p);
3387         if (ret == 0) {
3388                 p->p_textvp = vp;
3389                 p->p_textoff = offset;
3390         } else {
3391                 p->p_textvp = NULLVP;   /* this is paranoia */
3392                 p->p_textoff = 0;
3393         }
3394         proc_unlock(p);
3395
3396         if ( tvp != NULLVP) {
3397                 if (vnode_getwithref(tvp) == 0) {
3398                         vnode_rele(tvp);
3399                         vnode_put(tvp);
3400                 }
3401         }
3402
3403 }
3404
3405 static int
3406 check_for_signature(proc_t p, struct image_params *imgp)
3407 {
3408         mach_port_t port = NULL;
3409         kern_return_t error = 0;
3410         unsigned char hash[SHA1_RESULTLEN];
3411
3412         /*
3413          * Override inherited code signing flags with the
3414          * ones for the process that is being successfully
3415          * loaded
3416          */
3417         proc_lock(p);
3418         p->p_csflags = imgp->ip_csflags;
3419         proc_unlock(p);
3420
3421         /* Set the switch_protect flag on the map */
3422         if(p->p_csflags & (CS_HARD|CS_KILL)) {
3423                 vm_map_switch_protect(get_task_map(p->task), TRUE);
3424         }
3425
3426         /*
3427          * If the task_access_port is set and the proc isn't signed,
3428          * ask for a code signature from user space. Fail the exec
3429          * if permission is denied.
3430          */
3431         error = task_get_task_access_port(p->task, &port);
3432         if (error == 0 && IPC_PORT_VALID(port) && !(p->p_csflags & CS_VALID)) {
3433                 error = find_code_signature(port, p->p_pid);
3434                 if (error == KERN_FAILURE) {
3435                         /* Make very sure execution fails */
3436                         psignal(p, SIGKILL);
3437                         return EACCES;
3438                 }
3439
3440                 /* Only do this if exec_resettextvp() did not fail */
3441                 if (p->p_textvp != NULLVP) {
3442                         /*
3443                          * If there's a new code directory, mark this process
3444                          * as signed.
3445                          */
3446                         error = ubc_cs_getcdhash(p->p_textvp, p->p_textoff, hash);
3447                         if (error == 0) {
3448                                 proc_lock(p);
3449                                 p->p_csflags |= CS_VALID;
3450                                 proc_unlock(p);
3451                         }
3452                 }
3453         }
3454
3455         return KERN_SUCCESS;
3456 }
3457