bsd/vfs/vfs_syscalls.c

   1 /*
   2  * Copyright (c) 1995-2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1989, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  * (c) UNIX System Laboratories, Inc.
  32  * All or some portions of this file are derived from material licensed
  33  * to the University of California by American Telephone and Telegraph
  34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  35  * the permission of UNIX System Laboratories, Inc.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. All advertising materials mentioning features or use of this software
  46  *    must display the following acknowledgement:
  47  *      This product includes software developed by the University of
  48  *      California, Berkeley and its contributors.
  49  * 4. Neither the name of the University nor the names of its contributors
  50  *    may be used to endorse or promote products derived from this software
  51  *    without specific prior written permission.
  52  *
  53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  63  * SUCH DAMAGE.
  64  *
  65  *      @(#)vfs_syscalls.c      8.41 (Berkeley) 6/15/95
  66  */
  67 /*
  68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  69  * support for mandatory and extensible security protections.  This notice
  70  * is included in support of clause 2.2 (b) of the Apple Public License,
  71  * Version 2.0.
  72  */
  73
  74 #include <sys/param.h>
  75 #include <sys/systm.h>
  76 #include <sys/namei.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/kernel.h>
  79 #include <sys/file_internal.h>
  80 #include <sys/stat.h>
  81 #include <sys/vnode_internal.h>
  82 #include <sys/mount_internal.h>
  83 #include <sys/proc_internal.h>
  84 #include <sys/kauth.h>
  85 #include <sys/uio_internal.h>
  86 #include <sys/malloc.h>
  87 #include <sys/mman.h>
  88 #include <sys/dirent.h>
  89 #include <sys/attr.h>
  90 #include <sys/sysctl.h>
  91 #include <sys/ubc.h>
  92 #include <sys/quota.h>
  93 #include <sys/kdebug.h>
  94 #include <sys/fsevents.h>
  95 #include <sys/imgsrc.h>
  96 #include <sys/sysproto.h>
  97 #include <sys/xattr.h>
  98 #include <sys/fcntl.h>
  99 #include <sys/fsctl.h>
 100 #include <sys/ubc_internal.h>
 101 #include <sys/disk.h>
 102 #include <sys/content_protection.h>
 103 #include <sys/priv.h>
 104 #include <machine/cons.h>
 105 #include <machine/limits.h>
 106 #include <miscfs/specfs/specdev.h>
 107
 108 #include <security/audit/audit.h>
 109 #include <bsm/audit_kevents.h>
 110
 111 #include <mach/mach_types.h>
 112 #include <kern/kern_types.h>
 113 #include <kern/kalloc.h>
 114 #include <kern/task.h>
 115
 116 #include <vm/vm_pageout.h>
 117
 118 #include <libkern/OSAtomic.h>
 119 #include <pexpert/pexpert.h>
 120 #include <IOKit/IOBSD.h>
 121
 122 #if ROUTEFS
 123 #include <miscfs/routefs/routefs.h>
 124 #endif /* ROUTEFS */
 125
 126 #if CONFIG_MACF
 127 #include <security/mac.h>
 128 #include <security/mac_framework.h>
 129 #endif
 130
 131 #if CONFIG_FSE
 132 #define GET_PATH(x) \
 133         (x) = get_pathbuff();
 134 #define RELEASE_PATH(x) \
 135         release_pathbuff(x);
 136 #else
 137 #define GET_PATH(x)     \
 138         MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
 139 #define RELEASE_PATH(x) \
 140         FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
 141 #endif /* CONFIG_FSE */
 142
 143 /* struct for checkdirs iteration */
 144 struct cdirargs {
 145         vnode_t olddp;
 146         vnode_t newdp;
 147 };
 148 /* callback  for checkdirs iteration */
 149 static int checkdirs_callback(proc_t p, void * arg);
 150
 151 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
 152 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
 153 void enablequotas(struct mount *mp, vfs_context_t ctx);
 154 static int getfsstat_callback(mount_t mp, void * arg);
 155 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
 156 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
 157 static int sync_callback(mount_t, void *);
 158 static void sync_thread(void *, __unused wait_result_t);
 159 static int sync_async(int);
 160 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
 161                         user_addr_t bufp, int *sizep, boolean_t is_64_bit,
 162                                                 boolean_t partial_copy);
 163 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
 164                         user_addr_t bufp);
 165 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
 166 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
 167                         struct componentname *cnp, user_addr_t fsmountargs,
 168                         int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
 169                         vfs_context_t ctx);
 170 void vfs_notify_mount(vnode_t pdvp);
 171
 172 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
 173
 174 struct fd_vn_data * fg_vn_data_alloc(void);
 175
 176 /*
 177  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
 178  * Concurrent lookups (or lookups by ids) on hard links can cause the
 179  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
 180  * does) to return ENOENT as the path cannot be returned from the name cache
 181  * alone. We have no option but to retry and hope to get one namei->reverse path
 182  * generation done without an intervening lookup, lookup by id on the hard link
 183  * item. This is only an issue for MAC hooks which cannot reenter the filesystem
 184  * which currently are the MAC hooks for rename, unlink and rmdir.
 185  */
 186 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
 187
 188 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
 189
 190 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
 191
 192 #ifdef CONFIG_IMGSRC_ACCESS
 193 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
 194 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
 195 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
 196 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
 197 static void mount_end_update(mount_t mp);
 198 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
 199 #endif /* CONFIG_IMGSRC_ACCESS */
 200
 201 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
 202
 203 __private_extern__
 204 int sync_internal(void);
 205
 206 __private_extern__
 207 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
 208
 209 extern lck_grp_t *fd_vn_lck_grp;
 210 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
 211 extern lck_attr_t *fd_vn_lck_attr;
 212
 213 /*
 214  * incremented each time a mount or unmount operation occurs
 215  * used to invalidate the cached value of the rootvp in the
 216  * mount structure utilized by cache_lookup_path
 217  */
 218 uint32_t mount_generation = 0;
 219
 220 /* counts number of mount and unmount operations */
 221 unsigned int vfs_nummntops=0;
 222
 223 extern const struct fileops vnops;
 224 #if CONFIG_APPLEDOUBLE
 225 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
 226 #endif /* CONFIG_APPLEDOUBLE */
 227
 228 typedef uint32_t vfs_rename_flags_t;
 229 #if CONFIG_SECLUDED_RENAME
 230 enum {
 231         VFS_SECLUDE_RENAME              = 0x00000001
 232 };
 233 #endif
 234
 235 /*
 236  * Virtual File System System Calls
 237  */
 238
 239 #if NFSCLIENT || DEVFS || ROUTEFS
 240 /*
  241  * Private in-kernel mounting SPI (built when NFSCLIENT, DEVFS or ROUTEFS is configured; not exported)
 242  */
 243  __private_extern__
 244 boolean_t
 245 vfs_iskernelmount(mount_t mp)
 246 {
 247         return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
 248 }
 249
 250  __private_extern__
 251 int
 252 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
 253              void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
 254 {
 255         struct nameidata nd;
 256         boolean_t did_namei;
 257         int error;
 258
 259         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
 260                UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
 261
 262         /*
 263          * Get the vnode to be covered if it's not supplied
 264          */
 265         if (vp == NULLVP) {
 266                 error = namei(&nd);
 267                 if (error)
 268                         return (error);
 269                 vp = nd.ni_vp;
 270                 pvp = nd.ni_dvp;
 271                 did_namei = TRUE;
 272         } else {
 273                 char *pnbuf = CAST_DOWN(char *, path);
 274
 275                 nd.ni_cnd.cn_pnbuf = pnbuf;
 276                 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
 277                 did_namei = FALSE;
 278         }
 279
 280         error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
 281                              syscall_flags, kern_flags, NULL, TRUE, ctx);
 282
 283         if (did_namei) {
 284                 vnode_put(vp);
 285                 vnode_put(pvp);
 286                 nameidone(&nd);
 287         }
 288
 289         return (error);
 290 }
  291 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
 292
 293 /*
 294  * Mount a file system.
 295  */
 296 /* ARGSUSED */
 297 int
 298 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
 299 {
 300         struct __mac_mount_args muap;
 301
 302         muap.type = uap->type;
 303         muap.path = uap->path;
 304         muap.flags = uap->flags;
 305         muap.data = uap->data;
 306         muap.mac_p = USER_ADDR_NULL;
 307         return (__mac_mount(p, &muap, retval));
 308 }
 309
 310 void
 311 vfs_notify_mount(vnode_t pdvp)
 312 {
 313         vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
 314         lock_vnode_and_post(pdvp, NOTE_WRITE);
 315 }
 316
 317 /*
 318  * __mac_mount:
 319  *      Mount a file system taking into account MAC label behavior.
 320  *      See mount(2) man page for more information
 321  *
 322  * Parameters:    p                        Process requesting the mount
 323  *                uap                      User argument descriptor (see below)
 324  *                retval                   (ignored)
 325  *
 326  * Indirect:      uap->type                Filesystem type
 327  *                uap->path                Path to mount
 328  *                uap->data                Mount arguments
 329  *                uap->mac_p               MAC info
 330  *                uap->flags               Mount flags
 331  *
 332  *
 333  * Returns:        0                       Success
 334  *                !0                       Not success
 335  */
 336 boolean_t root_fs_upgrade_try = FALSE;
 337
 338 int
 339 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
 340 {
 341         vnode_t pvp = NULL;
 342         vnode_t vp = NULL;
 343         int need_nameidone = 0;
 344         vfs_context_t ctx = vfs_context_current();
 345         char fstypename[MFSNAMELEN];
 346         struct nameidata nd;
 347         size_t dummy=0;
 348         char *labelstr = NULL;
 349         int flags = uap->flags;
 350         int error;
 351 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
 352         boolean_t is_64bit = IS_64BIT_PROCESS(p);
 353 #else
 354 #pragma unused(p)
 355 #endif
 356         /*
 357          * Get the fs type name from user space
 358          */
 359         error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
 360         if (error)
 361                 return (error);
 362
 363         /*
 364          * Get the vnode to be covered
 365          */
 366         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
 367                UIO_USERSPACE, uap->path, ctx);
 368         error = namei(&nd);
 369         if (error) {
 370                 goto out;
 371         }
 372         need_nameidone = 1;
 373         vp = nd.ni_vp;
 374         pvp = nd.ni_dvp;
 375
 376 #ifdef CONFIG_IMGSRC_ACCESS
 377         /* Mounting image source cannot be batched with other operations */
 378         if (flags == MNT_IMGSRC_BY_INDEX) {
 379                 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
 380                                                   ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
 381                 goto out;
 382         }
 383 #endif /* CONFIG_IMGSRC_ACCESS */
 384
 385 #if CONFIG_MACF
 386         /*
 387          * Get the label string (if any) from user space
 388          */
 389         if (uap->mac_p != USER_ADDR_NULL) {
 390                 struct user_mac mac;
 391                 size_t ulen = 0;
 392
 393                 if (is_64bit) {
 394                         struct user64_mac mac64;
 395                         error = copyin(uap->mac_p, &mac64, sizeof(mac64));
 396                         mac.m_buflen = mac64.m_buflen;
 397                         mac.m_string = mac64.m_string;
 398                 } else {
 399                         struct user32_mac mac32;
 400                         error = copyin(uap->mac_p, &mac32, sizeof(mac32));
 401                         mac.m_buflen = mac32.m_buflen;
 402                         mac.m_string = mac32.m_string;
 403                 }
 404                 if (error)
 405                         goto out;
 406                 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
 407                     (mac.m_buflen < 2)) {
 408                         error = EINVAL;
 409                         goto out;
 410                 }
 411                 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
 412                 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
 413                 if (error) {
 414                         goto out;
 415                 }
 416                 AUDIT_ARG(mac_string, labelstr);
 417         }
 418 #endif /* CONFIG_MACF */
 419
 420         AUDIT_ARG(fflags, flags);
 421
 422 #if SECURE_KERNEL
 423         if (flags & MNT_UNION) {
 424                 /* No union mounts on release kernels */
 425                 error = EPERM;
 426                 goto out;
 427         }
 428 #endif
 429
 430         if ((vp->v_flag & VROOT) &&
 431                         (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
 432                 if (!(flags & MNT_UNION)) {
 433                         flags |= MNT_UPDATE;
 434                 }
 435                 else {
 436                         /*
 437                          * For a union mount on '/', treat it as fresh
 438                          * mount instead of update.
 439                          * Otherwise, union mouting on '/' used to panic the
 440                          * system before, since mnt_vnodecovered was found to
 441                          * be NULL for '/' which is required for unionlookup
 442                          * after it gets ENOENT on union mount.
 443                          */
 444                         flags = (flags & ~(MNT_UPDATE));
 445                 }
 446
 447 #if SECURE_KERNEL
 448                 if ((flags & MNT_RDONLY) == 0) {
 449                         /* Release kernels are not allowed to mount "/" as rw */
 450                         error = EPERM;
 451                         goto out;
 452                 }
 453 #endif
 454                 /*
 455                  * See 7392553 for more details on why this check exists.
 456                  * Suffice to say: If this check is ON and something tries
 457                  * to mount the rootFS RW, we'll turn off the codesign
 458                  * bitmap optimization.
 459                  */
 460 #if CHECK_CS_VALIDATION_BITMAP
 461                 if ((flags & MNT_RDONLY) == 0 ) {
 462                         root_fs_upgrade_try = TRUE;
 463                 }
 464 #endif
 465         }
 466
 467         error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
 468                              labelstr, FALSE, ctx);
 469
 470 out:
 471
 472 #if CONFIG_MACF
 473         if (labelstr)
 474                 FREE(labelstr, M_MACTEMP);
 475 #endif /* CONFIG_MACF */
 476
 477         if (vp) {
 478                 vnode_put(vp);
 479         }
 480         if (pvp) {
 481                 vnode_put(pvp);
 482         }
 483         if (need_nameidone) {
 484                 nameidone(&nd);
 485         }
 486
 487         return (error);
 488 }
 489
 490 /*
 491  * common mount implementation (final stage of mounting)
  492  *
  493  * Arguments:
  494  *  fstypename  file system type (i.e. its vfs name)
  495  *  pvp         parent of covered vnode
  496  *  vp          covered vnode
  497  *  cnp         component name (i.e. path) of covered vnode
 498  *  flags       generic mount flags
 499  *  fsmountargs file system specific data
 500  *  labelstr    optional MAC label
 501  *  kernelmount TRUE for mounts initiated from inside the kernel
 502  *  ctx         caller's context
 503  */
 504 static int
 505 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
 506              struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
 507              char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
 508 {
 509 #if !CONFIG_MACF
 510 #pragma unused(labelstr)
 511 #endif
 512         struct vnode *devvp = NULLVP;
 513         struct vnode *device_vnode = NULLVP;
 514 #if CONFIG_MACF
 515         struct vnode *rvp;
 516 #endif
 517         struct mount *mp;
 518         struct vfstable *vfsp = (struct vfstable *)0;
 519         struct proc *p = vfs_context_proc(ctx);
 520         int error, flag = 0;
 521         user_addr_t devpath = USER_ADDR_NULL;
 522         int ronly = 0;
 523         int mntalloc = 0;
 524         boolean_t vfsp_ref = FALSE;
 525         boolean_t is_rwlock_locked = FALSE;
 526         boolean_t did_rele = FALSE;
 527         boolean_t have_usecount = FALSE;
 528
 529         /*
 530          * Process an update for an existing mount
 531          */
 532         if (flags & MNT_UPDATE) {
 533                 if ((vp->v_flag & VROOT) == 0) {
 534                         error = EINVAL;
 535                         goto out1;
 536                 }
 537                 mp = vp->v_mount;
 538
 539                 /* unmount in progress return error */
 540                 mount_lock_spin(mp);
 541                 if (mp->mnt_lflag & MNT_LUNMOUNT) {
 542                         mount_unlock(mp);
 543                         error = EBUSY;
 544                         goto out1;
 545                 }
 546                 mount_unlock(mp);
 547                 lck_rw_lock_exclusive(&mp->mnt_rwlock);
 548                 is_rwlock_locked = TRUE;
 549                 /*
 550                  * We only allow the filesystem to be reloaded if it
 551                  * is currently mounted read-only.
 552                  */
 553                 if ((flags & MNT_RELOAD) &&
 554                     ((mp->mnt_flag & MNT_RDONLY) == 0)) {
 555                         error = ENOTSUP;
 556                         goto out1;
 557                 }
 558
 559                 /*
 560                  * If content protection is enabled, update mounts are not
 561                  * allowed to turn it off.
 562                  */
 563                 if ((mp->mnt_flag & MNT_CPROTECT) &&
 564                            ((flags & MNT_CPROTECT) == 0)) {
 565                         error = EINVAL;
 566                         goto out1;
 567                 }
 568
 569 #ifdef CONFIG_IMGSRC_ACCESS
 570                 /* Can't downgrade the backer of the root FS */
 571                 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
 572                         (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
 573                         error = ENOTSUP;
 574                         goto out1;
 575                 }
 576 #endif /* CONFIG_IMGSRC_ACCESS */
 577
 578                 /*
 579                  * Only root, or the user that did the original mount is
 580                  * permitted to update it.
 581                  */
 582                 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
 583                     (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
 584                         goto out1;
 585                 }
 586 #if CONFIG_MACF
 587                 error = mac_mount_check_remount(ctx, mp);
 588                 if (error != 0) {
 589                         goto out1;
 590                 }
 591 #endif
 592                 /*
 593                  * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
 594                  * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
 595                  */
 596                 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
 597                         flags |= MNT_NOSUID | MNT_NODEV;
 598                         if (mp->mnt_flag & MNT_NOEXEC)
 599                                 flags |= MNT_NOEXEC;
 600                 }
 601                 flag = mp->mnt_flag;
 602
 603
 604
 605                 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
 606
 607                 vfsp = mp->mnt_vtable;
 608                 goto update;
 609         }
 610         /*
 611          * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
 612          * MNT_NOEXEC if mount point is already MNT_NOEXEC.
 613          */
 614         if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
 615                 flags |= MNT_NOSUID | MNT_NODEV;
 616                 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
 617                         flags |= MNT_NOEXEC;
 618         }
 619
 620         /* XXXAUDIT: Should we capture the type on the error path as well? */
 621         AUDIT_ARG(text, fstypename);
 622         mount_list_lock();
 623         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 624                 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
 625                         vfsp->vfc_refcount++;
 626                         vfsp_ref = TRUE;
 627                         break;
 628                 }
 629         mount_list_unlock();
 630         if (vfsp == NULL) {
 631                 error = ENODEV;
 632                 goto out1;
 633         }
 634
 635         /*
 636          * VFC_VFSLOCALARGS is not currently supported for kernel mounts
 637          */
 638         if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
 639                 error = EINVAL;  /* unsupported request */
 640                 goto out1;
 641         }
 642
 643         error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
 644         if (error != 0) {
 645                 goto out1;
 646         }
 647
 648         /*
 649          * Allocate and initialize the filesystem (mount_t)
 650          */
 651         MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
 652                 M_MOUNT, M_WAITOK);
 653         bzero((char *)mp, (u_int32_t)sizeof(struct mount));
 654         mntalloc = 1;
 655
 656         /* Initialize the default IO constraints */
 657         mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
 658         mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
 659         mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
 660         mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
 661         mp->mnt_devblocksize = DEV_BSIZE;
 662         mp->mnt_alignmentmask = PAGE_MASK;
 663         mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
 664         mp->mnt_ioscale = 1;
 665         mp->mnt_ioflags = 0;
 666         mp->mnt_realrootvp = NULLVP;
 667         mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
 668
 669         TAILQ_INIT(&mp->mnt_vnodelist);
 670         TAILQ_INIT(&mp->mnt_workerqueue);
 671         TAILQ_INIT(&mp->mnt_newvnodes);
 672         mount_lock_init(mp);
 673         lck_rw_lock_exclusive(&mp->mnt_rwlock);
 674         is_rwlock_locked = TRUE;
 675         mp->mnt_op = vfsp->vfc_vfsops;
 676         mp->mnt_vtable = vfsp;
 677         //mp->mnt_stat.f_type = vfsp->vfc_typenum;
 678         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 679         strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
 680         strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
 681         mp->mnt_vnodecovered = vp;
 682         mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
 683         mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
 684         mp->mnt_devbsdunit = 0;
 685
 686         /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
 687         vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
 688
 689 #if NFSCLIENT || DEVFS || ROUTEFS
 690         if (kernelmount)
 691                 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
 692         if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
 693                 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
 694 #endif /* NFSCLIENT || DEVFS */
 695
 696 update:
 697         /*
 698          * Set the mount level flags.
 699          */
 700         if (flags & MNT_RDONLY)
 701                 mp->mnt_flag |= MNT_RDONLY;
 702         else if (mp->mnt_flag & MNT_RDONLY) {
 703                 // disallow read/write upgrades of file systems that
 704                 // had the TYPENAME_OVERRIDE feature set.
 705                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
 706                         error = EPERM;
 707                         goto out1;
 708                 }
 709                 mp->mnt_kern_flag |= MNTK_WANTRDWR;
 710         }
 711         mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 712                           MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
 713                           MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
 714                           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
 715                           MNT_QUARANTINE | MNT_CPROTECT);
 716         mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 717                                  MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
 718                                  MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
 719                                  MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
 720                                  MNT_QUARANTINE | MNT_CPROTECT);
 721
 722 #if CONFIG_MACF
 723         if (flags & MNT_MULTILABEL) {
 724                 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
 725                         error = EINVAL;
 726                         goto out1;
 727                 }
 728                 mp->mnt_flag |= MNT_MULTILABEL;
 729         }
 730 #endif
 731         /*
 732          * Process device path for local file systems if requested
 733          */
 734         if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
 735                 if (vfs_context_is64bit(ctx)) {
 736                         if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
 737                                 goto out1;
 738                         fsmountargs += sizeof(devpath);
 739                 } else {
 740                         user32_addr_t tmp;
 741                         if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
 742                                 goto out1;
 743                         /* munge into LP64 addr */
 744                         devpath = CAST_USER_ADDR_T(tmp);
 745                         fsmountargs += sizeof(tmp);
 746                 }
 747
 748                 /* Lookup device and authorize access to it */
 749                 if ((devpath)) {
 750                         struct nameidata nd;
 751
 752                         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
 753                         if ( (error = namei(&nd)) )
 754                                 goto out1;
 755
 756                         strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
 757                         devvp = nd.ni_vp;
 758
 759                         nameidone(&nd);
 760
 761                         if (devvp->v_type != VBLK) {
 762                                 error = ENOTBLK;
 763                                 goto out2;
 764                         }
 765                         if (major(devvp->v_rdev) >= nblkdev) {
 766                                 error = ENXIO;
 767                                 goto out2;
 768                         }
 769                         /*
 770                         * If mount by non-root, then verify that user has necessary
 771                         * permissions on the device.
 772                         */
 773                         if (suser(vfs_context_ucred(ctx), NULL) != 0) {
 774                                 mode_t accessmode = KAUTH_VNODE_READ_DATA;
 775
 776                                 if ((mp->mnt_flag & MNT_RDONLY) == 0)
 777                                         accessmode |= KAUTH_VNODE_WRITE_DATA;
 778                                 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
 779                                         goto out2;
 780                         }
 781                 }
 782                 /* On first mount, preflight and open device */
 783                 if (devpath && ((flags & MNT_UPDATE) == 0)) {
 784                         if ( (error = vnode_ref(devvp)) )
 785                                 goto out2;
 786                         /*
 787                         * Disallow multiple mounts of the same device.
 788                         * Disallow mounting of a device that is currently in use
 789                         * (except for root, which might share swap device for miniroot).
 790                         * Flush out any old buffers remaining from a previous use.
 791                         */
 792                         if ( (error = vfs_mountedon(devvp)) )
 793                                 goto out3;
 794
 795                         if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
 796                                 error = EBUSY;
 797                                 goto out3;
 798                         }
 799                         if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
 800                                 error = ENOTBLK;
 801                                 goto out3;
 802                         }
 803                         if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
 804                                 goto out3;
 805
 806                         ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 807 #if CONFIG_MACF
 808                         error = mac_vnode_check_open(ctx,
 809                             devvp,
 810                             ronly ? FREAD : FREAD|FWRITE);
 811                         if (error)
 812                                 goto out3;
 813 #endif /* MAC */
 814                         if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
 815                                 goto out3;
 816
 817                         mp->mnt_devvp = devvp;
 818                         device_vnode = devvp;
 819
 820                 } else if ((mp->mnt_flag & MNT_RDONLY) &&
 821                            (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
 822                            (device_vnode = mp->mnt_devvp)) {
 823                         dev_t dev;
 824                         int maj;
 825                         /*
 826                          * If upgrade to read-write by non-root, then verify
 827                          * that user has necessary permissions on the device.
 828                          */
 829                         vnode_getalways(device_vnode);
 830
 831                         if (suser(vfs_context_ucred(ctx), NULL) &&
 832                             (error = vnode_authorize(device_vnode, NULL,
 833                              KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
 834                              ctx)) != 0) {
 835                                 vnode_put(device_vnode);
 836                                 goto out2;
 837                         }
 838
 839                         /* Tell the device that we're upgrading */
 840                         dev = (dev_t)device_vnode->v_rdev;
 841                         maj = major(dev);
 842
 843                         if ((u_int)maj >= (u_int)nblkdev)
 844                                 panic("Volume mounted on a device with invalid major number.");
 845
 846                         error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
 847                         vnode_put(device_vnode);
 848                         device_vnode = NULLVP;
 849                         if (error != 0) {
 850                                 goto out2;
 851                         }
 852                 }
 853         }
 854 #if CONFIG_MACF
 855         if ((flags & MNT_UPDATE) == 0) {
 856                 mac_mount_label_init(mp);
 857                 mac_mount_label_associate(ctx, mp);
 858         }
 859         if (labelstr) {
 860                 if ((flags & MNT_UPDATE) != 0) {
 861                         error = mac_mount_check_label_update(ctx, mp);
 862                         if (error != 0)
 863                                 goto out3;
 864                 }
 865         }
 866 #endif
 867         /*
 868          * Mount the filesystem.
 869          */
 870         error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
 871
 872         if (flags & MNT_UPDATE) {
 873                 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
 874                         mp->mnt_flag &= ~MNT_RDONLY;
 875                 mp->mnt_flag &=~
 876                     (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
 877                 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
 878                 if (error)
 879                         mp->mnt_flag = flag;  /* restore flag value */
 880                 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
 881                 lck_rw_done(&mp->mnt_rwlock);
 882                 is_rwlock_locked = FALSE;
 883                 if (!error)
 884                         enablequotas(mp, ctx);
 885                 goto exit;
 886         }
 887
 888         /*
 889          * Put the new filesystem on the mount list after root.
 890          */
 891         if (error == 0) {
 892                 struct vfs_attr vfsattr;
 893 #if CONFIG_MACF
 894                 if (vfs_flags(mp) & MNT_MULTILABEL) {
 895                         error = VFS_ROOT(mp, &rvp, ctx);
 896                         if (error) {
 897                                 printf("%s() VFS_ROOT returned %d\n", __func__, error);
 898                                 goto out3;
 899                         }
 900                         error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
 901                         /*
 902                          * drop reference provided by VFS_ROOT
 903                          */
 904                         vnode_put(rvp);
 905
 906                         if (error)
 907                                 goto out3;
 908                 }
 909 #endif  /* MAC */
 910
 911                 vnode_lock_spin(vp);
 912                 CLR(vp->v_flag, VMOUNT);
 913                 vp->v_mountedhere = mp;
 914                 vnode_unlock(vp);
 915
 916                 /*
 917                  * taking the name_cache_lock exclusively will
 918                  * insure that everyone is out of the fast path who
 919                  * might be trying to use a now stale copy of
 920                  * vp->v_mountedhere->mnt_realrootvp
 921                  * bumping mount_generation causes the cached values
 922                  * to be invalidated
 923                  */
 924                 name_cache_lock();
 925                 mount_generation++;
 926                 name_cache_unlock();
 927
 928                 error = vnode_ref(vp);
 929                 if (error != 0) {
 930                         goto out4;
 931                 }
 932
 933                 have_usecount = TRUE;
 934
 935                 error = checkdirs(vp, ctx);
 936                 if (error != 0)  {
 937                         /* Unmount the filesystem as cdir/rdirs cannot be updated */
 938                         goto out4;
 939                 }
 940                 /*
 941                  * there is no cleanup code here so I have made it void
 942                  * we need to revisit this
 943                  */
 944                 (void)VFS_START(mp, 0, ctx);
 945
 946                 if (mount_list_add(mp) != 0) {
 947                         /*
 948                          * The system is shutting down trying to umount
 949                          * everything, so fail with a plausible errno.
 950                          */
 951                         error = EBUSY;
 952                         goto out4;
 953                 }
 954                 lck_rw_done(&mp->mnt_rwlock);
 955                 is_rwlock_locked = FALSE;
 956
 957                 /* Check if this mounted file system supports EAs or named streams. */
 958                 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
 959                 VFSATTR_INIT(&vfsattr);
 960                 VFSATTR_WANTED(&vfsattr, f_capabilities);
 961                 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
 962                     vfs_getattr(mp, &vfsattr, ctx) == 0 &&
 963                     VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
 964                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
 965                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
 966                                 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
 967                         }
 968 #if NAMEDSTREAMS
 969                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
 970                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
 971                                 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
 972                         }
 973 #endif
 974                         /* Check if this file system supports path from id lookups. */
 975                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
 976                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
 977                                 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
 978                         } else if (mp->mnt_flag & MNT_DOVOLFS) {
 979                                 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
 980                                 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
 981                         }
 982                 }
 983                 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
 984                         mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
 985                 }
 986                 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
 987                         mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
 988                 }
 989                 /* increment the operations count */
 990                 OSAddAtomic(1, &vfs_nummntops);
 991                 enablequotas(mp, ctx);
 992
 993                 if (device_vnode) {
 994                         device_vnode->v_specflags |= SI_MOUNTEDON;
 995
 996                         /*
 997                          *   cache the IO attributes for the underlying physical media...
 998                          *   an error return indicates the underlying driver doesn't
 999                          *   support all the queries necessary... however, reasonable
1000                          *   defaults will have been set, so no reason to bail or care
1001                          */
1002                         vfs_init_io_attributes(device_vnode, mp);
1003                 }
1004
1005                 /* Now that mount is setup, notify the listeners */
1006                 vfs_notify_mount(pvp);
1007                 IOBSDMountChange(mp, kIOMountChangeMount);
1008
1009         } else {
1010                 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1011                 if (mp->mnt_vnodelist.tqh_first != NULL) {
1012                         panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1013                                         mp->mnt_vtable->vfc_name, error);
1014                 }
1015
1016                 vnode_lock_spin(vp);
1017                 CLR(vp->v_flag, VMOUNT);
1018                 vnode_unlock(vp);
1019                 mount_list_lock();
1020                 mp->mnt_vtable->vfc_refcount--;
1021                 mount_list_unlock();
1022
1023                 if (device_vnode ) {
1024                         vnode_rele(device_vnode);
1025                         VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1026                 }
1027                 lck_rw_done(&mp->mnt_rwlock);
1028                 is_rwlock_locked = FALSE;
1029
1030                 /*
1031                  * if we get here, we have a mount structure that needs to be freed,
1032                  * but since the coveredvp hasn't yet been updated to point at it,
1033                  * no need to worry about other threads holding a crossref on this mp
1034                  * so it's ok to just free it
1035                  */
1036                 mount_lock_destroy(mp);
1037 #if CONFIG_MACF
1038                 mac_mount_label_destroy(mp);
1039 #endif
1040                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1041         }
1042 exit:
1043         /*
1044          * drop I/O count on the device vp if there was one
1045          */
1046         if (devpath && devvp)
1047                 vnode_put(devvp);
1048
1049         return(error);
1050
1051 /* Error condition exits */
1052 out4:
1053         (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1054
1055         /*
1056          * If the mount has been placed on the covered vp,
1057          * it may have been discovered by now, so we have
1058          * to treat this just like an unmount
1059          */
1060         mount_lock_spin(mp);
1061         mp->mnt_lflag |= MNT_LDEAD;
1062         mount_unlock(mp);
1063
1064         if (device_vnode != NULLVP) {
1065                 vnode_rele(device_vnode);
1066                 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1067                        ctx);
1068                 did_rele = TRUE;
1069         }
1070
1071         vnode_lock_spin(vp);
1072
1073         mp->mnt_crossref++;
1074         vp->v_mountedhere = (mount_t) 0;
1075
1076         vnode_unlock(vp);
1077
1078         if (have_usecount) {
1079                 vnode_rele(vp);
1080         }
1081 out3:
1082         if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1083                 vnode_rele(devvp);
1084 out2:
1085         if (devpath && devvp)
1086                 vnode_put(devvp);
1087 out1:
1088         /* Release mnt_rwlock only when it was taken */
1089         if (is_rwlock_locked == TRUE) {
1090                 lck_rw_done(&mp->mnt_rwlock);
1091         }
1092
1093         if (mntalloc) {
1094                 if (mp->mnt_crossref)
1095                         mount_dropcrossref(mp, vp, 0);
1096                 else {
1097                         mount_lock_destroy(mp);
1098 #if CONFIG_MACF
1099                         mac_mount_label_destroy(mp);
1100 #endif
1101                         FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1102                 }
1103         }
1104         if (vfsp_ref) {
1105                 mount_list_lock();
1106                 vfsp->vfc_refcount--;
1107                 mount_list_unlock();
1108         }
1109
1110         return(error);
1111 }
1112
1113 /*
1114  * Flush in-core data, check for competing mount attempts,
1115  * and set VMOUNT
1116  */
1117 int
1118 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1119 {
1120 #if !CONFIG_MACF
1121 #pragma unused(cnp,fsname)
1122 #endif
1123         struct vnode_attr va;
1124         int error;
1125
1126         if (!skip_auth) {
1127                 /*
1128                  * If the user is not root, ensure that they own the directory
1129                  * onto which we are attempting to mount.
1130                  */
1131                 VATTR_INIT(&va);
1132                 VATTR_WANTED(&va, va_uid);
1133                 if ((error = vnode_getattr(vp, &va, ctx)) ||
1134                                 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1135                                  (!vfs_context_issuser(ctx)))) {
1136                         error = EPERM;
1137                         goto out;
1138                 }
1139         }
1140
1141         if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1142                 goto out;
1143
1144         if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1145                 goto out;
1146
1147         if (vp->v_type != VDIR) {
1148                 error = ENOTDIR;
1149                 goto out;
1150         }
1151
1152         if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1153                 error = EBUSY;
1154                 goto out;
1155         }
1156
1157 #if CONFIG_MACF
1158         error = mac_mount_check_mount(ctx, vp,
1159             cnp, fsname);
1160         if (error != 0)
1161                 goto out;
1162 #endif
1163
1164         vnode_lock_spin(vp);
1165         SET(vp->v_flag, VMOUNT);
1166         vnode_unlock(vp);
1167
1168 out:
1169         return error;
1170 }
1171
1172 #if CONFIG_IMGSRC_ACCESS
1173
1174 #if DEBUG
1175 #define IMGSRC_DEBUG(args...) printf(args)
1176 #else
1177 #define IMGSRC_DEBUG(args...) do { } while(0)
1178 #endif
1179
1180 static int
1181 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1182 {
1183         struct nameidata nd;
1184         vnode_t vp, realdevvp;
1185         mode_t accessmode;
1186         int error;
1187
1188         NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1189         if ( (error = namei(&nd)) ) {
1190                 IMGSRC_DEBUG("namei() failed with %d\n", error);
1191                 return error;
1192         }
1193
1194         vp = nd.ni_vp;
1195
1196         if (!vnode_isblk(vp)) {
1197                 IMGSRC_DEBUG("Not block device.\n");
1198                 error = ENOTBLK;
1199                 goto out;
1200         }
1201
1202         realdevvp = mp->mnt_devvp;
1203         if (realdevvp == NULLVP) {
1204                 IMGSRC_DEBUG("No device backs the mount.\n");
1205                 error = ENXIO;
1206                 goto out;
1207         }
1208
1209         error = vnode_getwithref(realdevvp);
1210         if (error != 0) {
1211                 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1212                 goto out;
1213         }
1214
1215         if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1216                 IMGSRC_DEBUG("Wrong dev_t.\n");
1217                 error = ENXIO;
1218                 goto out1;
1219         }
1220
1221         strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1222
1223         /*
1224          * If mount by non-root, then verify that user has necessary
1225          * permissions on the device.
1226          */
1227         if (!vfs_context_issuser(ctx)) {
1228                 accessmode = KAUTH_VNODE_READ_DATA;
1229                 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1230                         accessmode |= KAUTH_VNODE_WRITE_DATA;
1231                 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1232                         IMGSRC_DEBUG("Access denied.\n");
1233                         goto out1;
1234                 }
1235         }
1236
1237         *devvpp = vp;
1238
1239 out1:
1240         vnode_put(realdevvp);
1241 out:
1242         nameidone(&nd);
1243         if (error) {
1244                 vnode_put(vp);
1245         }
1246
1247         return error;
1248 }
1249
1250 /*
1251  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1252  * and call checkdirs()
1253  */
1254 static int
1255 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1256 {
1257         int error;
1258
1259         mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1260
1261         vnode_lock_spin(vp);
1262         CLR(vp->v_flag, VMOUNT);
1263         vp->v_mountedhere = mp;
1264         vnode_unlock(vp);
1265
1266         /*
1267          * taking the name_cache_lock exclusively will
1268          * insure that everyone is out of the fast path who
1269          * might be trying to use a now stale copy of
1270          * vp->v_mountedhere->mnt_realrootvp
1271          * bumping mount_generation causes the cached values
1272          * to be invalidated
1273          */
1274         name_cache_lock();
1275         mount_generation++;
1276         name_cache_unlock();
1277
1278         error = vnode_ref(vp);
1279         if (error != 0) {
1280                 goto out;
1281         }
1282
1283         error = checkdirs(vp, ctx);
1284         if (error != 0)  {
1285                 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1286                 vnode_rele(vp);
1287                 goto out;
1288         }
1289
1290 out:
1291         if (error != 0) {
1292                 mp->mnt_vnodecovered = NULLVP;
1293         }
1294         return error;
1295 }
1296
1297 static void
1298 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1299 {
1300         vnode_rele(vp);
1301         vnode_lock_spin(vp);
1302         vp->v_mountedhere = (mount_t)NULL;
1303         vnode_unlock(vp);
1304
1305         mp->mnt_vnodecovered = NULLVP;
1306 }
1307
1308 static int
1309 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1310 {
1311         int error;
1312
1313         /* unmount in progress return error */
1314         mount_lock_spin(mp);
1315         if (mp->mnt_lflag & MNT_LUNMOUNT) {
1316                 mount_unlock(mp);
1317                 return EBUSY;
1318         }
1319         mount_unlock(mp);
1320         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1321
1322         /*
1323          * We only allow the filesystem to be reloaded if it
1324          * is currently mounted read-only.
1325          */
1326         if ((flags & MNT_RELOAD) &&
1327                         ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1328                 error = ENOTSUP;
1329                 goto out;
1330         }
1331
1332         /*
1333          * Only root, or the user that did the original mount is
1334          * permitted to update it.
1335          */
1336         if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1337                         (!vfs_context_issuser(ctx))) {
1338                 error = EPERM;
1339                 goto out;
1340         }
1341 #if CONFIG_MACF
1342         error = mac_mount_check_remount(ctx, mp);
1343         if (error != 0) {
1344                 goto out;
1345         }
1346 #endif
1347
1348 out:
1349         if (error) {
1350                 lck_rw_done(&mp->mnt_rwlock);
1351         }
1352
1353         return error;
1354 }
1355
1356 static void
1357 mount_end_update(mount_t mp)
1358 {
1359         lck_rw_done(&mp->mnt_rwlock);
1360 }
1361
1362 static int
1363 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1364 {
1365         vnode_t vp;
1366
1367         if (height >= MAX_IMAGEBOOT_NESTING) {
1368                 return EINVAL;
1369         }
1370
1371         vp = imgsrc_rootvnodes[height];
1372         if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1373                 *rvpp = vp;
1374                 return 0;
1375         } else {
1376                 return ENOENT;
1377         }
1378 }
1379
1380 static int
1381 relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1382                 const char *fsname, vfs_context_t ctx,
1383                 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1384 {
1385         int error;
1386         mount_t mp;
1387         boolean_t placed = FALSE;
1388         vnode_t devvp = NULLVP;
1389         struct vfstable *vfsp;
1390         user_addr_t devpath;
1391         char *old_mntonname;
1392         vnode_t rvp;
1393         uint32_t height;
1394         uint32_t flags;
1395
1396         /* If we didn't imageboot, nothing to move */
1397         if (imgsrc_rootvnodes[0] == NULLVP) {
1398                 return EINVAL;
1399         }
1400
1401         /* Only root can do this */
1402         if (!vfs_context_issuser(ctx)) {
1403                 return EPERM;
1404         }
1405
1406         IMGSRC_DEBUG("looking for root vnode.\n");
1407
1408         /*
1409          * Get root vnode of filesystem we're moving.
1410          */
1411         if (by_index) {
1412                 if (is64bit) {
1413                         struct user64_mnt_imgsrc_args mia64;
1414                         error = copyin(fsmountargs, &mia64, sizeof(mia64));
1415                         if (error != 0) {
1416                                 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1417                                 return error;
1418                         }
1419
1420                         height = mia64.mi_height;
1421                         flags = mia64.mi_flags;
1422                         devpath = mia64.mi_devpath;
1423                 } else {
1424                         struct user32_mnt_imgsrc_args mia32;
1425                         error = copyin(fsmountargs, &mia32, sizeof(mia32));
1426                         if (error != 0) {
1427                                 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1428                                 return error;
1429                         }
1430
1431                         height = mia32.mi_height;
1432                         flags = mia32.mi_flags;
1433                         devpath = mia32.mi_devpath;
1434                 }
1435         } else {
1436                 /*
1437                  * For binary compatibility--assumes one level of nesting.
1438                  */
1439                 if (is64bit) {
1440                         if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1441                                 return error;
1442                 } else {
1443                         user32_addr_t tmp;
1444                         if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1445                                 return error;
1446
1447                         /* munge into LP64 addr */
1448                         devpath = CAST_USER_ADDR_T(tmp);
1449                 }
1450
1451                 height = 0;
1452                 flags = 0;
1453         }
1454
1455         if (flags != 0) {
1456                 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1457                 return EINVAL;
1458         }
1459
1460         error = get_imgsrc_rootvnode(height, &rvp);
1461         if (error != 0) {
1462                 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1463                 return error;
1464         }
1465
1466         IMGSRC_DEBUG("got root vnode.\n");
1467
1468         MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1469
1470         /* Can only move once */
1471         mp = vnode_mount(rvp);
1472         if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1473                 IMGSRC_DEBUG("Already moved.\n");
1474                 error = EBUSY;
1475                 goto out0;
1476         }
1477
1478         IMGSRC_DEBUG("Starting updated.\n");
1479
1480         /* Get exclusive rwlock on mount, authorize update on mp */
1481         error = mount_begin_update(mp , ctx, 0);
1482         if (error != 0) {
1483                 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1484                 goto out0;
1485         }
1486
1487         /*
1488          * It can only be moved once.  Flag is set under the rwlock,
1489          * so we're now safe to proceed.
1490          */
1491         if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1492                 IMGSRC_DEBUG("Already moved [2]\n");
1493                 goto out1;
1494         }
1495
1496
1497         IMGSRC_DEBUG("Preparing coveredvp.\n");
1498
1499         /* Mark covered vnode as mount in progress, authorize placing mount on top */
1500         error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1501         if (error != 0) {
1502                 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1503                 goto out1;
1504         }
1505
1506         IMGSRC_DEBUG("Covered vp OK.\n");
1507
1508         /* Sanity check the name caller has provided */
1509         vfsp = mp->mnt_vtable;
1510         if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1511                 IMGSRC_DEBUG("Wrong fs name.\n");
1512                 error = EINVAL;
1513                 goto out2;
1514         }
1515
1516         /* Check the device vnode and update mount-from name, for local filesystems */
1517         if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1518                 IMGSRC_DEBUG("Local, doing device validation.\n");
1519
1520                 if (devpath != USER_ADDR_NULL) {
1521                         error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1522                         if (error) {
1523                                 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1524                                 goto out2;
1525                         }
1526
1527                         vnode_put(devvp);
1528                 }
1529         }
1530
1531         /*
1532          * Place mp on top of vnode, ref the vnode,  call checkdirs(),
1533          * and increment the name cache's mount generation
1534          */
1535
1536         IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1537         error = place_mount_and_checkdirs(mp, vp, ctx);
1538         if (error != 0) {
1539                 goto out2;
1540         }
1541
1542         placed = TRUE;
1543
1544         strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1545         strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1546
1547         /* Forbid future moves */
1548         mount_lock(mp);
1549         mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1550         mount_unlock(mp);
1551
1552         /* Finally, add to mount list, completely ready to go */
1553         if (mount_list_add(mp) != 0) {
1554                 /*
1555                  * The system is shutting down trying to umount
1556                  * everything, so fail with a plausible errno.
1557                  */
1558                 error = EBUSY;
1559                 goto out3;
1560         }
1561
1562         mount_end_update(mp);
1563         vnode_put(rvp);
1564         FREE(old_mntonname, M_TEMP);
1565
1566         vfs_notify_mount(pvp);
1567
1568         return 0;
1569 out3:
1570         strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1571
1572         mount_lock(mp);
1573         mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1574         mount_unlock(mp);
1575
1576 out2:
1577         /*
1578          * Placing the mp on the vnode clears VMOUNT,
1579          * so cleanup is different after that point
1580          */
1581         if (placed) {
1582                 /* Rele the vp, clear VMOUNT and v_mountedhere */
1583                 undo_place_on_covered_vp(mp, vp);
1584         } else {
1585                 vnode_lock_spin(vp);
1586                 CLR(vp->v_flag, VMOUNT);
1587                 vnode_unlock(vp);
1588         }
1589 out1:
1590         mount_end_update(mp);
1591
1592 out0:
1593         vnode_put(rvp);
1594         FREE(old_mntonname, M_TEMP);
1595         return error;
1596 }
1597
1598 #endif /* CONFIG_IMGSRC_ACCESS */
1599
1600 void
1601 enablequotas(struct mount *mp, vfs_context_t ctx)
1602 {
1603         struct nameidata qnd;
1604         int type;
1605         char qfpath[MAXPATHLEN];
1606         const char *qfname = QUOTAFILENAME;
1607         const char *qfopsname = QUOTAOPSNAME;
1608         const char *qfextension[] = INITQFNAMES;
1609
1610         /* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
1611         if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
1612                 return;
1613         }
1614         /*
1615          * Enable filesystem disk quotas if necessary.
1616          * We ignore errors as this should not interfere with final mount
1617          */
1618         for (type=0; type < MAXQUOTAS; type++) {
1619                 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1620                 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1621                        CAST_USER_ADDR_T(qfpath), ctx);
1622                 if (namei(&qnd) != 0)
1623                         continue;           /* option file to trigger quotas is not present */
1624                 vnode_put(qnd.ni_vp);
1625                 nameidone(&qnd);
1626                 snprintf(qfpath, sizeof(qfpath),  "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1627
1628                 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1629         }
1630         return;
1631 }
1632
1633
1634 static int
1635 checkdirs_callback(proc_t p, void * arg)
1636 {
1637         struct cdirargs * cdrp = (struct cdirargs * )arg;
1638         vnode_t olddp = cdrp->olddp;
1639         vnode_t newdp = cdrp->newdp;
1640         struct filedesc *fdp;
1641         vnode_t tvp;
1642         vnode_t fdp_cvp;
1643         vnode_t fdp_rvp;
1644         int cdir_changed = 0;
1645         int rdir_changed = 0;
1646
1647         /*
1648          * XXX Also needs to iterate each thread in the process to see if it
1649          * XXX is using a per-thread current working directory, and, if so,
1650          * XXX update that as well.
1651          */
1652
1653         proc_fdlock(p);
1654         fdp = p->p_fd;
1655         if (fdp == (struct filedesc *)0) {
1656                 proc_fdunlock(p);
1657                 return(PROC_RETURNED);
1658         }
1659         fdp_cvp = fdp->fd_cdir;
1660         fdp_rvp = fdp->fd_rdir;
1661         proc_fdunlock(p);
1662
1663         if (fdp_cvp == olddp) {
1664                 vnode_ref(newdp);
1665                 tvp = fdp->fd_cdir;
1666                 fdp_cvp = newdp;
1667                 cdir_changed = 1;
1668                 vnode_rele(tvp);
1669         }
1670         if (fdp_rvp == olddp) {
1671                 vnode_ref(newdp);
1672                 tvp = fdp->fd_rdir;
1673                 fdp_rvp = newdp;
1674                 rdir_changed = 1;
1675                 vnode_rele(tvp);
1676         }
1677         if (cdir_changed || rdir_changed) {
1678                 proc_fdlock(p);
1679                 fdp->fd_cdir = fdp_cvp;
1680                 fdp->fd_rdir = fdp_rvp;
1681                 proc_fdunlock(p);
1682         }
1683         return(PROC_RETURNED);
1684 }
1685
1686
1687
1688 /*
1689  * Scan all active processes to see if any of them have a current
1690  * or root directory onto which the new filesystem has just been
1691  * mounted. If so, replace them with the new mount point.
1692  */
1693 static int
1694 checkdirs(vnode_t olddp, vfs_context_t ctx)
1695 {
1696         vnode_t newdp;
1697         vnode_t tvp;
1698         int err;
1699         struct cdirargs cdr;
1700
1701         if (olddp->v_usecount == 1)
1702                 return(0);
1703         err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1704
1705         if (err != 0) {
1706 #if DIAGNOSTIC
1707                 panic("mount: lost mount: error %d", err);
1708 #endif
1709                 return(err);
1710         }
1711
1712         cdr.olddp = olddp;
1713         cdr.newdp = newdp;
1714         /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
1715         proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1716
1717         if (rootvnode == olddp) {
1718                 vnode_ref(newdp);
1719                 tvp = rootvnode;
1720                 rootvnode = newdp;
1721                 vnode_rele(tvp);
1722         }
1723
1724         vnode_put(newdp);
1725         return(0);
1726 }
1727
1728 /*
1729  * Unmount a file system.
1730  *
1731  * Note: unmount takes a path to the vnode mounted on as argument,
1732  * not special file (as before).
1733  */
1734 /* ARGSUSED */
1735 int
1736 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1737 {
1738         vnode_t vp;
1739         struct mount *mp;
1740         int error;
1741         struct nameidata nd;
1742         vfs_context_t ctx = vfs_context_current();
1743
1744         NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1745                 UIO_USERSPACE, uap->path, ctx);
1746         error = namei(&nd);
1747         if (error)
1748                 return (error);
1749         vp = nd.ni_vp;
1750         mp = vp->v_mount;
1751         nameidone(&nd);
1752
1753 #if CONFIG_MACF
1754         error = mac_mount_check_umount(ctx, mp);
1755         if (error != 0) {
1756                 vnode_put(vp);
1757                 return (error);
1758         }
1759 #endif
1760         /*
1761          * Must be the root of the filesystem
1762          */
1763         if ((vp->v_flag & VROOT) == 0) {
1764                 vnode_put(vp);
1765                 return (EINVAL);
1766         }
1767         mount_ref(mp, 0);
1768         vnode_put(vp);
1769         /* safedounmount consumes the mount ref */
1770         return (safedounmount(mp, uap->flags, ctx));
1771 }
1772
1773 int
1774 vfs_unmountbyfsid(fsid_t * fsid, int flags, vfs_context_t ctx)
1775 {
1776         mount_t mp;
1777
1778         mp = mount_list_lookupby_fsid(fsid, 0, 1);
1779         if (mp == (mount_t)0) {
1780                 return(ENOENT);
1781         }
1782         mount_ref(mp, 0);
1783         mount_iterdrop(mp);
1784         /* safedounmount consumes the mount ref */
1785         return(safedounmount(mp, flags, ctx));
1786 }
1787
1788
1789 /*
1790  * The mount struct comes with a mount ref which will be consumed.
1791  * Do the actual file system unmount, prevent some common foot shooting.
1792  */
1793 int
1794 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1795 {
1796         int error;
1797         proc_t p = vfs_context_proc(ctx);
1798
1799         /*
1800          * If the file system is not responding and MNT_NOBLOCK
1801          * is set and not a forced unmount then return EBUSY.
1802          */
1803         if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1804                 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1805                 error = EBUSY;
1806                 goto out;
1807         }
1808
1809         /*
1810          * Skip authorization if the mount is tagged as permissive and
1811          * this is not a forced-unmount attempt.
1812          */
1813         if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1814                 /*
1815                  * Only root, or the user that did the original mount is
1816                  * permitted to unmount this filesystem.
1817                  */
1818                 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1819                                 (error = suser(kauth_cred_get(), &p->p_acflag)))
1820                         goto out;
1821         }
1822         /*
1823          * Don't allow unmounting the root file system.
1824          */
1825         if (mp->mnt_flag & MNT_ROOTFS) {
1826                 error = EBUSY; /* the root is always busy */
1827                 goto out;
1828         }
1829
1830 #ifdef CONFIG_IMGSRC_ACCESS
1831         if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1832                 error = EBUSY;
1833                 goto out;
1834         }
1835 #endif /* CONFIG_IMGSRC_ACCESS */
1836
1837         return (dounmount(mp, flags, 1, ctx));
1838
1839 out:
1840         mount_drop(mp, 0);
1841         return(error);
1842 }
1843
1844 /*
1845  * Do the actual file system unmount.
      *
      * Parameters:
      *   mp       mount to unmount
      *   flags    unmount flags; MNT_FORCE, MNT_NOBLOCK and MNT_LNOSUB are
      *            examined here and the value is passed on to VFS_UNMOUNT()
      *   withref  non-zero when a mount ref accompanies mp; it is dropped
      *            here via mount_drop()
      *   ctx      VFS context passed to the file system callouts
      *
      * Returns 0 on success, EBUSY when an unmount of mp is already in
      * progress, or the error from VFS_SYNC(), vflush() (non-forced
      * unmounts only) or VFS_UNMOUNT().  On those failures the
      * MNTK_UNMOUNT, MNT_LUNMOUNT and MNT_LFORCE marks are cleared again
      * before returning.
1846  */
1847 int
1848 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1849 {
1850         vnode_t coveredvp = (vnode_t)0;
1851         int error;
1852         int needwakeup = 0;
1853         int forcedunmount = 0;
1854         int lflags = 0;
1855         struct vnode *devvp = NULLVP;
1856 #if CONFIG_TRIGGERS
1857         proc_t p = vfs_context_proc(ctx);
1858         int did_vflush = 0;
1859         int pflags_save = 0;
1860 #endif /* CONFIG_TRIGGERS */
1861
1862         mount_lock(mp);
1863
1864         /*
1865          * If already an unmount in progress just return EBUSY.
1866          * Even a forced unmount cannot override.
1867          */
1868         if (mp->mnt_lflag & MNT_LUNMOUNT) {
                     /* The accompanying mount ref is still released on this early exit. */
1869                 if (withref != 0)
1870                         mount_drop(mp, 1);
1871                 mount_unlock(mp);
1872                 return (EBUSY);
1873         }
1874
1875         if (flags & MNT_FORCE) {
1876                 forcedunmount = 1;
1877                 mp->mnt_lflag |= MNT_LFORCE;
1878         }
1879
1880 #if CONFIG_TRIGGERS
             /*
              * Set P_NOREMOTEHANG on the calling process and remember the
              * previous p_flag bits; the bit is cleared again at out: only
              * if it was not set before.
              */
1881         if (flags & MNT_NOBLOCK && p != kernproc)
1882                 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
1883 #endif
1884
             /*
              * Mark the unmount as in progress (MNT_LUNMOUNT is what the
              * check at the top of this function looks at) and turn off
              * MNT_ASYNC.
              */
1885         mp->mnt_kern_flag |= MNTK_UNMOUNT;
1886         mp->mnt_lflag |= MNT_LUNMOUNT;
1887         mp->mnt_flag &=~ MNT_ASYNC;
1888         /*
1889          * anyone currently in the fast path that
1890          * trips over the cached rootvp will be
1891          * dumped out and forced into the slow path
1892          * to regenerate a new cached value
1893          */
1894         mp->mnt_realrootvp = NULLVP;
1895         mount_unlock(mp);
1896
1897         if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
1898                 /*
1899                  * Force unmount any mounts in this filesystem.
1900                  * If any unmounts fail - just leave them dangling.
1901                  * Avoids recursion.
1902                  */
1903                 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
1904         }
1905
1906         /*
1907          * taking the name_cache_lock exclusively will
1908          * insure that everyone is out of the fast path who
1909          * might be trying to use a now stale copy of
1910          * vp->v_mountedhere->mnt_realrootvp
1911          * bumping mount_generation causes the cached values
1912          * to be invalidated
1913          */
1914         name_cache_lock();
1915         mount_generation++;
1916         name_cache_unlock();
1917
1918
             /*
              * The mount rwlock is held exclusively from here on; the
              * accompanying mount ref (if any) is dropped once it is held.
              */
1919         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1920         if (withref != 0)
1921                 mount_drop(mp, 0);
1922 #if CONFIG_FSE
1923         fsevent_unmount(mp);  /* has to come first! */
1924 #endif
1925         error = 0;
1926         if (forcedunmount == 0) {
1927                 ubc_umount(mp); /* release cached vnodes */
1928                 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1929                         error = VFS_SYNC(mp, MNT_WAIT, ctx);
1930                         if (error) {
                                     /*
                                      * Undo the in-progress marks.  The mount
                                      * lock stays held: the code at out: reads
                                      * mnt_lflag and then unlocks it.
                                      */
1931                                 mount_lock(mp);
1932                                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1933                                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1934                                 mp->mnt_lflag &= ~MNT_LFORCE;
1935                                 goto out;
1936                         }
1937                 }
1938         }
1939
1940         IOBSDMountChange(mp, kIOMountChangeUnmount);
1941
1942 #if CONFIG_TRIGGERS
1943         vfs_nested_trigger_unmounts(mp, flags, ctx);
             /* From here on a failure makes out: issue VTC_REPLACE to the trigger callback. */
1944         did_vflush = 1;
1945 #endif
1946         if (forcedunmount)
1947                 lflags |= FORCECLOSE;
1948         error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
             /* A vflush() failure aborts only a non-forced unmount. */
1949         if ((forcedunmount == 0) && error) {
1950                 mount_lock(mp);
1951                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1952                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1953                 mp->mnt_lflag &= ~MNT_LFORCE;
1954                 goto out;
1955         }
1956
1957         /* make sure there are no one in the mount iterations or lookup */
1958         mount_iterdrain(mp);
1959
1960         error = VFS_UNMOUNT(mp, flags, ctx);
1961         if (error) {
                     /* The file system refused: reset the iteration state and undo the marks. */
1962                 mount_iterreset(mp);
1963                 mount_lock(mp);
1964                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1965                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1966                 mp->mnt_lflag &= ~MNT_LFORCE;
1967                 goto out;
1968         }
1969
1970         /* increment the operations count */
             /* error is necessarily 0 here: the failure case above jumped to out. */
1971         if (!error)
1972                 OSAddAtomic(1, &vfs_nummntops);
1973
1974         if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
1975                 /* hold an io reference and drop the usecount before close */
1976                 devvp = mp->mnt_devvp;
1977                 vnode_getalways(devvp);
1978                 vnode_rele(devvp);
1979                 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1980                        ctx);
1981                 vnode_clearmountedon(devvp);
1982                 vnode_put(devvp);
1983         }
             /* The mount rwlock is released across mount_list_remove() and re-taken afterwards. */
1984         lck_rw_done(&mp->mnt_rwlock);
1985         mount_list_remove(mp);
1986         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1987
1988         /* mark the mount point hook in the vp but not drop the ref yet */
1989         if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
1990                 /*
1991                  * The covered vnode needs special handling. Trying to get an
1992                  * iocount must not block here as this may lead to deadlocks
1993                  * if the Filesystem to which the covered vnode belongs is
1994                  * undergoing forced unmounts. Since we hold a usecount, the
1995                  * vnode cannot be reused (it can, however, still be terminated)
1996                  */
1997                 vnode_getalways(coveredvp);
1998                 vnode_lock_spin(coveredvp);
1999
                     /*
                      * Detach the mount from the covered vnode.  The crossref
                      * taken here is released by mount_dropcrossref() in the
                      * success path at the end of this function.
                      */
2000                 mp->mnt_crossref++;
2001                 coveredvp->v_mountedhere = (struct mount *)0;
2002                 CLR(coveredvp->v_flag, VMOUNT);
2003
2004                 vnode_unlock(coveredvp);
2005                 vnode_put(coveredvp);
2006         }
2007
2008         mount_list_lock();
2009         mp->mnt_vtable->vfc_refcount--;
2010         mount_list_unlock();
2011
2012         cache_purgevfs(mp);     /* remove cache entries for this file sys */
2013         vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2014         mount_lock(mp);
2015         mp->mnt_lflag |= MNT_LDEAD;
2016
2017         if (mp->mnt_lflag & MNT_LWAIT) {
2018                 /*
2019                  * do the wakeup here
2020                  * in case we block in mount_refdrain
2021                  * which will drop the mount lock
2022                  * and allow anyone blocked in vfs_busy
2023                  * to wakeup and see the LDEAD state
2024                  */
2025                 mp->mnt_lflag &= ~MNT_LWAIT;
2026                 wakeup((caddr_t)mp);
2027         }
2028         mount_refdrain(mp);
2029 out:
             /*
              * Common exit.  The error paths above jump here right after
              * mount_lock(mp); the code below reads mnt_lflag and then
              * unlocks the mount lock and the mount rwlock, so the success
              * path is assumed to arrive with both held as well.
              */
2030         if (mp->mnt_lflag & MNT_LWAIT) {
2031                 mp->mnt_lflag &= ~MNT_LWAIT;
2032                 needwakeup = 1;
2033         }
2034
2035 #if CONFIG_TRIGGERS
2036         if (flags & MNT_NOBLOCK && p != kernproc) {
2037                 // Restore P_NOREMOTEHANG bit to its previous value
2038                 if ((pflags_save & P_NOREMOTEHANG) == 0)
2039                         OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2040         }
2041
2042         /*
2043          * Callback and context are set together under the mount lock, and
2044          * never cleared, so we're safe to examine them here, drop the lock,
2045          * and call out.
2046          */
2047         if (mp->mnt_triggercallback != NULL) {
2048                 mount_unlock(mp);
2049                 if (error == 0) {
2050                         mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2051                 } else if (did_vflush) {
2052                         mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2053                 }
2054         } else {
2055                 mount_unlock(mp);
2056         }
2057 #else
2058         mount_unlock(mp);
2059 #endif /* CONFIG_TRIGGERS */
2060
2061         lck_rw_done(&mp->mnt_rwlock);
2062
             /* Deferred wakeup for waiters flagged by MNT_LWAIT, issued after the locks are released. */
2063         if (needwakeup)
2064                 wakeup((caddr_t)mp);
2065
             /*
              * Successful unmount: release the covered vnode and the
              * crossref taken above, or free the mount directly when it
              * is the root file system.
              */
2066         if (!error) {
2067                 if ((coveredvp != NULLVP)) {
2068                         vnode_t pvp = NULLVP;
2069
2070                         /*
2071                          * The covered vnode needs special handling. Trying to
2072                          * get an iocount must not block here as this may lead
2073                          * to deadlocks if the Filesystem to which the covered
2074                          * vnode belongs is undergoing forced unmounts. Since we
2075                          * hold a usecount, the  vnode cannot be reused
2076                          * (it can, however, still be terminated).
2077                          */
2078                         vnode_getalways(coveredvp);
2079
2080                         mount_dropcrossref(mp, coveredvp, 0);
2081                         /*
2082                          * We'll _try_ to detect if this really needs to be
2083                          * done. The coveredvp can only be in termination (or
2084                          * terminated) if the coveredvp's mount point is in a
2085                          * forced unmount (or has been) since we still hold the
2086                          * ref.
2087                          */
2088                         if (!vnode_isrecycled(coveredvp)) {
2089                                 pvp = vnode_getparent(coveredvp);
2090 #if CONFIG_TRIGGERS
2091                                 if (coveredvp->v_resolve) {
2092                                         vnode_trigger_rearm(coveredvp, ctx);
2093                                 }
2094 #endif
2095                         }
2096
2097                         vnode_rele(coveredvp);
2098                         vnode_put(coveredvp);
2099                         coveredvp = NULLVP;
2100
                             /* Post NOTE_WRITE on the covered vnode's parent, if one was obtained. */
2101                         if (pvp) {
2102                                 lock_vnode_and_post(pvp, NOTE_WRITE);
2103                                 vnode_put(pvp);
2104                         }
2105                 } else if (mp->mnt_flag & MNT_ROOTFS) {
                                     /* No covered vnode: the root mount is torn down and freed right here. */
2106                                 mount_lock_destroy(mp);
2107 #if CONFIG_MACF
2108                                 mac_mount_label_destroy(mp);
2109 #endif
2110                                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2111                 } else
2112                         panic("dounmount: no coveredvp");
2113         }
2114         return (error);
2115 }
2116
2117 /*
2118  * Unmount any mounts in this filesystem.
2119  */
2120 void
2121 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2122 {
2123         mount_t smp;
2124         fsid_t *fsids, fsid;
2125         int fsids_sz;
2126         int count = 0, i, m = 0;
2127         vnode_t vp;
2128
2129         mount_list_lock();
2130
2131         // Get an array to hold the submounts fsids.
2132         TAILQ_FOREACH(smp, &mountlist, mnt_list)
2133                 count++;
2134         fsids_sz = count * sizeof(fsid_t);
2135         MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2136         if (fsids == NULL) {
2137                 mount_list_unlock();
2138                 goto out;
2139         }
2140         fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump
2141
2142         /*
2143          * Fill the array with submount fsids.
2144          * Since mounts are always added to the tail of the mount list, the
2145          * list is always in mount order.
2146          * For each mount check if the mounted-on vnode belongs to a
2147          * mount that's already added to our array of mounts to be unmounted.
2148          */
2149         for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2150                 vp = smp->mnt_vnodecovered;
2151                 if (vp == NULL)
2152                         continue;
2153                 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
2154                 for (i = 0; i <= m; i++) {
2155                         if (fsids[i].val[0] == fsid.val[0] &&
2156                             fsids[i].val[1] == fsid.val[1]) {
2157                                 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2158                                 break;
2159                         }
2160                 }
2161         }
2162         mount_list_unlock();
2163
2164         // Unmount the submounts in reverse order. Ignore errors.
2165         for (i = m; i > 0; i--) {
2166                 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2167                 if (smp) {
2168                         mount_ref(smp, 0);
2169                         mount_iterdrop(smp);
2170                         (void) dounmount(smp, flags, 1, ctx);
2171                 }
2172         }
2173 out:
2174         if (fsids)
2175                 FREE(fsids, M_TEMP);
2176 }
2177
2178 void
2179 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2180 {
2181         vnode_lock(dp);
2182         mp->mnt_crossref--;
2183
2184         if (mp->mnt_crossref < 0)
2185                 panic("mount cross refs -ve");
2186
2187         if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2188
2189                 if (need_put)
2190                         vnode_put_locked(dp);
2191                 vnode_unlock(dp);
2192
2193                 mount_lock_destroy(mp);
2194 #if CONFIG_MACF
2195                 mac_mount_label_destroy(mp);
2196 #endif
2197                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2198                 return;
2199         }
2200         if (need_put)
2201                 vnode_put_locked(dp);
2202         vnode_unlock(dp);
2203 }
2204
2205
2206 /*
2207  * Sync each mounted filesystem.
      *
      * Tunables consulted by sync(), sync_thread() and sync_internal().
2208  */
2209 #if DIAGNOSTIC
     /* When non-zero, sync() and sync_thread() call vfs_bufstats() after syncing. */
2210 int syncprt = 0;
2211 #endif
2212
     /* When non-zero, sync() and sync_thread() call vm_countdirtypages() after syncing. */
2213 int print_vmpage_stat=0;
     /* Passed by sync_internal() to sync_async() as the time limit for the sync. */
2214 int sync_timeout = 60;  // Sync time limit (sec)
2215
2216 static int
2217 sync_callback(mount_t mp, __unused void *arg)
2218 {
2219         if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2220                 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2221
2222                 mp->mnt_flag &= ~MNT_ASYNC;
2223                 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2224                 if (asyncflag)
2225                         mp->mnt_flag |= MNT_ASYNC;
2226         }
2227
2228         return (VFS_RETURNED);
2229 }
2230
2231 /* ARGSUSED */
2232 int
2233 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2234 {
2235         vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2236
2237         if (print_vmpage_stat) {
2238                 vm_countdirtypages();
2239         }
2240
2241 #if DIAGNOSTIC
2242         if (syncprt)
2243                 vfs_bufstats();
2244 #endif /* DIAGNOSTIC */
2245         return 0;
2246 }
2247
2248 static void
2249 sync_thread(void *arg, __unused wait_result_t wr)
2250 {
2251         int *timeout = (int *) arg;
2252
2253         vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2254
2255         if (timeout)
2256                 wakeup((caddr_t) timeout);
2257         if (print_vmpage_stat) {
2258                 vm_countdirtypages();
2259         }
2260
2261 #if DIAGNOSTIC
2262         if (syncprt)
2263                 vfs_bufstats();
2264 #endif /* DIAGNOSTIC */
2265 }
2266
2267 /*
2268  * Sync in a separate thread so we can time out if it blocks.
2269  */
2270 static int
2271 sync_async(int timeout)
2272 {
2273         thread_t thd;
2274         int error;
2275         struct timespec ts = {timeout, 0};
2276
2277         lck_mtx_lock(sync_mtx_lck);
2278         if (kernel_thread_start(sync_thread, &timeout, &thd) != KERN_SUCCESS) {
2279                 printf("sync_thread failed\n");
2280                 lck_mtx_unlock(sync_mtx_lck);
2281                 return (0);
2282         }
2283
2284         error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2285         if (error) {
2286                 printf("sync timed out: %d sec\n", timeout);
2287         }
2288         thread_deallocate(thd);
2289
2290         return (0);
2291 }
2292
2293 /*
2294  * An in-kernel sync for power management to call.
2295  */
2296 __private_extern__ int
2297 sync_internal(void)
2298 {
2299         (void) sync_async(sync_timeout);
2300
2301         return 0;
2302 } /* end of sync_internal call */
2303
2304 /*
2305  * Change filesystem quotas.
2306  */
2307 #if QUOTA
2308 int
2309 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2310 {
2311         struct mount *mp;
2312         int error, quota_cmd, quota_status;
2313         caddr_t datap;
2314         size_t fnamelen;
2315         struct nameidata nd;
2316         vfs_context_t ctx = vfs_context_current();
2317         struct dqblk my_dqblk;
2318
2319         AUDIT_ARG(uid, uap->uid);
2320         AUDIT_ARG(cmd, uap->cmd);
2321         NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2322                uap->path, ctx);
2323         error = namei(&nd);
2324         if (error)
2325                 return (error);
2326         mp = nd.ni_vp->v_mount;
2327         vnode_put(nd.ni_vp);
2328         nameidone(&nd);
2329
2330         /* copyin any data we will need for downstream code */
2331         quota_cmd = uap->cmd >> SUBCMDSHIFT;
2332
2333         switch (quota_cmd) {
2334         case Q_QUOTAON:
2335                 /* uap->arg specifies a file from which to take the quotas */
2336                 fnamelen = MAXPATHLEN;
2337                 datap = kalloc(MAXPATHLEN);
2338                 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2339                 break;
2340         case Q_GETQUOTA:
2341                 /* uap->arg is a pointer to a dqblk structure. */
2342                 datap = (caddr_t) &my_dqblk;
2343                 break;
2344         case Q_SETQUOTA:
2345         case Q_SETUSE:
2346                 /* uap->arg is a pointer to a dqblk structure. */
2347                 datap = (caddr_t) &my_dqblk;
2348                 if (proc_is64bit(p)) {
2349                         struct user_dqblk       my_dqblk64;
2350                         error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
2351                         if (error == 0) {
2352                                 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2353                         }
2354                 }
2355                 else {
2356                         error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
2357                 }
2358                 break;
2359         case Q_QUOTASTAT:
2360                 /* uap->arg is a pointer to an integer */
2361                 datap = (caddr_t) &quota_status;
2362                 break;
2363         default:
2364                 datap = NULL;
2365                 break;
2366         } /* switch */
2367
2368         if (error == 0) {
2369                 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2370         }
2371
2372         switch (quota_cmd) {
2373         case Q_QUOTAON:
2374                 if (datap != NULL)
2375                         kfree(datap, MAXPATHLEN);
2376                 break;
2377         case Q_GETQUOTA:
2378                 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2379                 if (error == 0) {
2380                         if (proc_is64bit(p)) {
2381                                 struct user_dqblk       my_dqblk64 = {.dqb_bhardlimit = 0};
2382                                 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2383                                 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
2384                         }
2385                         else {
2386                                 error = copyout(datap, uap->arg, sizeof (struct dqblk));
2387                         }
2388                 }
2389                 break;
2390         case Q_QUOTASTAT:
2391                 /* uap->arg is a pointer to an integer */
2392                 if (error == 0) {
2393                         error = copyout(datap, uap->arg, sizeof(quota_status));
2394                 }
2395                 break;
2396         default:
2397                 break;
2398         } /* switch */
2399
2400         return (error);
2401 }
2402 #else
2403 int
2404 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2405 {
2406         return (EOPNOTSUPP);
2407 }
2408 #endif /* QUOTA */
2409
2410 /*
2411  * Get filesystem statistics.
2412  *
2413  * Returns:     0                       Success
2414  *      namei:???
2415  *      vfs_update_vfsstat:???
2416  *      munge_statfs:EFAULT
2417  */
2418 /* ARGSUSED */
2419 int
2420 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2421 {
2422         struct mount *mp;
2423         struct vfsstatfs *sp;
2424         int error;
2425         struct nameidata nd;
2426         vfs_context_t ctx = vfs_context_current();
2427         vnode_t vp;
2428
2429         NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2430                 UIO_USERSPACE, uap->path, ctx);
2431         error = namei(&nd);
2432         if (error)
2433                 return (error);
2434         vp = nd.ni_vp;
2435         mp = vp->v_mount;
2436         sp = &mp->mnt_vfsstat;
2437         nameidone(&nd);
2438
2439         error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2440         if (error != 0) {
2441                 vnode_put(vp);
2442                 return (error);
2443         }
2444
2445         error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2446         vnode_put(vp);
2447         return (error);
2448 }
2449
2450 /*
2451  * Get filesystem statistics.
2452  */
2453 /* ARGSUSED */
2454 int
2455 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2456 {
2457         vnode_t vp;
2458         struct mount *mp;
2459         struct vfsstatfs *sp;
2460         int error;
2461
2462         AUDIT_ARG(fd, uap->fd);
2463
2464         if ( (error = file_vnode(uap->fd, &vp)) )
2465                 return (error);
2466
2467         error = vnode_getwithref(vp);
2468         if (error) {
2469                 file_drop(uap->fd);
2470                 return (error);
2471         }
2472
2473         AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2474
2475         mp = vp->v_mount;
2476         if (!mp) {
2477                 error = EBADF;
2478                 goto out;
2479         }
2480         sp = &mp->mnt_vfsstat;
2481         if ((error = vfs_update_vfsstat(mp,vfs_context_current(),VFS_USER_EVENT)) != 0) {
2482                 goto out;
2483         }
2484
2485         error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2486
2487 out:
2488         file_drop(uap->fd);
2489         vnode_put(vp);
2490
2491         return (error);
2492 }
2493
2494 /*
2495  * Common routine to handle copying of statfs64 data to user space
2496  */
2497 static int
2498 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2499 {
2500         int error;
2501         struct statfs64 sfs;
2502
2503         bzero(&sfs, sizeof(sfs));
2504
2505         sfs.f_bsize = sfsp->f_bsize;
2506         sfs.f_iosize = (int32_t)sfsp->f_iosize;
2507         sfs.f_blocks = sfsp->f_blocks;
2508         sfs.f_bfree = sfsp->f_bfree;
2509         sfs.f_bavail = sfsp->f_bavail;
2510         sfs.f_files = sfsp->f_files;
2511         sfs.f_ffree = sfsp->f_ffree;
2512         sfs.f_fsid = sfsp->f_fsid;
2513         sfs.f_owner = sfsp->f_owner;
2514         sfs.f_type = mp->mnt_vtable->vfc_typenum;
2515         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2516         sfs.f_fssubtype = sfsp->f_fssubtype;
2517         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2518                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2519         } else {
2520                 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2521         }
2522         strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2523         strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2524
2525         error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2526
2527         return(error);
2528 }
2529
2530 /*
2531  * Get file system statistics in 64-bit mode
2532  */
2533 int
2534 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2535 {
2536         struct mount *mp;
2537         struct vfsstatfs *sp;
2538         int error;
2539         struct nameidata nd;
2540         vfs_context_t ctxp = vfs_context_current();
2541         vnode_t vp;
2542
2543         NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2544                 UIO_USERSPACE, uap->path, ctxp);
2545         error = namei(&nd);
2546         if (error)
2547                 return (error);
2548         vp = nd.ni_vp;
2549         mp = vp->v_mount;
2550         sp = &mp->mnt_vfsstat;
2551         nameidone(&nd);
2552
2553         error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2554         if (error != 0) {
2555                 vnode_put(vp);
2556                 return (error);
2557         }
2558
2559         error = statfs64_common(mp, sp, uap->buf);
2560         vnode_put(vp);
2561
2562         return (error);
2563 }
2564
2565 /*
2566  * Get file system statistics in 64-bit mode
2567  */
2568 int
2569 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2570 {
2571         struct vnode *vp;
2572         struct mount *mp;
2573         struct vfsstatfs *sp;
2574         int error;
2575
2576         AUDIT_ARG(fd, uap->fd);
2577
2578         if ( (error = file_vnode(uap->fd, &vp)) )
2579                 return (error);
2580
2581         error = vnode_getwithref(vp);
2582         if (error) {
2583                 file_drop(uap->fd);
2584                 return (error);
2585         }
2586
2587         AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2588
2589         mp = vp->v_mount;
2590         if (!mp) {
2591                 error = EBADF;
2592                 goto out;
2593         }
2594         sp = &mp->mnt_vfsstat;
2595         if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2596                 goto out;
2597         }
2598
2599         error = statfs64_common(mp, sp, uap->buf);
2600
2601 out:
2602         file_drop(uap->fd);
2603         vnode_put(vp);
2604
2605         return (error);
2606 }
2607
2608 struct getfsstat_struct {
2609         user_addr_t     sfsp;
2610         user_addr_t     *mp;
2611         int             count;
2612         int             maxcount;
2613         int             flags;
2614         int             error;
2615 };
2616
2617
2618 static int
2619 getfsstat_callback(mount_t mp, void * arg)
2620 {
2621
2622         struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2623         struct vfsstatfs *sp;
2624         int error, my_size;
2625         vfs_context_t ctx = vfs_context_current();
2626
2627         if (fstp->sfsp && fstp->count < fstp->maxcount) {
2628                 sp = &mp->mnt_vfsstat;
2629                 /*
2630                  * If MNT_NOWAIT is specified, do not refresh the
2631                  * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
2632                  */
2633                 if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2634                         (error = vfs_update_vfsstat(mp, ctx,
2635                             VFS_USER_EVENT))) {
2636                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2637                         return(VFS_RETURNED);
2638                 }
2639
2640                 /*
2641                  * Need to handle LP64 version of struct statfs
2642                  */
2643                 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2644                 if (error) {
2645                         fstp->error = error;
2646                         return(VFS_RETURNED_DONE);
2647                 }
2648                 fstp->sfsp += my_size;
2649
2650                 if (fstp->mp) {
2651 #if CONFIG_MACF
2652                         error = mac_mount_label_get(mp, *fstp->mp);
2653                         if (error) {
2654                                 fstp->error = error;
2655                                 return(VFS_RETURNED_DONE);
2656                         }
2657 #endif
2658                         fstp->mp++;
2659                 }
2660         }
2661         fstp->count++;
2662         return(VFS_RETURNED);
2663 }
2664
2665 /*
2666  * Get statistics on all filesystems.
2667  */
2668 int
2669 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2670 {
2671         struct __mac_getfsstat_args muap;
2672
2673         muap.buf = uap->buf;
2674         muap.bufsize = uap->bufsize;
2675         muap.mac = USER_ADDR_NULL;
2676         muap.macsize = 0;
2677         muap.flags = uap->flags;
2678
2679         return (__mac_getfsstat(p, &muap, retval));
2680 }
2681
2682 /*
2683  * __mac_getfsstat: Get MAC-related file system statistics
2684  *
2685  * Parameters:    p                        (ignored)
2686  *                uap                      User argument descriptor (see below)
2687  *                retval                   Count of file system statistics (N stats)
2688  *
2689  * Indirect:      uap->bufsize             Buffer size
2690  *                uap->macsize             MAC info size
2691  *                uap->buf                 Buffer where information will be returned
2692  *                uap->mac                 MAC info
2693  *                uap->flags               File system flags
2694  *
2695  *
2696  * Returns:        0                       Success
2697  *                !0                       Not success
2698  *
2699  */
2700 int
2701 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2702 {
2703         user_addr_t sfsp;
2704         user_addr_t *mp;
2705         size_t count, maxcount, bufsize, macsize;
2706         struct getfsstat_struct fst;
2707
2708         bufsize = (size_t) uap->bufsize;
2709         macsize = (size_t) uap->macsize;
2710
2711         if (IS_64BIT_PROCESS(p)) {
2712                 maxcount = bufsize / sizeof(struct user64_statfs);
2713         }
2714         else {
2715                 maxcount = bufsize / sizeof(struct user32_statfs);
2716         }
2717         sfsp = uap->buf;
2718         count = 0;
2719
2720         mp = NULL;
2721
2722 #if CONFIG_MACF
2723         if (uap->mac != USER_ADDR_NULL) {
2724                 u_int32_t *mp0;
2725                 int error;
2726                 unsigned int i;
2727
2728                 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2729                 if (count != maxcount)
2730                         return (EINVAL);
2731
2732                 /* Copy in the array */
2733                 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2734                 if (mp0 == NULL) {
2735                         return (ENOMEM);
2736                 }
2737
2738                 error = copyin(uap->mac, mp0, macsize);
2739                 if (error) {
2740                         FREE(mp0, M_MACTEMP);
2741                         return (error);
2742                 }
2743
2744                 /* Normalize to an array of user_addr_t */
2745                 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2746                 if (mp == NULL) {
2747                         FREE(mp0, M_MACTEMP);
2748                         return (ENOMEM);
2749                 }
2750
2751                 for (i = 0; i < count; i++) {
2752                         if (IS_64BIT_PROCESS(p))
2753                                 mp[i] = ((user_addr_t *)mp0)[i];
2754                         else
2755                                 mp[i] = (user_addr_t)mp0[i];
2756                 }
2757                 FREE(mp0, M_MACTEMP);
2758         }
2759 #endif
2760
2761
2762         fst.sfsp = sfsp;
2763         fst.mp = mp;
2764         fst.flags = uap->flags;
2765         fst.count = 0;
2766         fst.error = 0;
2767         fst.maxcount = maxcount;
2768
2769
2770         vfs_iterate(0, getfsstat_callback, &fst);
2771
2772         if (mp)
2773                 FREE(mp, M_MACTEMP);
2774
2775         if (fst.error ) {
2776                 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2777                 return(fst.error);
2778         }
2779
2780         if (fst.sfsp && fst.count > fst.maxcount)
2781                 *retval = fst.maxcount;
2782         else
2783                 *retval = fst.count;
2784         return (0);
2785 }
2786
2787 static int
2788 getfsstat64_callback(mount_t mp, void * arg)
2789 {
2790         struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2791         struct vfsstatfs *sp;
2792         int error;
2793
2794         if (fstp->sfsp && fstp->count < fstp->maxcount) {
2795                 sp = &mp->mnt_vfsstat;
2796                 /*
2797                  * If MNT_NOWAIT is specified, do not refresh the fsstat
2798                  * cache. MNT_WAIT overrides MNT_NOWAIT.
2799                  *
2800                  * We treat MNT_DWAIT as MNT_WAIT for all instances of
2801                  * getfsstat, since the constants are out of the same
2802                  * namespace.
2803                  */
2804                 if (((fstp->flags & MNT_NOWAIT) == 0 ||
2805                      (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2806                     (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
2807                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2808                         return(VFS_RETURNED);
2809                 }
2810
2811                 error = statfs64_common(mp, sp, fstp->sfsp);
2812                 if (error) {
2813                         fstp->error = error;
2814                         return(VFS_RETURNED_DONE);
2815                 }
2816                 fstp->sfsp += sizeof(struct statfs64);
2817         }
2818         fstp->count++;
2819         return(VFS_RETURNED);
2820 }
2821
2822 /*
2823  * Get statistics on all file systems in 64 bit mode.
2824  */
2825 int
2826 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
2827 {
2828         user_addr_t sfsp;
2829         int count, maxcount;
2830         struct getfsstat_struct fst;
2831
2832         maxcount = uap->bufsize / sizeof(struct statfs64);
2833
2834         sfsp = uap->buf;
2835         count = 0;
2836
2837         fst.sfsp = sfsp;
2838         fst.flags = uap->flags;
2839         fst.count = 0;
2840         fst.error = 0;
2841         fst.maxcount = maxcount;
2842
2843         vfs_iterate(0, getfsstat64_callback, &fst);
2844
2845         if (fst.error ) {
2846                 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2847                 return(fst.error);
2848         }
2849
2850         if (fst.sfsp && fst.count > fst.maxcount)
2851                 *retval = fst.maxcount;
2852         else
2853                 *retval = fst.count;
2854
2855         return (0);
2856 }
2857
2858 /*
2859  * gets the associated vnode with the file descriptor passed.
2860  * as input
2861  *
2862  * INPUT
2863  * ctx - vfs context of caller
2864  * fd - file descriptor for which vnode is required.
2865  * vpp - Pointer to pointer to vnode to be returned.
2866  *
2867  * The vnode is returned with an iocount so any vnode obtained
2868  * by this call needs a vnode_put
2869  *
2870  */
2871 static int
2872 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
2873 {
2874         int error;
2875         vnode_t vp;
2876         struct fileproc *fp;
2877         proc_t p = vfs_context_proc(ctx);
2878
2879         *vpp =  NULLVP;
2880
2881         error = fp_getfvp(p, fd, &fp, &vp);
2882         if (error)
2883                 return (error);
2884
2885         error = vnode_getwithref(vp);
2886         if (error) {
2887                 (void)fp_drop(p, fd, fp, 0);
2888                 return (error);
2889         }
2890
2891         (void)fp_drop(p, fd, fp, 0);
2892         *vpp = vp;
2893         return (error);
2894 }
2895
2896 /*
2897  * Wrapper function around namei to start lookup from a directory
2898  * specified by a file descriptor ni_dirfd.
2899  *
2900  * In addition to all the errors returned by namei, this call can
2901  * return ENOTDIR if the file descriptor does not refer to a directory.
2902  * and EBADF if the file descriptor is not valid.
2903  */
2904 int
2905 nameiat(struct nameidata *ndp, int dirfd)
2906 {
2907         if ((dirfd != AT_FDCWD) &&
2908             !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
2909             !(ndp->ni_cnd.cn_flags & USEDVP)) {
2910                 int error = 0;
2911                 char c;
2912
2913                 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
2914                         error = copyin(ndp->ni_dirp, &c, sizeof(char));
2915                         if (error)
2916                                 return (error);
2917                 } else {
2918                         c = *((char *)(ndp->ni_dirp));
2919                 }
2920
2921                 if (c != '/') {
2922                         vnode_t dvp_at;
2923
2924                         error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
2925                             &dvp_at);
2926                         if (error)
2927                                 return (error);
2928
2929                         if (vnode_vtype(dvp_at) != VDIR) {
2930                                 vnode_put(dvp_at);
2931                                 return (ENOTDIR);
2932                         }
2933
2934                         ndp->ni_dvp = dvp_at;
2935                         ndp->ni_cnd.cn_flags |= USEDVP;
2936                         error = namei(ndp);
2937                         ndp->ni_cnd.cn_flags &= ~USEDVP;
2938                         vnode_put(dvp_at);
2939                         return (error);
2940                 }
2941         }
2942
2943         return (namei(ndp));
2944 }
2945
2946 /*
2947  * Change current working directory to a given file descriptor.
2948  */
2949 /* ARGSUSED */
2950 static int
2951 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
2952 {
2953         struct filedesc *fdp = p->p_fd;
2954         vnode_t vp;
2955         vnode_t tdp;
2956         vnode_t tvp;
2957         struct mount *mp;
2958         int error;
2959         vfs_context_t ctx = vfs_context_current();
2960
2961         AUDIT_ARG(fd, uap->fd);
2962         if (per_thread && uap->fd == -1) {
2963                 /*
2964                  * Switching back from per-thread to per process CWD; verify we
2965                  * in fact have one before proceeding.  The only success case
2966                  * for this code path is to return 0 preemptively after zapping
2967                  * the thread structure contents.
2968                  */
2969                 thread_t th = vfs_context_thread(ctx);
2970                 if (th) {
2971                         uthread_t uth = get_bsdthread_info(th);
2972                         tvp = uth->uu_cdir;
2973                         uth->uu_cdir = NULLVP;
2974                         if (tvp != NULLVP) {
2975                                 vnode_rele(tvp);
2976                                 return (0);
2977                         }
2978                 }
2979                 return (EBADF);
2980         }
2981
2982         if ( (error = file_vnode(uap->fd, &vp)) )
2983                 return(error);
2984         if ( (error = vnode_getwithref(vp)) ) {
2985                 file_drop(uap->fd);
2986                 return(error);
2987         }
2988
2989         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
2990
2991         if (vp->v_type != VDIR) {
2992                 error = ENOTDIR;
2993                 goto out;
2994         }
2995
2996 #if CONFIG_MACF
2997         error = mac_vnode_check_chdir(ctx, vp);
2998         if (error)
2999                 goto out;
3000 #endif
3001         error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3002         if (error)
3003                 goto out;
3004
3005         while (!error && (mp = vp->v_mountedhere) != NULL) {
3006                 if (vfs_busy(mp, LK_NOWAIT)) {
3007                         error = EACCES;
3008                         goto out;
3009                 }
3010                 error = VFS_ROOT(mp, &tdp, ctx);
3011                 vfs_unbusy(mp);
3012                 if (error)
3013                         break;
3014                 vnode_put(vp);
3015                 vp = tdp;
3016         }
3017         if (error)
3018                 goto out;
3019         if ( (error = vnode_ref(vp)) )
3020                 goto out;
3021         vnode_put(vp);
3022
3023         if (per_thread) {
3024                 thread_t th = vfs_context_thread(ctx);
3025                 if (th) {
3026                         uthread_t uth = get_bsdthread_info(th);
3027                         tvp = uth->uu_cdir;
3028                         uth->uu_cdir = vp;
3029                         OSBitOrAtomic(P_THCWD, &p->p_flag);
3030                 } else {
3031                         vnode_rele(vp);
3032                         return (ENOENT);
3033                 }
3034         } else {
3035                 proc_fdlock(p);
3036                 tvp = fdp->fd_cdir;
3037                 fdp->fd_cdir = vp;
3038                 proc_fdunlock(p);
3039         }
3040
3041         if (tvp)
3042                 vnode_rele(tvp);
3043         file_drop(uap->fd);
3044
3045         return (0);
3046 out:
3047         vnode_put(vp);
3048         file_drop(uap->fd);
3049
3050         return(error);
3051 }
3052
3053 int
3054 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3055 {
3056         return common_fchdir(p, uap, 0);
3057 }
3058
3059 int
3060 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3061 {
3062         return common_fchdir(p, (void *)uap, 1);
3063 }
3064
3065 /*
3066  * Change current working directory (".").
3067  *
3068  * Returns:     0                       Success
3069  *      change_dir:ENOTDIR
3070  *      change_dir:???
3071  *      vnode_ref:ENOENT                No such file or directory
3072  */
3073 /* ARGSUSED */
3074 static int
3075 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3076 {
3077         struct filedesc *fdp = p->p_fd;
3078         int error;
3079         struct nameidata nd;
3080         vnode_t tvp;
3081         vfs_context_t ctx = vfs_context_current();
3082
3083         NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3084                 UIO_USERSPACE, uap->path, ctx);
3085         error = change_dir(&nd, ctx);
3086         if (error)
3087                 return (error);
3088         if ( (error = vnode_ref(nd.ni_vp)) ) {
3089                 vnode_put(nd.ni_vp);
3090                 return (error);
3091         }
3092         /*
3093          * drop the iocount we picked up in change_dir
3094          */
3095         vnode_put(nd.ni_vp);
3096
3097         if (per_thread) {
3098                 thread_t th = vfs_context_thread(ctx);
3099                 if (th) {
3100                         uthread_t uth = get_bsdthread_info(th);
3101                         tvp = uth->uu_cdir;
3102                         uth->uu_cdir = nd.ni_vp;
3103                         OSBitOrAtomic(P_THCWD, &p->p_flag);
3104                 } else {
3105                         vnode_rele(nd.ni_vp);
3106                         return (ENOENT);
3107                 }
3108         } else {
3109                 proc_fdlock(p);
3110                 tvp = fdp->fd_cdir;
3111                 fdp->fd_cdir = nd.ni_vp;
3112                 proc_fdunlock(p);
3113         }
3114
3115         if (tvp)
3116                 vnode_rele(tvp);
3117
3118         return (0);
3119 }
3120
3121
3122 /*
3123  * chdir
3124  *
3125  * Change current working directory (".") for the entire process
3126  *
3127  * Parameters:  p       Process requesting the call
3128  *              uap     User argument descriptor (see below)
3129  *              retval  (ignored)
3130  *
3131  * Indirect parameters: uap->path       Directory path
3132  *
3133  * Returns:     0                       Success
3134  *              common_chdir: ENOTDIR
3135  *              common_chdir: ENOENT    No such file or directory
3136  *              common_chdir: ???
3137  *
3138  */
3139 int
3140 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3141 {
3142         return common_chdir(p, (void *)uap, 0);
3143 }
3144
3145 /*
3146  * __pthread_chdir
3147  *
3148  * Change current working directory (".") for a single thread
3149  *
3150  * Parameters:  p       Process requesting the call
3151  *              uap     User argument descriptor (see below)
3152  *              retval  (ignored)
3153  *
3154  * Indirect parameters: uap->path       Directory path
3155  *
3156  * Returns:     0                       Success
3157  *              common_chdir: ENOTDIR
3158  *              common_chdir: ENOENT    No such file or directory
3159  *              common_chdir: ???
3160  *
3161  */
3162 int
3163 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3164 {
3165         return common_chdir(p, (void *)uap, 1);
3166 }
3167
3168
3169 /*
3170  * Change notion of root (``/'') directory.
3171  */
3172 /* ARGSUSED */
3173 int
3174 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3175 {
3176         struct filedesc *fdp = p->p_fd;
3177         int error;
3178         struct nameidata nd;
3179         vnode_t tvp;
3180         vfs_context_t ctx = vfs_context_current();
3181
3182         if ((error = suser(kauth_cred_get(), &p->p_acflag)))
3183                 return (error);
3184
3185         NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3186                 UIO_USERSPACE, uap->path, ctx);
3187         error = change_dir(&nd, ctx);
3188         if (error)
3189                 return (error);
3190
3191 #if CONFIG_MACF
3192         error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3193             &nd.ni_cnd);
3194         if (error) {
3195                 vnode_put(nd.ni_vp);
3196                 return (error);
3197         }
3198 #endif
3199
3200         if ( (error = vnode_ref(nd.ni_vp)) ) {
3201                 vnode_put(nd.ni_vp);
3202                 return (error);
3203         }
3204         vnode_put(nd.ni_vp);
3205
3206         proc_fdlock(p);
3207         tvp = fdp->fd_rdir;
3208         fdp->fd_rdir = nd.ni_vp;
3209         fdp->fd_flags |= FD_CHROOT;
3210         proc_fdunlock(p);
3211
3212         if (tvp != NULL)
3213                 vnode_rele(tvp);
3214
3215         return (0);
3216 }
3217
3218 /*
3219  * Common routine for chroot and chdir.
3220  *
3221  * Returns:     0                       Success
3222  *              ENOTDIR                 Not a directory
3223  *              namei:???               [anything namei can return]
3224  *              vnode_authorize:???     [anything vnode_authorize can return]
3225  */
3226 static int
3227 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3228 {
3229         vnode_t vp;
3230         int error;
3231
3232         if ((error = namei(ndp)))
3233                 return (error);
3234         nameidone(ndp);
3235         vp = ndp->ni_vp;
3236
3237         if (vp->v_type != VDIR) {
3238                 vnode_put(vp);
3239                 return (ENOTDIR);
3240         }
3241
3242 #if CONFIG_MACF
3243         error = mac_vnode_check_chdir(ctx, vp);
3244         if (error) {
3245                 vnode_put(vp);
3246                 return (error);
3247         }
3248 #endif
3249
3250         error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3251         if (error) {
3252                 vnode_put(vp);
3253                 return (error);
3254         }
3255
3256         return (error);
3257 }
3258
3259 /*
3260  * Free the vnode data (for directories) associated with the file glob.
3261  */
3262 struct fd_vn_data *
3263 fg_vn_data_alloc(void)
3264 {
3265         struct fd_vn_data *fvdata;
3266
3267         /* Allocate per fd vnode data */
3268         MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3269                M_FD_VN_DATA, M_WAITOK | M_ZERO);
3270         lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3271         return fvdata;
3272 }
3273
3274 /*
3275  * Free the vnode data (for directories) associated with the file glob.
3276  */
3277 void
3278 fg_vn_data_free(void *fgvndata)
3279 {
3280         struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3281
3282         if (fvdata->fv_buf)
3283                 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3284         lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3285         FREE(fvdata, M_FD_VN_DATA);
3286 }
3287
3288 /*
3289  * Check permissions, allocate an open file structure,
3290  * and call the device open routine if any.
3291  *
3292  * Returns:     0                       Success
3293  *              EINVAL
3294  *              EINTR
3295  *      falloc:ENFILE
3296  *      falloc:EMFILE
3297  *      falloc:ENOMEM
3298  *      vn_open_auth:???
3299  *      dupfdopen:???
3300  *      VNOP_ADVLOCK:???
3301  *      vnode_setsize:???
3302  *
3303  * XXX Need to implement uid, gid
3304  */
3305 int
3306 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3307     struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3308     int32_t *retval)
3309 {
3310         proc_t p = vfs_context_proc(ctx);
3311         uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3312         struct fileproc *fp;
3313         vnode_t vp;
3314         int flags, oflags;
3315         int type, indx, error;
3316         struct flock lf;
3317         struct vfs_context context;
3318
3319         oflags = uflags;
3320
3321         if ((oflags & O_ACCMODE) == O_ACCMODE)
3322                 return(EINVAL);
3323
3324         flags = FFLAGS(uflags);
3325         CLR(flags, FENCRYPTED);
3326         CLR(flags, FUNENCRYPTED);
3327
3328         AUDIT_ARG(fflags, oflags);
3329         AUDIT_ARG(mode, vap->va_mode);
3330
3331         if ((error = falloc_withalloc(p,
3332             &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3333                 return (error);
3334         }
3335         uu->uu_dupfd = -indx - 1;
3336
3337         if ((error = vn_open_auth(ndp, &flags, vap))) {
3338                 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){        /* XXX from fdopen */
3339                         if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3340                                 fp_drop(p, indx, NULL, 0);
3341                                 *retval = indx;
3342                                 return (0);
3343                         }
3344                 }
3345                 if (error == ERESTART)
3346                         error = EINTR;
3347                 fp_free(p, indx, fp);
3348                 return (error);
3349         }
3350         uu->uu_dupfd = 0;
3351         vp = ndp->ni_vp;
3352
3353         fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3354         fp->f_fglob->fg_ops = &vnops;
3355         fp->f_fglob->fg_data = (caddr_t)vp;
3356
3357         if (flags & (O_EXLOCK | O_SHLOCK)) {
3358                 lf.l_whence = SEEK_SET;
3359                 lf.l_start = 0;
3360                 lf.l_len = 0;
3361                 if (flags & O_EXLOCK)
3362                         lf.l_type = F_WRLCK;
3363                 else
3364                         lf.l_type = F_RDLCK;
3365                 type = F_FLOCK;
3366                 if ((flags & FNONBLOCK) == 0)
3367                         type |= F_WAIT;
3368 #if CONFIG_MACF
3369                 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3370                     F_SETLK, &lf);
3371                 if (error)
3372                         goto bad;
3373 #endif
3374                 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
3375                         goto bad;
3376                 fp->f_fglob->fg_flag |= FHASLOCK;
3377         }
3378
3379         /* try to truncate by setting the size attribute */
3380         if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
3381                 goto bad;
3382
3383         /*
3384          * For directories we hold some additional information in the fd.
3385          */
3386         if (vnode_vtype(vp) == VDIR) {
3387                 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3388         } else {
3389                 fp->f_fglob->fg_vn_data = NULL;
3390         }
3391
3392         vnode_put(vp);
3393
3394         /*
3395          * The first terminal open (without a O_NOCTTY) by a session leader
3396          * results in it being set as the controlling terminal.
3397          */
3398         if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3399             !(flags & O_NOCTTY)) {
3400                 int tmp = 0;
3401
3402                 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3403                     (caddr_t)&tmp, ctx);
3404         }
3405
3406         proc_fdlock(p);
3407         if (flags & O_CLOEXEC)
3408                 *fdflags(p, indx) |= UF_EXCLOSE;
3409         if (flags & O_CLOFORK)
3410                 *fdflags(p, indx) |= UF_FORKCLOSE;
3411         procfdtbl_releasefd(p, indx, NULL);
3412         fp_drop(p, indx, fp, 1);
3413         proc_fdunlock(p);
3414
3415         *retval = indx;
3416
3417         return (0);
3418 bad:
3419         context = *vfs_context_current();
3420         context.vc_ucred = fp->f_fglob->fg_cred;
3421
3422         if ((fp->f_fglob->fg_flag & FHASLOCK) &&
3423             (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
3424                 lf.l_whence = SEEK_SET;
3425                 lf.l_start = 0;
3426                 lf.l_len = 0;
3427                 lf.l_type = F_UNLCK;
3428
3429                 (void)VNOP_ADVLOCK(
3430                         vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
3431         }
3432
3433         vn_close(vp, fp->f_fglob->fg_flag, &context);
3434         vnode_put(vp);
3435         fp_free(p, indx, fp);
3436
3437         return (error);
3438 }
3439
3440 /*
3441  * While most of the *at syscall handlers can call nameiat() which
3442  * is a wrapper around namei, the use of namei and initialisation
3443  * of nameidata are far removed and in different functions  - namei
3444  * gets called in vn_open_auth for open1. So we'll just do here what
3445  * nameiat() does.
3446  */
3447 static int
3448 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3449     struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
3450     int dirfd)
3451 {
3452         if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
3453                 int error;
3454                 char c;
3455
3456                 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3457                         error = copyin(ndp->ni_dirp, &c, sizeof(char));
3458                         if (error)
3459                                 return (error);
3460                 } else {
3461                         c = *((char *)(ndp->ni_dirp));
3462                 }
3463
3464                 if (c != '/') {
3465                         vnode_t dvp_at;
3466
3467                         error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3468                             &dvp_at);
3469                         if (error)
3470                                 return (error);
3471
3472                         if (vnode_vtype(dvp_at) != VDIR) {
3473                                 vnode_put(dvp_at);
3474                                 return (ENOTDIR);
3475                         }
3476
3477                         ndp->ni_dvp = dvp_at;
3478                         ndp->ni_cnd.cn_flags |= USEDVP;
3479                         error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
3480                             retval);
3481                         vnode_put(dvp_at);
3482                         return (error);
3483                 }
3484         }
3485
3486         return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
3487 }
3488
3489 /*
3490  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3491  *
3492  * Parameters:  p                       Process requesting the open
3493  *              uap                     User argument descriptor (see below)
3494  *              retval                  Pointer to an area to receive the
3495  *                                      return calue from the system call
3496  *
3497  * Indirect:    uap->path               Path to open (same as 'open')
3498  *              uap->flags              Flags to open (same as 'open'
3499  *              uap->uid                UID to set, if creating
3500  *              uap->gid                GID to set, if creating
3501  *              uap->mode               File mode, if creating (same as 'open')
3502  *              uap->xsecurity          ACL to set, if creating
3503  *
3504  * Returns:     0                       Success
3505  *              !0                      errno value
3506  *
3507  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
3508  *
3509  * XXX:         We should enummerate the possible errno values here, and where
3510  *              in the code they originated.
3511  */
3512 int
3513 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3514 {
3515         struct filedesc *fdp = p->p_fd;
3516         int ciferror;
3517         kauth_filesec_t xsecdst;
3518         struct vnode_attr va;
3519         struct nameidata nd;
3520         int cmode;
3521
3522         AUDIT_ARG(owner, uap->uid, uap->gid);
3523
3524         xsecdst = NULL;
3525         if ((uap->xsecurity != USER_ADDR_NULL) &&
3526             ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
3527                 return ciferror;
3528
3529         VATTR_INIT(&va);
3530         cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3531         VATTR_SET(&va, va_mode, cmode);
3532         if (uap->uid != KAUTH_UID_NONE)
3533                 VATTR_SET(&va, va_uid, uap->uid);
3534         if (uap->gid != KAUTH_GID_NONE)
3535                 VATTR_SET(&va, va_gid, uap->gid);
3536         if (xsecdst != NULL)
3537                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3538
3539         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3540                uap->path, vfs_context_current());
3541
3542         ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3543                          fileproc_alloc_init, NULL, retval);
3544         if (xsecdst != NULL)
3545                 kauth_filesec_free(xsecdst);
3546
3547         return ciferror;
3548 }
3549
3550 /*
3551  * Go through the data-protected atomically controlled open (2)
3552  *
3553  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3554  */
3555 int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3556         int flags = uap->flags;
3557         int class = uap->class;
3558         int dpflags = uap->dpflags;
3559
3560         /*
3561          * Follow the same path as normal open(2)
3562          * Look up the item if it exists, and acquire the vnode.
3563          */
3564         struct filedesc *fdp = p->p_fd;
3565         struct vnode_attr va;
3566         struct nameidata nd;
3567         int cmode;
3568         int error;
3569
3570         VATTR_INIT(&va);
3571         /* Mask off all but regular access permissions */
3572         cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3573         VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3574
3575         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3576                uap->path, vfs_context_current());
3577
3578         /*
3579          * Initialize the extra fields in vnode_attr to pass down our
3580          * extra fields.
3581          * 1. target cprotect class.
3582          * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3583          */
3584         if (flags & O_CREAT) {
3585                /* lower level kernel code validates that the class is valid before applying it. */
3586                if (class != PROTECTION_CLASS_DEFAULT) {
3587                        /*
3588                         * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3589                         * file behave the same as open (2)
3590                         */
3591                        VATTR_SET(&va, va_dataprotect_class, class);
3592                }
3593         }
3594
3595         if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3596                 if ( flags & (O_RDWR | O_WRONLY)) {
3597                         /* Not allowed to write raw encrypted bytes */
3598                         return EINVAL;
3599                 }
3600                 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3601                     VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3602                 }
3603                 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3604                     VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3605                 }
3606         }
3607
3608         error = open1(vfs_context_current(), &nd, uap->flags, &va,
3609                       fileproc_alloc_init, NULL, retval);
3610
3611         return error;
3612 }
3613
3614 static int
3615 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
3616     int fd, enum uio_seg segflg, int *retval)
3617 {
3618         struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
3619         struct vnode_attr va;
3620         struct nameidata nd;
3621         int cmode;
3622
3623         VATTR_INIT(&va);
3624         /* Mask off all but regular access permissions */
3625         cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3626         VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3627
3628         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
3629             segflg, path, ctx);
3630
3631         return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
3632             retval, fd));
3633 }
3634
3635 int
3636 open(proc_t p, struct open_args *uap, int32_t *retval)
3637 {
3638         __pthread_testcancel(1);
3639         return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3640 }
3641
3642 int
3643 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3644     int32_t *retval)
3645 {
3646         return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3647             uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3648 }
3649
3650 int
3651 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3652                 int32_t *retval)
3653 {
3654         return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3655             uap->mode, uap->fd, UIO_USERSPACE, retval));
3656 }
3657
3658 int
3659 openat(proc_t p, struct openat_args *uap, int32_t *retval)
3660 {
3661         __pthread_testcancel(1);
3662         return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3663 }
3664
3665 /*
3666  * openbyid_np: open a file given a file system id and a file system object id
3667  *      the hfs file system object id is an fsobj_id_t {uint32, uint32}
3668  *      file systems that don't support object ids it is a node id (uint64_t).
3669  *
3670  * Parameters:  p                       Process requesting the open
3671  *              uap                     User argument descriptor (see below)
3672  *              retval                  Pointer to an area to receive the
3673  *                                      return calue from the system call
3674  *
3675  * Indirect:    uap->path               Path to open (same as 'open')
3676  *
3677  *              uap->fsid               id of target file system
3678  *              uap->objid              id of target file system object
3679  *              uap->flags              Flags to open (same as 'open')
3680  *
3681  * Returns:     0                       Success
3682  *              !0                      errno value
3683  *
3684  *
3685  * XXX:         We should enummerate the possible errno values here, and where
3686  *              in the code they originated.
3687  */
3688 int
3689 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
3690 {
3691         fsid_t fsid;
3692         uint64_t objid;
3693         int error;
3694         char *buf = NULL;
3695         int buflen = MAXPATHLEN;
3696         int pathlen = 0;
3697         vfs_context_t ctx = vfs_context_current();
3698
3699         if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
3700                 return (error);
3701         }
3702
3703         if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
3704                 return (error);
3705         }
3706
3707         /*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
3708         if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
3709                 return (error);
3710         }
3711
3712         AUDIT_ARG(value32, fsid.val[0]);
3713         AUDIT_ARG(value64, objid);
3714
3715         /*resolve path from fsis, objid*/
3716         do {
3717                 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
3718                 if (buf == NULL) {
3719                         return (ENOMEM);
3720                 }
3721
3722                 error = fsgetpath_internal(
3723                         ctx, fsid.val[0], objid,
3724                         buflen, buf, &pathlen);
3725
3726                 if (error) {
3727                         FREE(buf, M_TEMP);
3728                         buf = NULL;
3729                 }
3730         } while (error == ENOSPC && (buflen += MAXPATHLEN));
3731
3732         if (error) {
3733                 return error;
3734         }
3735
3736         buf[pathlen] = 0;
3737
3738         error = openat_internal(
3739                 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
3740
3741         FREE(buf, M_TEMP);
3742
3743         return error;
3744 }
3745
3746
3747 /*
3748  * Create a special file.
3749  */
3750 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
3751
3752 int
3753 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
3754 {
3755         struct vnode_attr va;
3756         vfs_context_t ctx = vfs_context_current();
3757         int error;
3758         struct nameidata nd;
3759         vnode_t vp, dvp;
3760
3761         VATTR_INIT(&va);
3762         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3763         VATTR_SET(&va, va_rdev, uap->dev);
3764
3765         /* If it's a mknod() of a FIFO, call mkfifo1() instead */
3766         if ((uap->mode & S_IFMT) == S_IFIFO)
3767                 return(mkfifo1(ctx, uap->path, &va));
3768
3769         AUDIT_ARG(mode, uap->mode);
3770         AUDIT_ARG(value32, uap->dev);
3771
3772         if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
3773                 return (error);
3774         NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
3775                 UIO_USERSPACE, uap->path, ctx);
3776         error = namei(&nd);
3777         if (error)
3778                 return (error);
3779         dvp = nd.ni_dvp;
3780         vp = nd.ni_vp;
3781
3782         if (vp != NULL) {
3783                 error = EEXIST;
3784                 goto out;
3785         }
3786
3787         switch (uap->mode & S_IFMT) {
3788         case S_IFCHR:
3789                 VATTR_SET(&va, va_type, VCHR);
3790                 break;
3791         case S_IFBLK:
3792                 VATTR_SET(&va, va_type, VBLK);
3793                 break;
3794         default:
3795                 error = EINVAL;
3796                 goto out;
3797         }
3798
3799 #if CONFIG_MACF
3800         error = mac_vnode_check_create(ctx,
3801             nd.ni_dvp, &nd.ni_cnd, &va);
3802         if (error)
3803                 goto out;
3804 #endif
3805
3806         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
3807                 goto out;
3808
3809         if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
3810                 goto out;
3811
3812         if (vp) {
3813                 int     update_flags = 0;
3814
3815                 // Make sure the name & parent pointers are hooked up
3816                 if (vp->v_name == NULL)
3817                         update_flags |= VNODE_UPDATE_NAME;
3818                 if (vp->v_parent == NULLVP)
3819                         update_flags |= VNODE_UPDATE_PARENT;
3820
3821                 if (update_flags)
3822                         vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
3823
3824 #if CONFIG_FSE
3825                 add_fsevent(FSE_CREATE_FILE, ctx,
3826                     FSE_ARG_VNODE, vp,
3827                     FSE_ARG_DONE);
3828 #endif
3829         }
3830
3831 out:
3832         /*
3833          * nameidone has to happen before we vnode_put(dvp)
3834          * since it may need to release the fs_nodelock on the dvp
3835          */
3836         nameidone(&nd);
3837
3838         if (vp)
3839                 vnode_put(vp);
3840         vnode_put(dvp);
3841
3842         return (error);
3843 }
3844
3845 /*
3846  * Create a named pipe.
3847  *
3848  * Returns:     0                       Success
3849  *              EEXIST
3850  *      namei:???
3851  *      vnode_authorize:???
3852  *      vn_create:???
3853  */
3854 static int
3855 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
3856 {
3857         vnode_t vp, dvp;
3858         int error;
3859         struct nameidata nd;
3860
3861         NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
3862                 UIO_USERSPACE, upath, ctx);
3863         error = namei(&nd);
3864         if (error)
3865                 return (error);
3866         dvp = nd.ni_dvp;
3867         vp = nd.ni_vp;
3868
3869         /* check that this is a new file and authorize addition */
3870         if (vp != NULL) {
3871                 error = EEXIST;
3872                 goto out;
3873         }
3874         VATTR_SET(vap, va_type, VFIFO);
3875
3876         if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
3877                 goto out;
3878
3879         error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
3880 out:
3881         /*
3882          * nameidone has to happen before we vnode_put(dvp)
3883          * since it may need to release the fs_nodelock on the dvp
3884          */
3885         nameidone(&nd);
3886
3887         if (vp)
3888                 vnode_put(vp);
3889         vnode_put(dvp);
3890
3891         return error;
3892 }
3893
3894
3895 /*
3896  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
3897  *
3898  * Parameters:  p                       Process requesting the open
3899  *              uap                     User argument descriptor (see below)
3900  *              retval                  (Ignored)
3901  *
3902  * Indirect:    uap->path               Path to fifo (same as 'mkfifo')
3903  *              uap->uid                UID to set
3904  *              uap->gid                GID to set
3905  *              uap->mode               File mode to set (same as 'mkfifo')
3906  *              uap->xsecurity          ACL to set, if creating
3907  *
3908  * Returns:     0                       Success
3909  *              !0                      errno value
3910  *
3911  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
3912  *
3913  * XXX:         We should enummerate the possible errno values here, and where
3914  *              in the code they originated.
3915  */
3916 int
3917 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
3918 {
3919         int ciferror;
3920         kauth_filesec_t xsecdst;
3921         struct vnode_attr va;
3922
3923         AUDIT_ARG(owner, uap->uid, uap->gid);
3924
3925         xsecdst = KAUTH_FILESEC_NONE;
3926         if (uap->xsecurity != USER_ADDR_NULL) {
3927                 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
3928                         return ciferror;
3929         }
3930
3931         VATTR_INIT(&va);
3932         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3933         if (uap->uid != KAUTH_UID_NONE)
3934                 VATTR_SET(&va, va_uid, uap->uid);
3935         if (uap->gid != KAUTH_GID_NONE)
3936                 VATTR_SET(&va, va_gid, uap->gid);
3937         if (xsecdst != KAUTH_FILESEC_NONE)
3938                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3939
3940         ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
3941
3942         if (xsecdst != KAUTH_FILESEC_NONE)
3943                 kauth_filesec_free(xsecdst);
3944         return ciferror;
3945 }
3946
3947 /* ARGSUSED */
3948 int
3949 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
3950 {
3951         struct vnode_attr va;
3952
3953         VATTR_INIT(&va);
3954         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3955
3956         return(mkfifo1(vfs_context_current(), uap->path, &va));
3957 }
3958
3959
/*
 * my_strrchr: return a pointer to the last character in the NUL-terminated
 * string 'p' that compares equal to 'ch', or NULL if there is none.
 *
 * The terminator itself is compared too, so asking for 0 returns a pointer
 * to the end of the string.
 */
static char *
my_strrchr(char *p, int ch)
{
	char *last = NULL;

	/* Compare first, test for the terminator afterwards. */
	do {
		if (*p == ch)
			last = p;
	} while (*p++ != '\0');

	return (last);
}
3973
3974 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
3975
3976 int
3977 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
3978 {
3979         int ret, len = _len;
3980
3981         *truncated_path = 0;
3982         ret = vn_getpath(dvp, path, &len);
3983         if (ret == 0 && len < (MAXPATHLEN - 1)) {
3984                 if (leafname) {
3985                         path[len-1] = '/';
3986                         len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
3987                         if (len > MAXPATHLEN) {
3988                                 char *ptr;
3989
3990                                 // the string got truncated!
3991                                 *truncated_path = 1;
3992                                 ptr = my_strrchr(path, '/');
3993                                 if (ptr) {
3994                                         *ptr = '\0';   // chop off the string at the last directory component
3995                                 }
3996                                 len = strlen(path) + 1;
3997                         }
3998                 }
3999         } else if (ret == 0) {
4000                 *truncated_path = 1;
4001         } else if (ret != 0) {
4002                 struct vnode *mydvp=dvp;
4003
4004                 if (ret != ENOSPC) {
4005                         printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4006                                dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4007                 }
4008                 *truncated_path = 1;
4009
4010                 do {
4011                         if (mydvp->v_parent != NULL) {
4012                                 mydvp = mydvp->v_parent;
4013                         } else if (mydvp->v_mount) {
4014                                 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4015                                 break;
4016                         } else {
4017                                 // no parent and no mount point?  only thing is to punt and say "/" changed
4018                                 strlcpy(path, "/", _len);
4019                                 len = 2;
4020                                 mydvp = NULL;
4021                         }
4022
4023                         if (mydvp == NULL) {
4024                                 break;
4025                         }
4026
4027                         len = _len;
4028                         ret = vn_getpath(mydvp, path, &len);
4029                 } while (ret == ENOSPC);
4030         }
4031
4032         return len;
4033 }
4034
4035
4036 /*
4037  * Make a hard file link.
4038  *
4039  * Returns:     0                       Success
4040  *              EPERM
4041  *              EEXIST
4042  *              EXDEV
4043  *      namei:???
4044  *      vnode_authorize:???
4045  *      VNOP_LINK:???
4046  */
4047 /* ARGSUSED */
4048 static int
4049 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4050     user_addr_t link, int flag, enum uio_seg segflg)
4051 {
4052         vnode_t vp, dvp, lvp;
4053         struct nameidata nd;
4054         int follow;
4055         int error;
4056 #if CONFIG_FSE
4057         fse_info finfo;
4058 #endif
4059         int need_event, has_listeners;
4060         char *target_path = NULL;
4061         int truncated=0;
4062
4063         vp = dvp = lvp = NULLVP;
4064
4065         /* look up the object we are linking to */
4066         follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4067         NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4068             segflg, path, ctx);
4069
4070         error = nameiat(&nd, fd1);
4071         if (error)
4072                 return (error);
4073         vp = nd.ni_vp;
4074
4075         nameidone(&nd);
4076
4077         /*
4078          * Normally, linking to directories is not supported.
4079          * However, some file systems may have limited support.
4080          */
4081         if (vp->v_type == VDIR) {
4082                 if (!(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) {
4083                         error = EPERM;   /* POSIX */
4084                         goto out;
4085                 }
4086                 /* Linking to a directory requires ownership. */
4087                 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4088                         struct vnode_attr dva;
4089
4090                         VATTR_INIT(&dva);
4091                         VATTR_WANTED(&dva, va_uid);
4092                         if (vnode_getattr(vp, &dva, ctx) != 0 ||
4093                             !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4094                             (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4095                                 error = EACCES;
4096                                 goto out;
4097                         }
4098                 }
4099         }
4100
4101         /* lookup the target node */
4102 #if CONFIG_TRIGGERS
4103         nd.ni_op = OP_LINK;
4104 #endif
4105         nd.ni_cnd.cn_nameiop = CREATE;
4106         nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4107         nd.ni_dirp = link;
4108         error = nameiat(&nd, fd2);
4109         if (error != 0)
4110                 goto out;
4111         dvp = nd.ni_dvp;
4112         lvp = nd.ni_vp;
4113
4114 #if CONFIG_MACF
4115         if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
4116                 goto out2;
4117 #endif
4118
4119         /* or to anything that kauth doesn't want us to (eg. immutable items) */
4120         if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
4121                 goto out2;
4122
4123         /* target node must not exist */
4124         if (lvp != NULLVP) {
4125                 error = EEXIST;
4126                 goto out2;
4127         }
4128         /* cannot link across mountpoints */
4129         if (vnode_mount(vp) != vnode_mount(dvp)) {
4130                 error = EXDEV;
4131                 goto out2;
4132         }
4133
4134         /* authorize creation of the target note */
4135         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4136                 goto out2;
4137
4138         /* and finally make the link */
4139         error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4140         if (error)
4141                 goto out2;
4142
4143 #if CONFIG_MACF
4144         (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4145 #endif
4146
4147 #if CONFIG_FSE
4148         need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4149 #else
4150         need_event = 0;
4151 #endif
4152         has_listeners = kauth_authorize_fileop_has_listeners();
4153
4154         if (need_event || has_listeners) {
4155                 char *link_to_path = NULL;
4156                 int len, link_name_len;
4157
4158                 /* build the path to the new link file */
4159                 GET_PATH(target_path);
4160                 if (target_path == NULL) {
4161                         error = ENOMEM;
4162                         goto out2;
4163                 }
4164
4165                 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4166
4167                 if (has_listeners) {
4168                         /* build the path to file we are linking to */
4169                         GET_PATH(link_to_path);
4170                         if (link_to_path == NULL) {
4171                                 error = ENOMEM;
4172                                 goto out2;
4173                         }
4174
4175                         link_name_len = MAXPATHLEN;
4176                         if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4177                                 /*
4178                                  * Call out to allow 3rd party notification of rename.
4179                                  * Ignore result of kauth_authorize_fileop call.
4180                                  */
4181                                 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4182                                                        (uintptr_t)link_to_path,
4183                                                        (uintptr_t)target_path);
4184                         }
4185                         if (link_to_path != NULL) {
4186                                 RELEASE_PATH(link_to_path);
4187                         }
4188                 }
4189 #if CONFIG_FSE
4190                 if (need_event) {
4191                         /* construct fsevent */
4192                         if (get_fse_info(vp, &finfo, ctx) == 0) {
4193                                 if (truncated) {
4194                                         finfo.mode |= FSE_TRUNCATED_PATH;
4195                                 }
4196
4197                                 // build the path to the destination of the link
4198                                 add_fsevent(FSE_CREATE_FILE, ctx,
4199                                             FSE_ARG_STRING, len, target_path,
4200                                             FSE_ARG_FINFO, &finfo,
4201                                             FSE_ARG_DONE);
4202                         }
4203                         if (vp->v_parent) {
4204                             add_fsevent(FSE_STAT_CHANGED, ctx,
4205                                 FSE_ARG_VNODE, vp->v_parent,
4206                                 FSE_ARG_DONE);
4207                         }
4208                 }
4209 #endif
4210         }
4211 out2:
4212         /*
4213          * nameidone has to happen before we vnode_put(dvp)
4214          * since it may need to release the fs_nodelock on the dvp
4215          */
4216         nameidone(&nd);
4217         if (target_path != NULL) {
4218                 RELEASE_PATH(target_path);
4219         }
4220 out:
4221         if (lvp)
4222                 vnode_put(lvp);
4223         if (dvp)
4224                 vnode_put(dvp);
4225         vnode_put(vp);
4226         return (error);
4227 }
4228
4229 int
4230 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4231 {
4232         return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4233             AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4234 }
4235
4236 int
4237 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4238 {
4239         if (uap->flag & ~AT_SYMLINK_FOLLOW)
4240                 return (EINVAL);
4241
4242         return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4243             uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4244 }
4245
4246 /*
4247  * Make a symbolic link.
4248  *
4249  * We could add support for ACLs here too...
4250  */
4251 /* ARGSUSED */
4252 static int
4253 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4254     user_addr_t link, enum uio_seg segflg)
4255 {
4256         struct vnode_attr va;
4257         char *path;
4258         int error;
4259         struct nameidata nd;
4260         vnode_t vp, dvp;
4261         uint32_t dfflags;       // Directory file flags
4262         size_t dummy=0;
4263         proc_t p;
4264
4265         error = 0;
4266         if (UIO_SEG_IS_USER_SPACE(segflg)) {
4267                 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4268                 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4269         } else {
4270                 path = (char *)path_data;
4271         }
4272         if (error)
4273                 goto out;
4274         AUDIT_ARG(text, path);  /* This is the link string */
4275
4276         NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4277             segflg, link, ctx);
4278
4279         error = nameiat(&nd, fd);
4280         if (error)
4281                 goto out;
4282         dvp = nd.ni_dvp;
4283         vp = nd.ni_vp;
4284
4285         p = vfs_context_proc(ctx);
4286         VATTR_INIT(&va);
4287         VATTR_SET(&va, va_type, VLNK);
4288         VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
4289
4290         /*
4291          * Handle inheritance of restricted flag
4292          */
4293         error = vnode_flags(dvp, &dfflags, ctx);
4294         if (error)
4295                 goto skipit;
4296         if (dfflags & SF_RESTRICTED)
4297                 VATTR_SET(&va, va_flags, SF_RESTRICTED);
4298
4299 #if CONFIG_MACF
4300         error = mac_vnode_check_create(ctx,
4301                         dvp, &nd.ni_cnd, &va);
4302 #endif
4303         if (error != 0) {
4304             goto skipit;
4305         }
4306
4307         if (vp != NULL) {
4308             error = EEXIST;
4309             goto skipit;
4310         }
4311
4312         /* authorize */
4313         if (error == 0)
4314                 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
4315         /* get default ownership, etc. */
4316         if (error == 0)
4317                 error = vnode_authattr_new(dvp, &va, 0, ctx);
4318         if (error == 0)
4319                 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
4320
4321 #if CONFIG_MACF
4322         if (error == 0 && vp)
4323                 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
4324 #endif
4325
4326         /* do fallback attribute handling */
4327         if (error == 0 && vp)
4328                 error = vnode_setattr_fallback(vp, &va, ctx);
4329
4330         if (error == 0) {
4331                 int     update_flags = 0;
4332
4333                 /*check if a new vnode was created, else try to get one*/
4334                 if (vp == NULL) {
4335                         nd.ni_cnd.cn_nameiop = LOOKUP;
4336 #if CONFIG_TRIGGERS
4337                         nd.ni_op = OP_LOOKUP;
4338 #endif
4339                         nd.ni_cnd.cn_flags = 0;
4340                         error = nameiat(&nd, fd);
4341                         vp = nd.ni_vp;
4342
4343                         if (vp == NULL)
4344                                 goto skipit;
4345                 }
4346
4347 #if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
4348                 /* call out to allow 3rd party notification of rename.
4349                  * Ignore result of kauth_authorize_fileop call.
4350                  */
4351                 if (kauth_authorize_fileop_has_listeners() &&
4352                     namei(&nd) == 0) {
4353                         char *new_link_path = NULL;
4354                         int             len;
4355
4356                         /* build the path to the new link file */
4357                         new_link_path = get_pathbuff();
4358                         len = MAXPATHLEN;
4359                         vn_getpath(dvp, new_link_path, &len);
4360                         if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
4361                                 new_link_path[len - 1] = '/';
4362                                 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
4363                         }
4364
4365                         kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
4366                                            (uintptr_t)path, (uintptr_t)new_link_path);
4367                         if (new_link_path != NULL)
4368                                 release_pathbuff(new_link_path);
4369                 }
4370 #endif
4371                 // Make sure the name & parent pointers are hooked up
4372                 if (vp->v_name == NULL)
4373                         update_flags |= VNODE_UPDATE_NAME;
4374                 if (vp->v_parent == NULLVP)
4375                         update_flags |= VNODE_UPDATE_PARENT;
4376
4377                 if (update_flags)
4378                         vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4379
4380 #if CONFIG_FSE
4381                 add_fsevent(FSE_CREATE_FILE, ctx,
4382                             FSE_ARG_VNODE, vp,
4383                             FSE_ARG_DONE);
4384 #endif
4385         }
4386
4387 skipit:
4388         /*
4389          * nameidone has to happen before we vnode_put(dvp)
4390          * since it may need to release the fs_nodelock on the dvp
4391          */
4392         nameidone(&nd);
4393
4394         if (vp)
4395                 vnode_put(vp);
4396         vnode_put(dvp);
4397 out:
4398         if (path && (path != (char *)path_data))
4399                 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
4400
4401         return (error);
4402 }
4403
4404 int
4405 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4406 {
4407         return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4408             uap->link, UIO_USERSPACE));
4409 }
4410
4411 int
4412 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4413     __unused int32_t *retval)
4414 {
4415         return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4416             uap->path2, UIO_USERSPACE));
4417 }
4418
4419 /*
4420  * Delete a whiteout from the filesystem.
4421  * No longer supported.
4422  */
4423 int
4424 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4425 {
4426         return (ENOTSUP);
4427 }
4428
4429 /*
4430  * Delete a name from the filesystem.
4431  */
4432 /* ARGSUSED */
4433 static int
4434 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4435     user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4436 {
4437         struct nameidata nd;
4438         vnode_t vp, dvp;
4439         int error;
4440         struct componentname *cnp;
4441         char  *path = NULL;
4442         int  len=0;
4443 #if CONFIG_FSE
4444         fse_info  finfo;
4445         struct vnode_attr va;
4446 #endif
4447         int flags;
4448         int need_event;
4449         int has_listeners;
4450         int truncated_path;
4451         int batched;
4452         struct vnode_attr *vap;
4453         int do_retry;
4454         int retry_count = 0;
4455         int cn_flags;
4456
4457         cn_flags = LOCKPARENT;
4458         if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
4459                 cn_flags |= AUDITVNPATH1;
4460         /* If a starting dvp is passed, it trumps any fd passed. */
4461         if (start_dvp)
4462                 cn_flags |= USEDVP;
4463
4464 #if NAMEDRSRCFORK
4465         /* unlink or delete is allowed on rsrc forks and named streams */
4466         cn_flags |= CN_ALLOWRSRCFORK;
4467 #endif
4468
4469 retry:
4470         do_retry = 0;
4471         flags = 0;
4472         need_event = 0;
4473         has_listeners = 0;
4474         truncated_path = 0;
4475         vap = NULL;
4476
4477         NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4478
4479         nd.ni_dvp = start_dvp;
4480         nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4481         cnp = &nd.ni_cnd;
4482
4483 lookup_continue:
4484         error = nameiat(&nd, fd);
4485         if (error)
4486                 return (error);
4487
4488         dvp = nd.ni_dvp;
4489         vp = nd.ni_vp;
4490
4491
4492         /* With Carbon delete semantics, busy files cannot be deleted */
4493         if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4494                 flags |= VNODE_REMOVE_NODELETEBUSY;
4495         }
4496
4497         /* Skip any potential upcalls if told to. */
4498         if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4499                 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4500         }
4501
4502         if (vp) {
4503                 batched = vnode_compound_remove_available(vp);
4504                 /*
4505                  * The root of a mounted filesystem cannot be deleted.
4506                  */
4507                 if (vp->v_flag & VROOT) {
4508                         error = EBUSY;
4509                 }
4510
4511                 if (!batched) {
4512                         error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4513                         if (error) {
4514                                 if (error == ENOENT) {
4515                                         assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4516                                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4517                                                 do_retry = 1;
4518                                                 retry_count++;
4519                                         }
4520                                 }
4521                                 goto out;
4522                         }
4523                 }
4524         } else {
4525                 batched = 1;
4526
4527                 if (!vnode_compound_remove_available(dvp)) {
4528                         panic("No vp, but no compound remove?");
4529                 }
4530         }
4531
4532 #if CONFIG_FSE
4533         need_event = need_fsevent(FSE_DELETE, dvp);
4534         if (need_event) {
4535                 if (!batched) {
4536                         if ((vp->v_flag & VISHARDLINK) == 0) {
4537                                 /* XXX need to get these data in batched VNOP */
4538                                 get_fse_info(vp, &finfo, ctx);
4539                         }
4540                 } else {
4541                         error = vfs_get_notify_attributes(&va);
4542                         if (error) {
4543                                 goto out;
4544                         }
4545
4546                         vap = &va;
4547                 }
4548         }
4549 #endif
4550         has_listeners = kauth_authorize_fileop_has_listeners();
4551         if (need_event || has_listeners) {
4552                 if (path == NULL) {
4553                         GET_PATH(path);
4554                         if (path == NULL) {
4555                                 error = ENOMEM;
4556                                 goto out;
4557                         }
4558                 }
4559                 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4560         }
4561
4562 #if NAMEDRSRCFORK
4563         if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4564                 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4565         else
4566 #endif
4567         {
4568                 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
4569                 vp = nd.ni_vp;
4570                 if (error == EKEEPLOOKING) {
4571                         if (!batched) {
4572                                 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4573                         }
4574
4575                         if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
4576                                 panic("EKEEPLOOKING, but continue flag not set?");
4577                         }
4578
4579                         if (vnode_isdir(vp)) {
4580                                 error = EISDIR;
4581                                 goto out;
4582                         }
4583                         goto lookup_continue;
4584                 } else if (error == ENOENT && batched) {
4585                         assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4586                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4587                                 /*
4588                                  * For compound VNOPs, the authorization callback may
4589                                  * return ENOENT in case of racing hardlink lookups
4590                                  * hitting the name  cache, redrive the lookup.
4591                                  */
4592                                 do_retry = 1;
4593                                 retry_count += 1;
4594                                 goto out;
4595                         }
4596                 }
4597         }
4598
4599         /*
4600          * Call out to allow 3rd party notification of delete.
4601          * Ignore result of kauth_authorize_fileop call.
4602          */
4603         if (!error) {
4604                 if (has_listeners) {
4605                         kauth_authorize_fileop(vfs_context_ucred(ctx),
4606                                 KAUTH_FILEOP_DELETE,
4607                                 (uintptr_t)vp,
4608                                 (uintptr_t)path);
4609                 }
4610
4611                 if (vp->v_flag & VISHARDLINK) {
4612                     //
4613                     // if a hardlink gets deleted we want to blow away the
4614                     // v_parent link because the path that got us to this
4615                     // instance of the link is no longer valid.  this will
4616                     // force the next call to get the path to ask the file
4617                     // system instead of just following the v_parent link.
4618                     //
4619                     vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4620                 }
4621
4622 #if CONFIG_FSE
4623                 if (need_event) {
4624                         if (vp->v_flag & VISHARDLINK) {
4625                                 get_fse_info(vp, &finfo, ctx);
4626                         } else if (vap) {
4627                                 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4628                         }
4629                         if (truncated_path) {
4630                                 finfo.mode |= FSE_TRUNCATED_PATH;
4631                         }
4632                         add_fsevent(FSE_DELETE, ctx,
4633                                                 FSE_ARG_STRING, len, path,
4634                                                 FSE_ARG_FINFO, &finfo,
4635                                                 FSE_ARG_DONE);
4636                 }
4637 #endif
4638         }
4639
4640 out:
4641         if (path != NULL)
4642                 RELEASE_PATH(path);
4643
4644 #if NAMEDRSRCFORK
4645         /* recycle the deleted rsrc fork vnode to force a reclaim, which
4646          * will cause its shadow file to go away if necessary.
4647          */
4648          if (vp && (vnode_isnamedstream(vp)) &&
4649                 (vp->v_parent != NULLVP) &&
4650                 vnode_isshadow(vp)) {
4651                         vnode_recycle(vp);
4652          }
4653 #endif
4654         /*
4655          * nameidone has to happen before we vnode_put(dvp)
4656          * since it may need to release the fs_nodelock on the dvp
4657          */
4658         nameidone(&nd);
4659         vnode_put(dvp);
4660         if (vp) {
4661                 vnode_put(vp);
4662         }
4663
4664         if (do_retry) {
4665                 goto retry;
4666         }
4667
4668         return (error);
4669 }
4670
4671 int
4672 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
4673     enum uio_seg segflg, int unlink_flags)
4674 {
4675         return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
4676             unlink_flags));
4677 }
4678
4679 /*
4680  * Delete a name from the filesystem using Carbon semantics.
4681  */
4682 int
4683 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
4684 {
4685         return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4686             uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
4687 }
4688
4689 /*
4690  * Delete a name from the filesystem using POSIX semantics.
4691  */
4692 int
4693 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
4694 {
4695         return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4696             uap->path, UIO_USERSPACE, 0));
4697 }
4698
4699 int
4700 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
4701 {
4702         if (uap->flag & ~AT_REMOVEDIR)
4703                 return (EINVAL);
4704
4705         if (uap->flag & AT_REMOVEDIR)
4706                 return (rmdirat_internal(vfs_context_current(), uap->fd,
4707                     uap->path, UIO_USERSPACE));
4708         else
4709                 return (unlinkat_internal(vfs_context_current(), uap->fd,
4710                     NULLVP, uap->path, UIO_USERSPACE, 0));
4711 }
4712
4713 /*
4714  * Reposition read/write file offset.
4715  */
4716 int
4717 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
4718 {
4719         struct fileproc *fp;
4720         vnode_t vp;
4721         struct vfs_context *ctx;
4722         off_t offset = uap->offset, file_size;
4723         int error;
4724
4725         if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
4726                 if (error == ENOTSUP)
4727                         return (ESPIPE);
4728                 return (error);
4729         }
4730         if (vnode_isfifo(vp)) {
4731                 file_drop(uap->fd);
4732                 return(ESPIPE);
4733         }
4734
4735
4736         ctx = vfs_context_current();
4737 #if CONFIG_MACF
4738         if (uap->whence == L_INCR && uap->offset == 0)
4739                 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
4740                     fp->f_fglob);
4741         else
4742                 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
4743                     fp->f_fglob);
4744         if (error) {
4745                 file_drop(uap->fd);
4746                 return (error);
4747         }
4748 #endif
4749         if ( (error = vnode_getwithref(vp)) ) {
4750                 file_drop(uap->fd);
4751                 return(error);
4752         }
4753
4754         switch (uap->whence) {
4755         case L_INCR:
4756                 offset += fp->f_fglob->fg_offset;
4757                 break;
4758         case L_XTND:
4759                 if ((error = vnode_size(vp, &file_size, ctx)) != 0)
4760                         break;
4761                 offset += file_size;
4762                 break;
4763         case L_SET:
4764                 break;
4765         default:
4766                 error = EINVAL;
4767         }
4768         if (error == 0) {
4769                 if (uap->offset > 0 && offset < 0) {
4770                         /* Incremented/relative move past max size */
4771                         error = EOVERFLOW;
4772                 } else {
4773                         /*
4774                          * Allow negative offsets on character devices, per
4775                          * POSIX 1003.1-2001.  Most likely for writing disk
4776                          * labels.
4777                          */
4778                         if (offset < 0 && vp->v_type != VCHR) {
4779                                 /* Decremented/relative move before start */
4780                                 error = EINVAL;
4781                         } else {
4782                                 /* Success */
4783                                 fp->f_fglob->fg_offset = offset;
4784                                 *retval = fp->f_fglob->fg_offset;
4785                         }
4786                 }
4787         }
4788
4789         /*
4790          * An lseek can affect whether data is "available to read."  Use
4791          * hint of NOTE_NONE so no EVFILT_VNODE events fire
4792          */
4793         post_event_if_success(vp, error, NOTE_NONE);
4794         (void)vnode_put(vp);
4795         file_drop(uap->fd);
4796         return (error);
4797 }
4798
4799
4800 /*
4801  * Check access permissions.
4802  *
4803  * Returns:     0                       Success
4804  *              vnode_authorize:???
4805  */
4806 static int
4807 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
4808 {
4809         kauth_action_t action;
4810         int error;
4811
4812         /*
4813          * If just the regular access bits, convert them to something
4814          * that vnode_authorize will understand.
4815          */
4816         if (!(uflags & _ACCESS_EXTENDED_MASK)) {
4817                 action = 0;
4818                 if (uflags & R_OK)
4819                         action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
4820                 if (uflags & W_OK) {
4821                         if (vnode_isdir(vp)) {
4822                                 action |= KAUTH_VNODE_ADD_FILE |
4823                                     KAUTH_VNODE_ADD_SUBDIRECTORY;
4824                                 /* might want delete rights here too */
4825                         } else {
4826                                 action |= KAUTH_VNODE_WRITE_DATA;
4827                         }
4828                 }
4829                 if (uflags & X_OK) {
4830                         if (vnode_isdir(vp)) {
4831                                 action |= KAUTH_VNODE_SEARCH;
4832                         } else {
4833                                 action |= KAUTH_VNODE_EXECUTE;
4834                         }
4835                 }
4836         } else {
4837                 /* take advantage of definition of uflags */
4838                 action = uflags >> 8;
4839         }
4840
4841 #if CONFIG_MACF
4842         error = mac_vnode_check_access(ctx, vp, uflags);
4843         if (error)
4844                 return (error);
4845 #endif /* MAC */
4846
4847         /* action == 0 means only check for existence */
4848         if (action != 0) {
4849                 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
4850         } else {
4851                 error = 0;
4852         }
4853
4854         return(error);
4855 }
4856
4857
4858
4859 /*
4860  * access_extended: Check access permissions in bulk.
4861  *
4862  * Description: uap->entries            Pointer to an array of accessx
4863  *                                      descriptor structs, plus one or
4864  *                                      more NULL terminated strings (see
4865  *                                      "Notes" section below).
4866  *              uap->size               Size of the area pointed to by
4867  *                                      uap->entries.
4868  *              uap->results            Pointer to the results array.
4869  *
4870  * Returns:     0                       Success
4871  *              ENOMEM                  Insufficient memory
4872  *              EINVAL                  Invalid arguments
4873  *              namei:EFAULT            Bad address
4874  *              namei:ENAMETOOLONG      Filename too long
4875  *              namei:ENOENT            No such file or directory
4876  *              namei:ELOOP             Too many levels of symbolic links
4877  *              namei:EBADF             Bad file descriptor
4878  *              namei:ENOTDIR           Not a directory
4879  *              namei:???
4880  *              access1:
4881  *
4882  * Implicit returns:
4883  *              uap->results            Array contents modified
4884  *
4885  * Notes:       The uap->entries are structured as an arbitrary length array
4886  *              of accessx descriptors, followed by one or more NULL terminated
4887  *              strings
4888  *
4889  *                      struct accessx_descriptor[0]
4890  *                      ...
4891  *                      struct accessx_descriptor[n]
4892  *                      char name_data[0];
4893  *
4894  *              We determine the entry count by walking the buffer containing
4895  *              the uap->entries argument descriptor.  For each descriptor we
4896  *              see, the valid values for the offset ad_name_offset will be
4897  *              in the byte range:
4898  *
4899  *                      [ uap->entries + sizeof(struct accessx_descriptor) ]
4900  *                                              to
4901  *                              [ uap->entries + uap->size - 2 ]
4902  *
4903  *              since we must have at least one string, and the string must
4904  *              be at least one character plus the NULL terminator in length.
4905  *
4906  * XXX:         Need to support the check-as uid argument
4907  */
4908 int
4909 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
4910 {
4911         struct accessx_descriptor *input = NULL;
4912         errno_t *result = NULL;
4913         errno_t error = 0;
4914         int wantdelete = 0;
4915         unsigned int desc_max, desc_actual, i, j;
4916         struct vfs_context context;
4917         struct nameidata nd;
4918         int niopts;
4919         vnode_t vp = NULL;
4920         vnode_t dvp = NULL;
4921 #define ACCESSX_MAX_DESCR_ON_STACK 10
4922         struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
4923
4924         context.vc_ucred = NULL;
4925
4926         /*
4927          * Validate parameters; if valid, copy the descriptor array and string
4928          * arguments into local memory.  Before proceeding, the following
4929          * conditions must have been met:
4930          *
4931          * o    The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
4932          * o    There must be sufficient room in the request for at least one
4933          *      descriptor and a one yte NUL terminated string.
4934          * o    The allocation of local storage must not fail.
4935          */
4936         if (uap->size > ACCESSX_MAX_TABLESIZE)
4937                 return(ENOMEM);
4938         if (uap->size < (sizeof(struct accessx_descriptor) + 2))
4939                 return(EINVAL);
4940         if (uap->size <= sizeof (stack_input)) {
4941                 input = stack_input;
4942         } else {
4943         MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
4944         if (input == NULL) {
4945                 error = ENOMEM;
4946                 goto out;
4947         }
4948         }
4949         error = copyin(uap->entries, input, uap->size);
4950         if (error)
4951                 goto out;
4952
4953         AUDIT_ARG(opaque, input, uap->size);
4954
4955         /*
4956          * Force NUL termination of the copyin buffer to avoid nami() running
4957          * off the end.  If the caller passes us bogus data, they may get a
4958          * bogus result.
4959          */
4960         ((char *)input)[uap->size - 1] = 0;
4961
4962         /*
4963          * Access is defined as checking against the process' real identity,
4964          * even if operations are checking the effective identity.  This
4965          * requires that we use a local vfs context.
4966          */
4967         context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
4968         context.vc_thread = current_thread();
4969
4970         /*
4971          * Find out how many entries we have, so we can allocate the result
4972          * array by walking the list and adjusting the count downward by the
4973          * earliest string offset we see.
4974          */
4975         desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
4976         desc_actual = desc_max;
4977         for (i = 0; i < desc_actual; i++) {
4978                 /*
4979                  * Take the offset to the name string for this entry and
4980                  * convert to an input array index, which would be one off
4981                  * the end of the array if this entry was the lowest-addressed
4982                  * name string.
4983                  */
4984                 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
4985
4986                 /*
4987                  * An offset greater than the max allowable offset is an error.
4988                  * It is also an error for any valid entry to point
4989                  * to a location prior to the end of the current entry, if
4990                  * it's not a reference to the string of the previous entry.
4991                  */
4992                 if (j > desc_max || (j != 0 && j <= i)) {
4993                         error = EINVAL;
4994                         goto out;
4995                 }
4996
4997                 /*
4998                  * An offset of 0 means use the previous descriptor's offset;
4999                  * this is used to chain multiple requests for the same file
5000                  * to avoid multiple lookups.
5001                  */
5002                 if (j == 0) {
5003                         /* This is not valid for the first entry */
5004                         if (i == 0) {
5005                                 error = EINVAL;
5006                                 goto out;
5007                         }
5008                         continue;
5009                 }
5010
5011                 /*
5012                  * If the offset of the string for this descriptor is before
5013                  * what we believe is the current actual last descriptor,
5014                  * then we need to adjust our estimate downward; this permits
5015                  * the string table following the last descriptor to be out
5016                  * of order relative to the descriptor list.
5017                  */
5018                 if (j < desc_actual)
5019                         desc_actual = j;
5020         }
5021
5022         /*
5023          * We limit the actual number of descriptors we are willing to process
5024          * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
5025          * requested does not exceed this limit,
5026          */
5027         if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5028                 error = ENOMEM;
5029                 goto out;
5030         }
5031         MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
5032         if (result == NULL) {
5033                 error = ENOMEM;
5034                 goto out;
5035         }
5036
5037         /*
5038          * Do the work by iterating over the descriptor entries we know to
5039          * at least appear to contain valid data.
5040          */
5041         error = 0;
5042         for (i = 0; i < desc_actual; i++) {
5043                 /*
5044                  * If the ad_name_offset is 0, then we use the previous
5045                  * results to make the check; otherwise, we are looking up
5046                  * a new file name.
5047                  */
5048                 if (input[i].ad_name_offset != 0) {
5049                         /* discard old vnodes */
5050                         if (vp) {
5051                                 vnode_put(vp);
5052                                 vp = NULL;
5053                         }
5054                         if (dvp) {
5055                                 vnode_put(dvp);
5056                                 dvp = NULL;
5057                         }
5058
5059                         /*
5060                          * Scan forward in the descriptor list to see if we
5061                          * need the parent vnode.  We will need it if we are
5062                          * deleting, since we must have rights  to remove
5063                          * entries in the parent directory, as well as the
5064                          * rights to delete the object itself.
5065                          */
5066                         wantdelete = input[i].ad_flags & _DELETE_OK;
5067                         for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
5068                                 if (input[j].ad_flags & _DELETE_OK)
5069                                         wantdelete = 1;
5070
5071                         niopts = FOLLOW | AUDITVNPATH1;
5072
5073                         /* need parent for vnode_authorize for deletion test */
5074                         if (wantdelete)
5075                                 niopts |= WANTPARENT;
5076
5077                         /* do the lookup */
5078                         NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5079                                CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5080                                &context);
5081                         error = namei(&nd);
5082                         if (!error) {
5083                                 vp = nd.ni_vp;
5084                                 if (wantdelete)
5085                                         dvp = nd.ni_dvp;
5086                         }
5087                         nameidone(&nd);
5088                 }
5089
5090                 /*
5091                  * Handle lookup errors.
5092                  */
5093                 switch(error) {
5094                 case ENOENT:
5095                 case EACCES:
5096                 case EPERM:
5097                 case ENOTDIR:
5098                         result[i] = error;
5099                         break;
5100                 case 0:
5101                         /* run this access check */
5102                         result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5103                         break;
5104                 default:
5105                         /* fatal lookup error */
5106
5107                         goto out;
5108                 }
5109         }
5110
5111         AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5112
5113         /* copy out results */
5114         error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5115
5116 out:
5117         if (input && input != stack_input)
5118                 FREE(input, M_TEMP);
5119         if (result)
5120                 FREE(result, M_TEMP);
5121         if (vp)
5122                 vnode_put(vp);
5123         if (dvp)
5124                 vnode_put(dvp);
5125         if (IS_VALID_CRED(context.vc_ucred))
5126                 kauth_cred_unref(&context.vc_ucred);
5127         return(error);
5128 }
5129
5130
5131 /*
5132  * Returns:     0                       Success
5133  *              namei:EFAULT            Bad address
5134  *              namei:ENAMETOOLONG      Filename too long
5135  *              namei:ENOENT            No such file or directory
5136  *              namei:ELOOP             Too many levels of symbolic links
5137  *              namei:EBADF             Bad file descriptor
5138  *              namei:ENOTDIR           Not a directory
5139  *              namei:???
5140  *              access1:
5141  */
5142 static int
5143 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5144     int flag, enum uio_seg segflg)
5145 {
5146         int error;
5147         struct nameidata nd;
5148         int niopts;
5149         struct vfs_context context;
5150 #if NAMEDRSRCFORK
5151         int is_namedstream = 0;
5152 #endif
5153
5154         /*
5155          * Unless the AT_EACCESS option is used, Access is defined as checking
5156          * against the process' real identity, even if operations are checking
5157          * the effective identity.  So we need to tweak the credential
5158          * in the context for that case.
5159          */
5160         if (!(flag & AT_EACCESS))
5161                 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5162         else
5163                 context.vc_ucred = ctx->vc_ucred;
5164         context.vc_thread = ctx->vc_thread;
5165
5166
5167         niopts = FOLLOW | AUDITVNPATH1;
5168         /* need parent for vnode_authorize for deletion test */
5169         if (amode & _DELETE_OK)
5170                 niopts |= WANTPARENT;
5171         NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5172                path, &context);
5173
5174 #if NAMEDRSRCFORK
5175         /* access(F_OK) calls are allowed for resource forks. */
5176         if (amode == F_OK)
5177                 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5178 #endif
5179         error = nameiat(&nd, fd);
5180         if (error)
5181                 goto out;
5182
5183 #if NAMEDRSRCFORK
5184         /* Grab reference on the shadow stream file vnode to
5185          * force an inactive on release which will mark it
5186          * for recycle.
5187          */
5188         if (vnode_isnamedstream(nd.ni_vp) &&
5189             (nd.ni_vp->v_parent != NULLVP) &&
5190             vnode_isshadow(nd.ni_vp)) {
5191                 is_namedstream = 1;
5192                 vnode_ref(nd.ni_vp);
5193         }
5194 #endif
5195
5196         error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5197
5198 #if NAMEDRSRCFORK
5199         if (is_namedstream) {
5200                 vnode_rele(nd.ni_vp);
5201         }
5202 #endif
5203
5204         vnode_put(nd.ni_vp);
5205         if (amode & _DELETE_OK)
5206                 vnode_put(nd.ni_dvp);
5207         nameidone(&nd);
5208
5209 out:
5210         if (!(flag & AT_EACCESS))
5211                 kauth_cred_unref(&context.vc_ucred);
5212         return (error);
5213 }
5214
5215 int
5216 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5217 {
5218         return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5219             uap->path, uap->flags, 0, UIO_USERSPACE));
5220 }
5221
5222 int
5223 faccessat(__unused proc_t p, struct faccessat_args *uap,
5224           __unused int32_t *retval)
5225 {
5226         if (uap->flag & ~AT_EACCESS)
5227                 return (EINVAL);
5228
5229         return (faccessat_internal(vfs_context_current(), uap->fd,
5230             uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5231 }
5232
5233 /*
5234  * Returns:     0                       Success
5235  *              EFAULT
5236  *      copyout:EFAULT
5237  *      namei:???
5238  *      vn_stat:???
5239  */
5240 static int
5241 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5242     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5243     enum uio_seg segflg, int fd, int flag)
5244 {
5245         struct nameidata nd;
5246         int follow;
5247         union {
5248                 struct stat sb;
5249                 struct stat64 sb64;
5250         } source;
5251         union {
5252                 struct user64_stat user64_sb;
5253                 struct user32_stat user32_sb;
5254                 struct user64_stat64 user64_sb64;
5255                 struct user32_stat64 user32_sb64;
5256         } dest;
5257         caddr_t sbp;
5258         int error, my_size;
5259         kauth_filesec_t fsec;
5260         size_t xsecurity_bufsize;
5261         void * statptr;
5262
5263         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5264         NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5265             segflg, path, ctx);
5266
5267 #if NAMEDRSRCFORK
5268         int is_namedstream = 0;
5269         /* stat calls are allowed for resource forks. */
5270         nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5271 #endif
5272         error = nameiat(&nd, fd);
5273         if (error)
5274                 return (error);
5275         fsec = KAUTH_FILESEC_NONE;
5276
5277         statptr = (void *)&source;
5278
5279 #if NAMEDRSRCFORK
5280         /* Grab reference on the shadow stream file vnode to
5281          * force an inactive on release which will mark it
5282          * for recycle.
5283          */
5284         if (vnode_isnamedstream(nd.ni_vp) &&
5285             (nd.ni_vp->v_parent != NULLVP) &&
5286             vnode_isshadow(nd.ni_vp)) {
5287                 is_namedstream = 1;
5288                 vnode_ref(nd.ni_vp);
5289         }
5290 #endif
5291
5292         error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5293
5294 #if NAMEDRSRCFORK
5295         if (is_namedstream) {
5296                 vnode_rele(nd.ni_vp);
5297         }
5298 #endif
5299         vnode_put(nd.ni_vp);
5300         nameidone(&nd);
5301
5302         if (error)
5303                 return (error);
5304         /* Zap spare fields */
5305         if (isstat64 != 0) {
5306                 source.sb64.st_lspare = 0;
5307                 source.sb64.st_qspare[0] = 0LL;
5308                 source.sb64.st_qspare[1] = 0LL;
5309                 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5310                         munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5311                         my_size = sizeof(dest.user64_sb64);
5312                         sbp = (caddr_t)&dest.user64_sb64;
5313                 } else {
5314                         munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5315                         my_size = sizeof(dest.user32_sb64);
5316                         sbp = (caddr_t)&dest.user32_sb64;
5317                 }
5318                 /*
5319                  * Check if we raced (post lookup) against the last unlink of a file.
5320                  */
5321                 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5322                         source.sb64.st_nlink = 1;
5323                 }
5324         } else {
5325                 source.sb.st_lspare = 0;
5326                 source.sb.st_qspare[0] = 0LL;
5327                 source.sb.st_qspare[1] = 0LL;
5328                 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5329                         munge_user64_stat(&source.sb, &dest.user64_sb);
5330                         my_size = sizeof(dest.user64_sb);
5331                         sbp = (caddr_t)&dest.user64_sb;
5332                 } else {
5333                         munge_user32_stat(&source.sb, &dest.user32_sb);
5334                         my_size = sizeof(dest.user32_sb);
5335                         sbp = (caddr_t)&dest.user32_sb;
5336                 }
5337
5338                 /*
5339                  * Check if we raced (post lookup) against the last unlink of a file.
5340                  */
5341                 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5342                         source.sb.st_nlink = 1;
5343                 }
5344         }
5345         if ((error = copyout(sbp, ub, my_size)) != 0)
5346                 goto out;
5347
5348         /* caller wants extended security information? */
5349         if (xsecurity != USER_ADDR_NULL) {
5350
5351                 /* did we get any? */
5352                 if (fsec == KAUTH_FILESEC_NONE) {
5353                         if (susize(xsecurity_size, 0) != 0) {
5354                                 error = EFAULT;
5355                                 goto out;
5356                         }
5357                 } else {
5358                         /* find the user buffer size */
5359                         xsecurity_bufsize = fusize(xsecurity_size);
5360
5361                         /* copy out the actual data size */
5362                         if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5363                                 error = EFAULT;
5364                                 goto out;
5365                         }
5366
5367                         /* if the caller supplied enough room, copy out to it */
5368                         if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5369                                 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5370                 }
5371         }
5372 out:
5373         if (fsec != KAUTH_FILESEC_NONE)
5374                 kauth_filesec_free(fsec);
5375         return (error);
5376 }
5377
5378 /*
5379  * stat_extended: Get file status; with extended security (ACL).
5380  *
5381  * Parameters:    p                       (ignored)
5382  *                uap                     User argument descriptor (see below)
5383  *                retval                  (ignored)
5384  *
5385  * Indirect:      uap->path               Path of file to get status from
5386  *                uap->ub                 User buffer (holds file status info)
5387  *                uap->xsecurity          ACL to get (extended security)
5388  *                uap->xsecurity_size     Size of ACL
5389  *
5390  * Returns:        0                      Success
5391  *                !0                      errno value
5392  *
5393  */
5394 int
5395 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5396     __unused int32_t *retval)
5397 {
5398         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5399             uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5400             0));
5401 }
5402
5403 /*
5404  * Returns:     0                       Success
5405  *      fstatat_internal:???            [see fstatat_internal() in this file]
5406  */
5407 int
5408 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5409 {
5410         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5411             0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5412 }
5413
5414 int
5415 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5416 {
5417         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5418             0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5419 }
5420
5421 /*
5422  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5423  *
5424  * Parameters:    p                       (ignored)
5425  *                uap                     User argument descriptor (see below)
5426  *                retval                  (ignored)
5427  *
5428  * Indirect:      uap->path               Path of file to get status from
5429  *                uap->ub                 User buffer (holds file status info)
5430  *                uap->xsecurity          ACL to get (extended security)
5431  *                uap->xsecurity_size     Size of ACL
5432  *
5433  * Returns:        0                      Success
5434  *                !0                      errno value
5435  *
5436  */
5437 int
5438 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5439 {
5440         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5441             uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5442             0));
5443 }
5444
5445 /*
5446  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5447  *
5448  * Parameters:    p                       (ignored)
5449  *                uap                     User argument descriptor (see below)
5450  *                retval                  (ignored)
5451  *
5452  * Indirect:      uap->path               Path of file to get status from
5453  *                uap->ub                 User buffer (holds file status info)
5454  *                uap->xsecurity          ACL to get (extended security)
5455  *                uap->xsecurity_size     Size of ACL
5456  *
5457  * Returns:        0                      Success
5458  *                !0                      errno value
5459  *
5460  */
5461 int
5462 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5463 {
5464         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5465             uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5466             AT_SYMLINK_NOFOLLOW));
5467 }
5468
5469 /*
5470  * Get file status; this version does not follow links.
5471  */
5472 int
5473 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5474 {
5475         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5476             0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5477 }
5478
5479 int
5480 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5481 {
5482         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5483             0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5484 }
5485
5486 /*
5487  * lstat64_extended: Get file status; can handle large inode numbers; does not
5488  * follow links; with extended security (ACL).
5489  *
5490  * Parameters:    p                       (ignored)
5491  *                uap                     User argument descriptor (see below)
5492  *                retval                  (ignored)
5493  *
5494  * Indirect:      uap->path               Path of file to get status from
5495  *                uap->ub                 User buffer (holds file status info)
5496  *                uap->xsecurity          ACL to get (extended security)
5497  *                uap->xsecurity_size     Size of ACL
5498  *
5499  * Returns:        0                      Success
5500  *                !0                      errno value
5501  *
5502  */
5503 int
5504 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5505 {
5506         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5507             uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5508             AT_SYMLINK_NOFOLLOW));
5509 }
5510
5511 int
5512 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5513 {
5514         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5515                 return (EINVAL);
5516
5517         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5518             0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5519 }
5520
5521 int
5522 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5523     __unused int32_t *retval)
5524 {
5525         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5526                 return (EINVAL);
5527
5528         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5529             0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5530 }
5531
5532 /*
5533  * Get configurable pathname variables.
5534  *
5535  * Returns:     0                       Success
5536  *      namei:???
5537  *      vn_pathconf:???
5538  *
5539  * Notes:       Global implementation  constants are intended to be
5540  *              implemented in this function directly; all other constants
5541  *              are per-FS implementation, and therefore must be handled in
5542  *              each respective FS, instead.
5543  *
5544  * XXX We implement some things globally right now that should actually be
5545  * XXX per-FS; we will need to deal with this at some point.
5546  */
5547 /* ARGSUSED */
5548 int
5549 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5550 {
5551         int error;
5552         struct nameidata nd;
5553         vfs_context_t ctx = vfs_context_current();
5554
5555         NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5556                 UIO_USERSPACE, uap->path, ctx);
5557         error = namei(&nd);
5558         if (error)
5559                 return (error);
5560
5561         error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5562
5563         vnode_put(nd.ni_vp);
5564         nameidone(&nd);
5565         return (error);
5566 }
5567
5568 /*
5569  * Return target name of a symbolic link.
5570  */
5571 /* ARGSUSED */
5572 static int
5573 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
5574     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
5575     int *retval)
5576 {
5577         vnode_t vp;
5578         uio_t auio;
5579         int error;
5580         struct nameidata nd;
5581         char uio_buf[ UIO_SIZEOF(1) ];
5582
5583         NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
5584             seg, path, ctx);
5585
5586         error = nameiat(&nd, fd);
5587         if (error)
5588                 return (error);
5589         vp = nd.ni_vp;
5590
5591         nameidone(&nd);
5592
5593         auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
5594                                     &uio_buf[0], sizeof(uio_buf));
5595         uio_addiov(auio, buf, bufsize);
5596         if (vp->v_type != VLNK) {
5597                 error = EINVAL;
5598         } else {
5599 #if CONFIG_MACF
5600                 error = mac_vnode_check_readlink(ctx, vp);
5601 #endif
5602                 if (error == 0)
5603                         error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
5604                                                 ctx);
5605                 if (error == 0)
5606                         error = VNOP_READLINK(vp, auio, ctx);
5607         }
5608         vnode_put(vp);
5609
5610         *retval = bufsize - (int)uio_resid(auio);
5611         return (error);
5612 }
5613
5614 int
5615 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5616 {
5617         enum uio_seg procseg;
5618
5619         procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5620         return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5621             CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5622             uap->count, procseg, retval));
5623 }
5624
5625 int
5626 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5627 {
5628         enum uio_seg procseg;
5629
5630         procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5631         return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5632             procseg, uap->buf, uap->bufsize, procseg, retval));
5633 }
5634
5635 /*
5636  * Change file flags.
5637  */
5638 static int
5639 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
5640 {
5641         struct vnode_attr va;
5642         kauth_action_t action;
5643         int error;
5644
5645         VATTR_INIT(&va);
5646         VATTR_SET(&va, va_flags, flags);
5647
5648 #if CONFIG_MACF
5649         error = mac_vnode_check_setflags(ctx, vp, flags);
5650         if (error)
5651                 goto out;
5652 #endif
5653
5654         /* request authorisation, disregard immutability */
5655         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5656                 goto out;
5657         /*
5658          * Request that the auth layer disregard those file flags it's allowed to when
5659          * authorizing this operation; we need to do this in order to be able to
5660          * clear immutable flags.
5661          */
5662         if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
5663                 goto out;
5664         error = vnode_setattr(vp, &va, ctx);
5665
5666         if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
5667                 error = ENOTSUP;
5668         }
5669 out:
5670         vnode_put(vp);
5671         return(error);
5672 }
5673
5674 /*
5675  * Change flags of a file given a path name.
5676  */
5677 /* ARGSUSED */
5678 int
5679 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
5680 {
5681         vnode_t vp;
5682         vfs_context_t ctx = vfs_context_current();
5683         int error;
5684         struct nameidata nd;
5685
5686         AUDIT_ARG(fflags, uap->flags);
5687         NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5688                 UIO_USERSPACE, uap->path, ctx);
5689         error = namei(&nd);
5690         if (error)
5691                 return (error);
5692         vp = nd.ni_vp;
5693         nameidone(&nd);
5694
5695         error = chflags1(vp, uap->flags, ctx);
5696
5697         return(error);
5698 }
5699
5700 /*
5701  * Change flags of a file given a file descriptor.
5702  */
5703 /* ARGSUSED */
5704 int
5705 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
5706 {
5707         vnode_t vp;
5708         int error;
5709
5710         AUDIT_ARG(fd, uap->fd);
5711         AUDIT_ARG(fflags, uap->flags);
5712         if ( (error = file_vnode(uap->fd, &vp)) )
5713                 return (error);
5714
5715         if ((error = vnode_getwithref(vp))) {
5716                 file_drop(uap->fd);
5717                 return(error);
5718         }
5719
5720         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5721
5722         error = chflags1(vp, uap->flags, vfs_context_current());
5723
5724         file_drop(uap->fd);
5725         return (error);
5726 }
5727
5728 /*
5729  * Change security information on a filesystem object.
5730  *
5731  * Returns:     0                       Success
5732  *              EPERM                   Operation not permitted
5733  *              vnode_authattr:???      [anything vnode_authattr can return]
5734  *              vnode_authorize:???     [anything vnode_authorize can return]
5735  *              vnode_setattr:???       [anything vnode_setattr can return]
5736  *
5737  * Notes:       If vnode_authattr or vnode_authorize return EACCES, it will be
5738  *              translated to EPERM before being returned.
5739  */
5740 static int
5741 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
5742 {
5743         kauth_action_t action;
5744         int error;
5745
5746         AUDIT_ARG(mode, vap->va_mode);
5747         /* XXX audit new args */
5748
5749 #if NAMEDSTREAMS
5750         /* chmod calls are not allowed for resource forks. */
5751         if (vp->v_flag & VISNAMEDSTREAM) {
5752                 return (EPERM);
5753         }
5754 #endif
5755
5756 #if CONFIG_MACF
5757         if (VATTR_IS_ACTIVE(vap, va_mode) &&
5758             (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
5759                 return (error);
5760 #endif
5761
5762         /* make sure that the caller is allowed to set this security information */
5763         if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
5764             ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
5765                 if (error == EACCES)
5766                         error = EPERM;
5767                 return(error);
5768         }
5769
5770         error = vnode_setattr(vp, vap, ctx);
5771
5772         return (error);
5773 }
5774
5775
5776 /*
5777  * Change mode of a file given a path name.
5778  *
5779  * Returns:     0                       Success
5780  *              namei:???               [anything namei can return]
5781  *              chmod_vnode:???         [anything chmod_vnode can return]
5782  */
5783 static int
5784 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
5785     int fd, int flag, enum uio_seg segflg)
5786 {
5787         struct nameidata nd;
5788         int follow, error;
5789
5790         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5791         NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
5792             segflg, path, ctx);
5793         if ((error = nameiat(&nd, fd)))
5794                 return (error);
5795         error = chmod_vnode(ctx, nd.ni_vp, vap);
5796         vnode_put(nd.ni_vp);
5797         nameidone(&nd);
5798         return(error);
5799 }
5800
5801 /*
5802  * chmod_extended: Change the mode of a file given a path name; with extended
5803  * argument list (including extended security (ACL)).
5804  *
5805  * Parameters:  p                       Process requesting the open
5806  *              uap                     User argument descriptor (see below)
5807  *              retval                  (ignored)
5808  *
5809  * Indirect:    uap->path               Path to object (same as 'chmod')
5810  *              uap->uid                UID to set
5811  *              uap->gid                GID to set
5812  *              uap->mode               File mode to set (same as 'chmod')
5813  *              uap->xsecurity          ACL to set (or delete)
5814  *
5815  * Returns:     0                       Success
5816  *              !0                      errno value
5817  *
5818  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
5819  *
5820  * XXX:         We should enummerate the possible errno values here, and where
5821  *              in the code they originated.
5822  */
5823 int
5824 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
5825 {
5826         int error;
5827         struct vnode_attr va;
5828         kauth_filesec_t xsecdst;
5829
5830         AUDIT_ARG(owner, uap->uid, uap->gid);
5831
5832         VATTR_INIT(&va);
5833         if (uap->mode != -1)
5834                 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5835         if (uap->uid != KAUTH_UID_NONE)
5836                 VATTR_SET(&va, va_uid, uap->uid);
5837         if (uap->gid != KAUTH_GID_NONE)
5838                 VATTR_SET(&va, va_gid, uap->gid);
5839
5840         xsecdst = NULL;
5841         switch(uap->xsecurity) {
5842                 /* explicit remove request */
5843         case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
5844                 VATTR_SET(&va, va_acl, NULL);
5845                 break;
5846                 /* not being set */
5847         case USER_ADDR_NULL:
5848                 break;
5849         default:
5850                 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
5851                         return(error);
5852                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5853                 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
5854         }
5855
5856         error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
5857             UIO_USERSPACE);
5858
5859         if (xsecdst != NULL)
5860                 kauth_filesec_free(xsecdst);
5861         return(error);
5862 }
5863
5864 /*
5865  * Returns:     0                       Success
5866  *              chmodat:???             [anything chmodat can return]
5867  */
5868 static int
5869 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
5870     int flag, enum uio_seg segflg)
5871 {
5872         struct vnode_attr va;
5873
5874         VATTR_INIT(&va);
5875         VATTR_SET(&va, va_mode, mode & ALLPERMS);
5876
5877         return (chmodat(ctx, path, &va, fd, flag, segflg));
5878 }
5879
5880 int
5881 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
5882 {
5883         return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
5884             AT_FDCWD, 0, UIO_USERSPACE));
5885 }
5886
5887 int
5888 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
5889 {
5890         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5891                 return (EINVAL);
5892
5893         return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
5894             uap->fd, uap->flag, UIO_USERSPACE));
5895 }
5896
5897 /*
5898  * Change mode of a file given a file descriptor.
5899  */
5900 static int
5901 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
5902 {
5903         vnode_t vp;
5904         int error;
5905
5906         AUDIT_ARG(fd, fd);
5907
5908         if ((error = file_vnode(fd, &vp)) != 0)
5909                 return (error);
5910         if ((error = vnode_getwithref(vp)) != 0) {
5911                 file_drop(fd);
5912                 return(error);
5913         }
5914         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5915
5916         error = chmod_vnode(vfs_context_current(), vp, vap);
5917         (void)vnode_put(vp);
5918         file_drop(fd);
5919
5920         return (error);
5921 }
5922
5923 /*
5924  * fchmod_extended: Change mode of a file given a file descriptor; with
5925  * extended argument list (including extended security (ACL)).
5926  *
5927  * Parameters:    p                       Process requesting to change file mode
5928  *                uap                     User argument descriptor (see below)
5929  *                retval                  (ignored)
5930  *
5931  * Indirect:      uap->mode               File mode to set (same as 'chmod')
5932  *                uap->uid                UID to set
5933  *                uap->gid                GID to set
5934  *                uap->xsecurity          ACL to set (or delete)
5935  *                uap->fd                 File descriptor of file to change mode
5936  *
5937  * Returns:        0                      Success
5938  *                !0                      errno value
5939  *
5940  */
5941 int
5942 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
5943 {
5944         int error;
5945         struct vnode_attr va;
5946         kauth_filesec_t xsecdst;
5947
5948         AUDIT_ARG(owner, uap->uid, uap->gid);
5949
5950         VATTR_INIT(&va);
5951         if (uap->mode != -1)
5952                 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5953         if (uap->uid != KAUTH_UID_NONE)
5954                 VATTR_SET(&va, va_uid, uap->uid);
5955         if (uap->gid != KAUTH_GID_NONE)
5956                 VATTR_SET(&va, va_gid, uap->gid);
5957
5958         xsecdst = NULL;
5959         switch(uap->xsecurity) {
5960         case USER_ADDR_NULL:
5961                 VATTR_SET(&va, va_acl, NULL);
5962                 break;
5963         case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
5964                 VATTR_SET(&va, va_acl, NULL);
5965                 break;
5966                 /* not being set */
5967         case CAST_USER_ADDR_T(-1):
5968                 break;
5969         default:
5970                 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
5971                         return(error);
5972                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
5973         }
5974
5975         error = fchmod1(p, uap->fd, &va);
5976
5977
5978         switch(uap->xsecurity) {
5979         case USER_ADDR_NULL:
5980         case CAST_USER_ADDR_T(-1):
5981                 break;
5982         default:
5983                 if (xsecdst != NULL)
5984                         kauth_filesec_free(xsecdst);
5985         }
5986         return(error);
5987 }
5988
5989 int
5990 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
5991 {
5992         struct vnode_attr va;
5993
5994         VATTR_INIT(&va);
5995         VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
5996
5997         return(fchmod1(p, uap->fd, &va));
5998 }
5999
6000
6001 /*
6002  * Set ownership given a path name.
6003  */
6004 /* ARGSUSED */
6005 static int
6006 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6007    gid_t gid, int flag, enum uio_seg segflg)
6008 {
6009         vnode_t vp;
6010         struct vnode_attr va;
6011         int error;
6012         struct nameidata nd;
6013         int follow;
6014         kauth_action_t action;
6015
6016         AUDIT_ARG(owner, uid, gid);
6017
6018         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6019         NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6020             path, ctx);
6021         error = nameiat(&nd, fd);
6022         if (error)
6023                 return (error);
6024         vp = nd.ni_vp;
6025
6026         nameidone(&nd);
6027
6028         VATTR_INIT(&va);
6029         if (uid != (uid_t)VNOVAL)
6030                 VATTR_SET(&va, va_uid, uid);
6031         if (gid != (gid_t)VNOVAL)
6032                 VATTR_SET(&va, va_gid, gid);
6033
6034 #if CONFIG_MACF
6035         error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6036         if (error)
6037                 goto out;
6038 #endif
6039
6040         /* preflight and authorize attribute changes */
6041         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6042                 goto out;
6043         if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6044                 goto out;
6045         error = vnode_setattr(vp, &va, ctx);
6046
6047 out:
6048         /*
6049          * EACCES is only allowed from namei(); permissions failure should
6050          * return EPERM, so we need to translate the error code.
6051          */
6052         if (error == EACCES)
6053                 error = EPERM;
6054
6055         vnode_put(vp);
6056         return (error);
6057 }
6058
6059 int
6060 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6061 {
6062         return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6063             uap->uid, uap->gid, 0, UIO_USERSPACE));
6064 }
6065
6066 int
6067 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6068 {
6069         return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6070             uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6071 }
6072
6073 int
6074 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6075 {
6076         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6077                 return (EINVAL);
6078
6079         return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6080             uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6081 }
6082
6083 /*
6084  * Set ownership given a file descriptor.
6085  */
6086 /* ARGSUSED */
6087 int
6088 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
6089 {
6090         struct vnode_attr va;
6091         vfs_context_t ctx = vfs_context_current();
6092         vnode_t vp;
6093         int error;
6094         kauth_action_t action;
6095
6096         AUDIT_ARG(owner, uap->uid, uap->gid);
6097         AUDIT_ARG(fd, uap->fd);
6098
6099         if ( (error = file_vnode(uap->fd, &vp)) )
6100                 return (error);
6101
6102         if ( (error = vnode_getwithref(vp)) ) {
6103                 file_drop(uap->fd);
6104                 return(error);
6105         }
6106         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6107
6108         VATTR_INIT(&va);
6109         if (uap->uid != VNOVAL)
6110                 VATTR_SET(&va, va_uid, uap->uid);
6111         if (uap->gid != VNOVAL)
6112                 VATTR_SET(&va, va_gid, uap->gid);
6113
6114 #if NAMEDSTREAMS
6115         /* chown calls are not allowed for resource forks. */
6116         if (vp->v_flag & VISNAMEDSTREAM) {
6117                 error = EPERM;
6118                 goto out;
6119         }
6120 #endif
6121
6122 #if CONFIG_MACF
6123         error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
6124         if (error)
6125                 goto out;
6126 #endif
6127
6128         /* preflight and authorize attribute changes */
6129         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6130                 goto out;
6131         if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6132                 if (error == EACCES)
6133                         error = EPERM;
6134                 goto out;
6135         }
6136         error = vnode_setattr(vp, &va, ctx);
6137
6138 out:
6139         (void)vnode_put(vp);
6140         file_drop(uap->fd);
6141         return (error);
6142 }
6143
6144 static int
6145 getutimes(user_addr_t usrtvp, struct timespec *tsp)
6146 {
6147         int error;
6148
6149         if (usrtvp == USER_ADDR_NULL) {
6150                 struct timeval old_tv;
6151                 /* XXX Y2038 bug because of microtime argument */
6152                 microtime(&old_tv);
6153                 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6154                 tsp[1] = tsp[0];
6155         } else {
6156                 if (IS_64BIT_PROCESS(current_proc())) {
6157                         struct user64_timeval tv[2];
6158                         error = copyin(usrtvp, (void *)tv, sizeof(tv));
6159                         if (error)
6160                                 return (error);
6161                         TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6162                         TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6163                 } else {
6164                         struct user32_timeval tv[2];
6165                         error = copyin(usrtvp, (void *)tv, sizeof(tv));
6166                         if (error)
6167                                 return (error);
6168                         TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6169                         TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6170                 }
6171         }
6172         return 0;
6173 }
6174
6175 static int
6176 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
6177         int nullflag)
6178 {
6179         int error;
6180         struct vnode_attr va;
6181         kauth_action_t action;
6182
6183         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6184
6185         VATTR_INIT(&va);
6186         VATTR_SET(&va, va_access_time, ts[0]);
6187         VATTR_SET(&va, va_modify_time, ts[1]);
6188         if (nullflag)
6189                 va.va_vaflags |= VA_UTIMES_NULL;
6190
6191 #if NAMEDSTREAMS
6192         /* utimes calls are not allowed for resource forks. */
6193         if (vp->v_flag & VISNAMEDSTREAM) {
6194                 error = EPERM;
6195                 goto out;
6196         }
6197 #endif
6198
6199 #if CONFIG_MACF
6200         error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
6201         if (error)
6202                 goto out;
6203 #endif
6204         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6205                 if (!nullflag && error == EACCES)
6206                         error = EPERM;
6207                 goto out;
6208         }
6209
6210         /* since we may not need to auth anything, check here */
6211         if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6212                 if (!nullflag && error == EACCES)
6213                         error = EPERM;
6214                 goto out;
6215         }
6216         error = vnode_setattr(vp, &va, ctx);
6217
6218 out:
6219         return error;
6220 }
6221
6222 /*
6223  * Set the access and modification times of a file.
6224  */
6225 /* ARGSUSED */
6226 int
6227 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
6228 {
6229         struct timespec ts[2];
6230         user_addr_t usrtvp;
6231         int error;
6232         struct nameidata nd;
6233         vfs_context_t ctx = vfs_context_current();
6234
6235         /*
6236          * AUDIT: Needed to change the order of operations to do the
6237          * name lookup first because auditing wants the path.
6238          */
6239         NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6240                 UIO_USERSPACE, uap->path, ctx);
6241         error = namei(&nd);
6242         if (error)
6243                 return (error);
6244         nameidone(&nd);
6245
6246         /*
6247          * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
6248          * the current time instead.
6249          */
6250         usrtvp = uap->tptr;
6251         if ((error = getutimes(usrtvp, ts)) != 0)
6252                 goto out;
6253
6254         error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
6255
6256 out:
6257         vnode_put(nd.ni_vp);
6258         return (error);
6259 }
6260
6261 /*
6262  * Set the access and modification times of a file.
6263  */
6264 /* ARGSUSED */
6265 int
6266 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6267 {
6268         struct timespec ts[2];
6269         vnode_t vp;
6270         user_addr_t usrtvp;
6271         int error;
6272
6273         AUDIT_ARG(fd, uap->fd);
6274         usrtvp = uap->tptr;
6275         if ((error = getutimes(usrtvp, ts)) != 0)
6276                 return (error);
6277         if ((error = file_vnode(uap->fd, &vp)) != 0)
6278                 return (error);
6279         if((error = vnode_getwithref(vp))) {
6280                 file_drop(uap->fd);
6281                 return(error);
6282         }
6283
6284         error =  setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6285         vnode_put(vp);
6286         file_drop(uap->fd);
6287         return(error);
6288 }
6289
6290 /*
6291  * Truncate a file given its path name.
6292  */
6293 /* ARGSUSED */
6294 int
6295 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
6296 {
6297         vnode_t vp;
6298         struct vnode_attr va;
6299         vfs_context_t ctx = vfs_context_current();
6300         int error;
6301         struct nameidata nd;
6302         kauth_action_t action;
6303
6304         if (uap->length < 0)
6305                 return(EINVAL);
6306         NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
6307                 UIO_USERSPACE, uap->path, ctx);
6308         if ((error = namei(&nd)))
6309                 return (error);
6310         vp = nd.ni_vp;
6311
6312         nameidone(&nd);
6313
6314         VATTR_INIT(&va);
6315         VATTR_SET(&va, va_data_size, uap->length);
6316
6317 #if CONFIG_MACF
6318         error = mac_vnode_check_truncate(ctx, NOCRED, vp);
6319         if (error)
6320                 goto out;
6321 #endif
6322
6323         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6324                 goto out;
6325         if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6326                 goto out;
6327         error = vnode_setattr(vp, &va, ctx);
6328 out:
6329         vnode_put(vp);
6330         return (error);
6331 }
6332
6333 /*
6334  * Truncate a file given a file descriptor.
6335  */
6336 /* ARGSUSED */
6337 int
6338 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
6339 {
6340         vfs_context_t ctx = vfs_context_current();
6341         struct vnode_attr va;
6342         vnode_t vp;
6343         struct fileproc *fp;
6344         int error ;
6345         int fd = uap->fd;
6346
6347         AUDIT_ARG(fd, uap->fd);
6348         if (uap->length < 0)
6349                 return(EINVAL);
6350
6351         if ( (error = fp_lookup(p,fd,&fp,0)) ) {
6352                 return(error);
6353         }
6354
6355         switch (FILEGLOB_DTYPE(fp->f_fglob)) {
6356         case DTYPE_PSXSHM:
6357                 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
6358                 goto out;
6359         case DTYPE_VNODE:
6360                 break;
6361         default:
6362                 error = EINVAL;
6363                 goto out;
6364         }
6365
6366         vp = (vnode_t)fp->f_fglob->fg_data;
6367
6368         if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
6369                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
6370                 error = EINVAL;
6371                 goto out;
6372         }
6373
6374         if ((error = vnode_getwithref(vp)) != 0) {
6375                 goto out;
6376         }
6377
6378         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6379
6380 #if CONFIG_MACF
6381         error = mac_vnode_check_truncate(ctx,
6382             fp->f_fglob->fg_cred, vp);
6383         if (error) {
6384                 (void)vnode_put(vp);
6385                 goto out;
6386         }
6387 #endif
6388         VATTR_INIT(&va);
6389         VATTR_SET(&va, va_data_size, uap->length);
6390         error = vnode_setattr(vp, &va, ctx);
6391         (void)vnode_put(vp);
6392 out:
6393         file_drop(fd);
6394         return (error);
6395 }
6396
6397
6398 /*
6399  * Sync an open file with synchronized I/O _file_ integrity completion
6400  */
6401 /* ARGSUSED */
6402 int
6403 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6404 {
6405         __pthread_testcancel(1);
6406         return(fsync_common(p, uap, MNT_WAIT));
6407 }
6408
6409
6410 /*
6411  * Sync an open file with synchronized I/O _file_ integrity completion
6412  *
6413  * Notes:       This is a legacy support function that does not test for
6414  *              thread cancellation points.
6415  */
6416 /* ARGSUSED */
6417 int
6418 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6419 {
6420         return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6421 }
6422
6423
6424 /*
6425  * Sync an open file with synchronized I/O _data_ integrity completion
6426  */
6427 /* ARGSUSED */
6428 int
6429 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6430 {
6431         __pthread_testcancel(1);
6432         return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6433 }
6434
6435
6436 /*
6437  * fsync_common
6438  *
6439  * Common fsync code to support both synchronized I/O file integrity completion
6440  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6441  *
6442  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6443  * will only guarantee that the file data contents are retrievable.  If
6444  * 'flags' is MNT_WAIT, the caller is rewuesting file integrity, which also
6445  * includes additional metadata unnecessary for retrieving the file data
6446  * contents, such as atime, mtime, ctime, etc., also be committed to stable
6447  * storage.
6448  *
6449  * Parameters:  p                               The process
6450  *              uap->fd                         The descriptor to synchronize
6451  *              flags                           The data integrity flags
6452  *
6453  * Returns:     int                             Success
6454  *      fp_getfvp:EBADF                         Bad file descriptor
6455  *      fp_getfvp:ENOTSUP                       fd does not refer to a vnode
6456  *      VNOP_FSYNC:???                          unspecified
6457  *
6458  * Notes:       We use struct fsync_args because it is a short name, and all
6459  *              caller argument structures are otherwise identical.
6460  */
6461 static int
6462 fsync_common(proc_t p, struct fsync_args *uap, int flags)
6463 {
6464         vnode_t vp;
6465         struct fileproc *fp;
6466         vfs_context_t ctx = vfs_context_current();
6467         int error;
6468
6469         AUDIT_ARG(fd, uap->fd);
6470
6471         if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
6472                 return (error);
6473         if ( (error = vnode_getwithref(vp)) ) {
6474                 file_drop(uap->fd);
6475                 return(error);
6476         }
6477
6478         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6479
6480         error = VNOP_FSYNC(vp, flags, ctx);
6481
6482 #if NAMEDRSRCFORK
6483         /* Sync resource fork shadow file if necessary. */
6484         if ((error == 0) &&
6485             (vp->v_flag & VISNAMEDSTREAM) &&
6486             (vp->v_parent != NULLVP) &&
6487             vnode_isshadow(vp) &&
6488             (fp->f_flags & FP_WRITTEN)) {
6489                 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
6490         }
6491 #endif
6492
6493         (void)vnode_put(vp);
6494         file_drop(uap->fd);
6495         return (error);
6496 }
6497
6498 /*
6499  * Duplicate files.  Source must be a file, target must be a file or
6500  * must not exist.
6501  *
6502  * XXX Copyfile authorisation checking is woefully inadequate, and will not
6503  *     perform inheritance correctly.
6504  */
6505 /* ARGSUSED */
6506 int
6507 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
6508 {
6509         vnode_t tvp, fvp, tdvp, sdvp;
6510         struct nameidata fromnd, tond;
6511         int error;
6512         vfs_context_t ctx = vfs_context_current();
6513
6514         /* Check that the flags are valid. */
6515
6516         if (uap->flags & ~CPF_MASK) {
6517                 return(EINVAL);
6518         }
6519
6520         NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
6521                 UIO_USERSPACE, uap->from, ctx);
6522         if ((error = namei(&fromnd)))
6523                 return (error);
6524         fvp = fromnd.ni_vp;
6525
6526         NDINIT(&tond, CREATE, OP_LINK,
6527                LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6528                UIO_USERSPACE, uap->to, ctx);
6529         if ((error = namei(&tond))) {
6530                 goto out1;
6531         }
6532         tdvp = tond.ni_dvp;
6533         tvp = tond.ni_vp;
6534
6535         if (tvp != NULL) {
6536                 if (!(uap->flags & CPF_OVERWRITE)) {
6537                         error = EEXIST;
6538                         goto out;
6539                 }
6540         }
6541         if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
6542                 error = EISDIR;
6543                 goto out;
6544         }
6545
6546         if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
6547                 goto out;
6548
6549         if (fvp == tdvp)
6550                 error = EINVAL;
6551         /*
6552          * If source is the same as the destination (that is the
6553          * same inode number) then there is nothing to do.
6554          * (fixed to have POSIX semantics - CSM 3/2/98)
6555          */
6556         if (fvp == tvp)
6557                 error = -1;
6558         if (!error)
6559                 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
6560 out:
6561         sdvp = tond.ni_startdir;
6562         /*
6563          * nameidone has to happen before we vnode_put(tdvp)
6564          * since it may need to release the fs_nodelock on the tdvp
6565          */
6566         nameidone(&tond);
6567
6568         if (tvp)
6569                 vnode_put(tvp);
6570         vnode_put(tdvp);
6571         vnode_put(sdvp);
6572 out1:
6573         vnode_put(fvp);
6574
6575         nameidone(&fromnd);
6576
6577         if (error == -1)
6578                 return (0);
6579         return (error);
6580 }
6581
6582
6583 /*
6584  * Rename files.  Source and destination must either both be directories,
6585  * or both not be directories.  If target is a directory, it must be empty.
6586  */
6587 /* ARGSUSED */
6588 static int
6589 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
6590     int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
6591 {
6592         vnode_t tvp, tdvp;
6593         vnode_t fvp, fdvp;
6594         struct nameidata *fromnd, *tond;
6595         int error;
6596         int do_retry;
6597         int retry_count;
6598         int mntrename;
6599         int need_event;
6600         const char *oname = NULL;
6601         char *from_name = NULL, *to_name = NULL;
6602         int from_len=0, to_len=0;
6603         int holding_mntlock;
6604         mount_t locked_mp = NULL;
6605         vnode_t oparent = NULLVP;
6606 #if CONFIG_FSE
6607         fse_info from_finfo, to_finfo;
6608 #endif
6609         int from_truncated=0, to_truncated;
6610         int batched = 0;
6611         struct vnode_attr *fvap, *tvap;
6612         int continuing = 0;
6613         /* carving out a chunk for structs that are too big to be on stack. */
6614         struct {
6615                 struct nameidata from_node, to_node;
6616                 struct vnode_attr fv_attr, tv_attr;
6617         } * __rename_data;
6618         MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
6619         fromnd = &__rename_data->from_node;
6620         tond = &__rename_data->to_node;
6621
6622         holding_mntlock = 0;
6623         do_retry = 0;
6624         retry_count = 0;
6625 retry:
6626         fvp = tvp = NULL;
6627         fdvp = tdvp = NULL;
6628         fvap = tvap = NULL;
6629         mntrename = FALSE;
6630
6631         NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
6632             segflg, from, ctx);
6633         fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
6634
6635         NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6636             segflg, to, ctx);
6637         tond->ni_flag = NAMEI_COMPOUNDRENAME;
6638
6639 continue_lookup:
6640         if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6641                 if ( (error = nameiat(fromnd, fromfd)) )
6642                         goto out1;
6643                 fdvp = fromnd->ni_dvp;
6644                 fvp  = fromnd->ni_vp;
6645
6646                 if (fvp && fvp->v_type == VDIR)
6647                         tond->ni_cnd.cn_flags |= WILLBEDIR;
6648         }
6649
6650         if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
6651                 if ( (error = nameiat(tond, tofd)) ) {
6652                         /*
6653                          * Translate error code for rename("dir1", "dir2/.").
6654                          */
6655                         if (error == EISDIR && fvp->v_type == VDIR)
6656                                 error = EINVAL;
6657                         goto out1;
6658                 }
6659                 tdvp = tond->ni_dvp;
6660                 tvp  = tond->ni_vp;
6661         }
6662
6663         batched = vnode_compound_rename_available(fdvp);
6664         if (!fvp) {
6665                 /*
6666                  * Claim: this check will never reject a valid rename.
6667                  * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
6668                  * Suppose fdvp and tdvp are not on the same mount.
6669                  * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
6670                  *      then you can't move it to within another dir on the same mountpoint.
6671                  * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
6672                  *
6673                  * If this check passes, then we are safe to pass these vnodes to the same FS.
6674                  */
6675                 if (fdvp->v_mount != tdvp->v_mount) {
6676                         error = EXDEV;
6677                         goto out1;
6678                 }
6679                 goto skipped_lookup;
6680         }
6681
6682         if (!batched) {
6683                 error = vn_authorize_rename(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, NULL);
6684                 if (error) {
6685                         if (error == ENOENT) {
6686                                 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
6687                                 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6688                                         /*
6689                                          * We encountered a race where after doing the namei, tvp stops
6690                                          * being valid. If so, simply re-drive the rename call from the
6691                                          * top.
6692                                          */
6693                                         do_retry = 1;
6694                                         retry_count += 1;
6695                                 }
6696                         }
6697                         goto out1;
6698                 }
6699         }
6700
6701         /*
6702          * If the source and destination are the same (i.e. they're
6703          * links to the same vnode) and the target file system is
6704          * case sensitive, then there is nothing to do.
6705          *
6706          * XXX Come back to this.
6707          */
6708         if (fvp == tvp) {
6709                 int pathconf_val;
6710
6711                 /*
6712                  * Note: if _PC_CASE_SENSITIVE selector isn't supported,
6713                  * then assume that this file system is case sensitive.
6714                  */
6715                 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
6716                     pathconf_val != 0) {
6717                         goto out1;
6718                 }
6719         }
6720
6721         /*
6722          * Allow the renaming of mount points.
6723          * - target must not exist
6724          * - target must reside in the same directory as source
6725          * - union mounts cannot be renamed
6726          * - "/" cannot be renamed
6727          *
6728          * XXX Handle this in VFS after a continued lookup (if we missed
6729          * in the cache to start off)
6730          */
6731         if ((fvp->v_flag & VROOT) &&
6732             (fvp->v_type == VDIR) &&
6733             (tvp == NULL)  &&
6734             (fvp->v_mountedhere == NULL)  &&
6735             (fdvp == tdvp)  &&
6736             ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0)  &&
6737             (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
6738                 vnode_t coveredvp;
6739
6740                 /* switch fvp to the covered vnode */
6741                 coveredvp = fvp->v_mount->mnt_vnodecovered;
6742                 if ( (vnode_getwithref(coveredvp)) ) {
6743                         error = ENOENT;
6744                         goto out1;
6745                 }
6746                 vnode_put(fvp);
6747
6748                 fvp = coveredvp;
6749                 mntrename = TRUE;
6750         }
6751         /*
6752          * Check for cross-device rename.
6753          */
6754         if ((fvp->v_mount != tdvp->v_mount) ||
6755             (tvp && (fvp->v_mount != tvp->v_mount))) {
6756                 error = EXDEV;
6757                 goto out1;
6758         }
6759
6760         /*
6761          * If source is the same as the destination (that is the
6762          * same inode number) then there is nothing to do...
6763          * EXCEPT if the underlying file system supports case
6764          * insensitivity and is case preserving.  In this case
6765          * the file system needs to handle the special case of
6766          * getting the same vnode as target (fvp) and source (tvp).
6767          *
6768          * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
6769          * and _PC_CASE_PRESERVING can have this exception, and they need to
6770          * handle the special case of getting the same vnode as target and
6771          * source.  NOTE: Then the target is unlocked going into vnop_rename,
6772          * so not to cause locking problems. There is a single reference on tvp.
6773          *
6774          * NOTE - that fvp == tvp also occurs if they are hard linked and
6775          * that correct behaviour then is just to return success without doing
6776          * anything.
6777          *
6778          * XXX filesystem should take care of this itself, perhaps...
6779          */
6780         if (fvp == tvp && fdvp == tdvp) {
6781                 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
6782                     !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
6783                           fromnd->ni_cnd.cn_namelen)) {
6784                         goto out1;
6785                 }
6786         }
6787
6788         if (holding_mntlock && fvp->v_mount != locked_mp) {
6789                 /*
6790                  * we're holding a reference and lock
6791                  * on locked_mp, but it no longer matches
6792                  * what we want to do... so drop our hold
6793                  */
6794                 mount_unlock_renames(locked_mp);
6795                 mount_drop(locked_mp, 0);
6796                 holding_mntlock = 0;
6797         }
6798         if (tdvp != fdvp && fvp->v_type == VDIR) {
6799                 /*
6800                  * serialize renames that re-shape
6801                  * the tree... if holding_mntlock is
6802                  * set, then we're ready to go...
6803                  * otherwise we
6804                  * first need to drop the iocounts
6805                  * we picked up, second take the
6806                  * lock to serialize the access,
6807                  * then finally start the lookup
6808                  * process over with the lock held
6809                  */
6810                 if (!holding_mntlock) {
6811                         /*
6812                          * need to grab a reference on
6813                          * the mount point before we
6814                          * drop all the iocounts... once
6815                          * the iocounts are gone, the mount
6816                          * could follow
6817                          */
6818                         locked_mp = fvp->v_mount;
6819                         mount_ref(locked_mp, 0);
6820
6821                         /*
6822                          * nameidone has to happen before we vnode_put(tvp)
6823                          * since it may need to release the fs_nodelock on the tvp
6824                          */
6825                         nameidone(tond);
6826
6827                         if (tvp)
6828                                 vnode_put(tvp);
6829                         vnode_put(tdvp);
6830
6831                         /*
6832                          * nameidone has to happen before we vnode_put(fdvp)
6833                          * since it may need to release the fs_nodelock on the fvp
6834                          */
6835                         nameidone(fromnd);
6836
6837                         vnode_put(fvp);
6838                         vnode_put(fdvp);
6839
6840                         mount_lock_renames(locked_mp);
6841                         holding_mntlock = 1;
6842
6843                         goto retry;
6844                 }
6845         } else {
6846                 /*
6847                  * when we dropped the iocounts to take
6848                  * the lock, we allowed the identity of
6849                  * the various vnodes to change... if they did,
6850                  * we may no longer be dealing with a rename
6851                  * that reshapes the tree... once we're holding
6852                  * the iocounts, the vnodes can't change type
6853                  * so we're free to drop the lock at this point
6854                  * and continue on
6855                  */
6856                 if (holding_mntlock) {
6857                         mount_unlock_renames(locked_mp);
6858                         mount_drop(locked_mp, 0);
6859                         holding_mntlock = 0;
6860                 }
6861         }
6862
6863         // save these off so we can later verify that fvp is the same
6864         oname   = fvp->v_name;
6865         oparent = fvp->v_parent;
6866
6867 skipped_lookup:
6868 #if CONFIG_FSE
6869         need_event = need_fsevent(FSE_RENAME, fdvp);
6870         if (need_event) {
6871                 if (fvp) {
6872                         get_fse_info(fvp, &from_finfo, ctx);
6873                 } else {
6874                         error = vfs_get_notify_attributes(&__rename_data->fv_attr);
6875                         if (error) {
6876                                 goto out1;
6877                         }
6878
6879                         fvap = &__rename_data->fv_attr;
6880                 }
6881
6882                 if (tvp) {
6883                         get_fse_info(tvp, &to_finfo, ctx);
6884                 } else if (batched) {
6885                         error = vfs_get_notify_attributes(&__rename_data->tv_attr);
6886                         if (error) {
6887                                 goto out1;
6888                         }
6889
6890                         tvap = &__rename_data->tv_attr;
6891                 }
6892         }
6893 #else
6894         need_event = 0;
6895 #endif /* CONFIG_FSE */
6896
6897         if (need_event || kauth_authorize_fileop_has_listeners()) {
6898                 if (from_name == NULL) {
6899                         GET_PATH(from_name);
6900                         if (from_name == NULL) {
6901                                 error = ENOMEM;
6902                                 goto out1;
6903                         }
6904                 }
6905
6906                 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
6907
6908                 if (to_name == NULL) {
6909                         GET_PATH(to_name);
6910                         if (to_name == NULL) {
6911                                 error = ENOMEM;
6912                                 goto out1;
6913                         }
6914                 }
6915
6916                 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
6917         }
6918 #if CONFIG_SECLUDED_RENAME
6919         if (flags & VFS_SECLUDE_RENAME) {
6920                 fromnd->ni_cnd.cn_flags |=  CN_SECLUDE_RENAME;
6921         }
6922 #else
6923         #pragma unused(flags)
6924 #endif
6925         error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
6926                             tdvp, &tvp, &tond->ni_cnd, tvap,
6927                             0, ctx);
6928
6929         if (holding_mntlock) {
6930                 /*
6931                  * we can drop our serialization
6932                  * lock now
6933                  */
6934                 mount_unlock_renames(locked_mp);
6935                 mount_drop(locked_mp, 0);
6936                 holding_mntlock = 0;
6937         }
6938         if (error) {
6939                 if (error == EKEEPLOOKING) {
6940                         if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6941                                 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
6942                                         panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
6943                                 }
6944                         }
6945
6946                         fromnd->ni_vp = fvp;
6947                         tond->ni_vp = tvp;
6948
6949                         goto continue_lookup;
6950                 }
6951
6952                 /*
6953                  * We may encounter a race in the VNOP where the destination didn't
6954                  * exist when we did the namei, but it does by the time we go and
6955                  * try to create the entry. In this case, we should re-drive this rename
6956                  * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
6957                  * but other filesystems susceptible to this race could return it, too.
6958                  */
6959                 if (error == ERECYCLE) {
6960                         do_retry = 1;
6961                 }
6962
6963                 /*
6964                  * For compound VNOPs, the authorization callback may return
6965                  * ENOENT in case of racing hardlink lookups hitting the name
6966                  * cache, redrive the lookup.
6967                  */
6968                 if (batched && error == ENOENT) {
6969                         assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
6970                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
6971                                 do_retry = 1;
6972                                 retry_count += 1;
6973                         }
6974                 }
6975
6976                 goto out1;
6977         }
6978
6979         /* call out to allow 3rd party notification of rename.
6980          * Ignore result of kauth_authorize_fileop call.
6981          */
6982         kauth_authorize_fileop(vfs_context_ucred(ctx),
6983                         KAUTH_FILEOP_RENAME,
6984                         (uintptr_t)from_name, (uintptr_t)to_name);
6985
6986 #if CONFIG_FSE
6987         if (from_name != NULL && to_name != NULL) {
6988                 if (from_truncated || to_truncated) {
6989                         // set it here since only the from_finfo gets reported up to user space
6990                         from_finfo.mode |= FSE_TRUNCATED_PATH;
6991                 }
6992
6993                 if (tvap && tvp) {
6994                         vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
6995                 }
6996                 if (fvap) {
6997                         vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
6998                 }
6999
7000                 if (tvp) {
7001                         add_fsevent(FSE_RENAME, ctx,
7002                                     FSE_ARG_STRING, from_len, from_name,
7003                                     FSE_ARG_FINFO, &from_finfo,
7004                                     FSE_ARG_STRING, to_len, to_name,
7005                                     FSE_ARG_FINFO, &to_finfo,
7006                                     FSE_ARG_DONE);
7007                 } else {
7008                         add_fsevent(FSE_RENAME, ctx,
7009                                     FSE_ARG_STRING, from_len, from_name,
7010                                     FSE_ARG_FINFO, &from_finfo,
7011                                     FSE_ARG_STRING, to_len, to_name,
7012                                     FSE_ARG_DONE);
7013                 }
7014         }
7015 #endif /* CONFIG_FSE */
7016
7017         /*
7018          * update filesystem's mount point data
7019          */
7020         if (mntrename) {
7021                 char *cp, *pathend, *mpname;
7022                 char * tobuf;
7023                 struct mount *mp;
7024                 int maxlen;
7025                 size_t len = 0;
7026
7027                 mp = fvp->v_mountedhere;
7028
7029                 if (vfs_busy(mp, LK_NOWAIT)) {
7030                         error = EBUSY;
7031                         goto out1;
7032                 }
7033                 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
7034
7035                 if (UIO_SEG_IS_USER_SPACE(segflg))
7036                         error = copyinstr(to, tobuf, MAXPATHLEN, &len);
7037                 else
7038                         error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
7039                 if (!error) {
7040                         /* find current mount point prefix */
7041                         pathend = &mp->mnt_vfsstat.f_mntonname[0];
7042                         for (cp = pathend; *cp != '\0'; ++cp) {
7043                                 if (*cp == '/')
7044                                         pathend = cp + 1;
7045                         }
7046                         /* find last component of target name */
7047                         for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
7048                                 if (*cp == '/')
7049                                         mpname = cp + 1;
7050                         }
7051                         /* append name to prefix */
7052                         maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
7053                         bzero(pathend, maxlen);
7054                         strlcpy(pathend, mpname, maxlen);
7055                 }
7056                 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
7057
7058                 vfs_unbusy(mp);
7059         }
7060         /*
7061          * fix up name & parent pointers.  note that we first
7062          * check that fvp has the same name/parent pointers it
7063          * had before the rename call... this is a 'weak' check
7064          * at best...
7065          *
7066          * XXX oparent and oname may not be set in the compound vnop case
7067          */
7068         if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
7069                 int update_flags;
7070
7071                 update_flags = VNODE_UPDATE_NAME;
7072
7073                 if (fdvp != tdvp)
7074                         update_flags |= VNODE_UPDATE_PARENT;
7075
7076                 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
7077         }
7078 out1:
7079         if (to_name != NULL) {
7080                 RELEASE_PATH(to_name);
7081                 to_name = NULL;
7082         }
7083         if (from_name != NULL) {
7084                 RELEASE_PATH(from_name);
7085                 from_name = NULL;
7086         }
7087         if (holding_mntlock) {
7088                 mount_unlock_renames(locked_mp);
7089                 mount_drop(locked_mp, 0);
7090                 holding_mntlock = 0;
7091         }
7092         if (tdvp) {
7093                 /*
7094                  * nameidone has to happen before we vnode_put(tdvp)
7095                  * since it may need to release the fs_nodelock on the tdvp
7096                  */
7097                 nameidone(tond);
7098
7099                 if (tvp)
7100                         vnode_put(tvp);
7101                 vnode_put(tdvp);
7102         }
7103         if (fdvp) {
7104                 /*
7105                  * nameidone has to happen before we vnode_put(fdvp)
7106                  * since it may need to release the fs_nodelock on the fdvp
7107                  */
7108                 nameidone(fromnd);
7109
7110                 if (fvp)
7111                         vnode_put(fvp);
7112                 vnode_put(fdvp);
7113         }
7114
7115         /*
7116          * If things changed after we did the namei, then we will re-drive
7117          * this rename call from the top.
7118          */
7119         if (do_retry) {
7120                 do_retry = 0;
7121                 goto retry;
7122         }
7123
7124         FREE(__rename_data, M_TEMP);
7125         return (error);
7126 }
7127
7128 int
7129 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7130 {
7131         return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7132             AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7133 }
7134
#if CONFIG_SECLUDED_RENAME
/*
 * rename_ext: like rename(), but forwards the caller-supplied uap->flags
 * to renameat_internal().  Only built when CONFIG_SECLUDED_RENAME is set.
 */
int
rename_ext(__unused proc_t p, struct rename_ext_args *uap, __unused int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();

	return renameat_internal(ctx, AT_FDCWD, uap->from, AT_FDCWD, uap->to,
	    UIO_USERSPACE, uap->flags);
}
#endif /* CONFIG_SECLUDED_RENAME */
7145
7146 int
7147 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7148 {
7149         return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7150             uap->tofd, uap->to, UIO_USERSPACE, 0));
7151 }
7152
7153 /*
7154  * Make a directory file.
7155  *
7156  * Returns:     0                       Success
7157  *              EEXIST
7158  *      namei:???
7159  *      vnode_authorize:???
7160  *      vn_create:???
7161  */
7162 /* ARGSUSED */
7163 static int
7164 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
7165     enum uio_seg segflg)
7166 {
7167         vnode_t vp, dvp;
7168         int error;
7169         int update_flags = 0;
7170         int batched;
7171         struct nameidata nd;
7172
7173         AUDIT_ARG(mode, vap->va_mode);
7174         NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
7175                path, ctx);
7176         nd.ni_cnd.cn_flags |= WILLBEDIR;
7177         nd.ni_flag = NAMEI_COMPOUNDMKDIR;
7178
7179 continue_lookup:
7180         error = nameiat(&nd, fd);
7181         if (error)
7182                 return (error);
7183         dvp = nd.ni_dvp;
7184         vp = nd.ni_vp;
7185
7186         if (vp != NULL) {
7187                 error = EEXIST;
7188                 goto out;
7189         }
7190
7191         batched = vnode_compound_mkdir_available(dvp);
7192
7193         VATTR_SET(vap, va_type, VDIR);
7194
7195         /*
7196          * XXX
7197          * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
7198          * only get EXISTS or EISDIR for existing path components, and not that it could see
7199          * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
7200          * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
7201          */
7202         if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
7203                 if (error == EACCES || error == EPERM) {
7204                         int error2;
7205
7206                         nameidone(&nd);
7207                         vnode_put(dvp);
7208                         dvp = NULLVP;
7209
7210                         /*
7211                          * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
7212                          * rather than EACCESS if the target exists.
7213                          */
7214                         NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
7215                                         path, ctx);
7216                         error2 = nameiat(&nd, fd);
7217                         if (error2) {
7218                                 goto out;
7219                         } else {
7220                                 vp = nd.ni_vp;
7221                                 error = EEXIST;
7222                                 goto out;
7223                         }
7224                 }
7225
7226                 goto out;
7227         }
7228
7229         /*
7230          * make the directory
7231          */
7232         if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
7233                 if (error == EKEEPLOOKING) {
7234                         nd.ni_vp = vp;
7235                         goto continue_lookup;
7236                 }
7237
7238                 goto out;
7239         }
7240
7241         // Make sure the name & parent pointers are hooked up
7242         if (vp->v_name == NULL)
7243                 update_flags |= VNODE_UPDATE_NAME;
7244         if (vp->v_parent == NULLVP)
7245                 update_flags |= VNODE_UPDATE_PARENT;
7246
7247         if (update_flags)
7248                 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
7249
7250 #if CONFIG_FSE
7251         add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
7252 #endif
7253
7254 out:
7255         /*
7256          * nameidone has to happen before we vnode_put(dvp)
7257          * since it may need to release the fs_nodelock on the dvp
7258          */
7259         nameidone(&nd);
7260
7261         if (vp)
7262                 vnode_put(vp);
7263         if (dvp)
7264                 vnode_put(dvp);
7265
7266         return (error);
7267 }
7268
7269 /*
7270  * mkdir_extended: Create a directory; with extended security (ACL).
7271  *
7272  * Parameters:    p                       Process requesting to create the directory
7273  *                uap                     User argument descriptor (see below)
7274  *                retval                  (ignored)
7275  *
7276  * Indirect:      uap->path               Path of directory to create
7277  *                uap->mode               Access permissions to set
7278  *                uap->xsecurity          ACL to set
7279  *
7280  * Returns:        0                      Success
7281  *                !0                      Not success
7282  *
7283  */
7284 int
7285 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
7286 {
7287         int ciferror;
7288         kauth_filesec_t xsecdst;
7289         struct vnode_attr va;
7290
7291         AUDIT_ARG(owner, uap->uid, uap->gid);
7292
7293         xsecdst = NULL;
7294         if ((uap->xsecurity != USER_ADDR_NULL) &&
7295             ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
7296                 return ciferror;
7297
7298         VATTR_INIT(&va);
7299         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7300         if (xsecdst != NULL)
7301                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7302
7303         ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7304             UIO_USERSPACE);
7305         if (xsecdst != NULL)
7306                 kauth_filesec_free(xsecdst);
7307         return ciferror;
7308 }
7309
7310 int
7311 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
7312 {
7313         struct vnode_attr va;
7314
7315         VATTR_INIT(&va);
7316         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7317
7318         return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7319             UIO_USERSPACE));
7320 }
7321
7322 int
7323 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
7324 {
7325         struct vnode_attr va;
7326
7327         VATTR_INIT(&va);
7328         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7329
7330         return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
7331             UIO_USERSPACE));
7332 }
7333
7334 static int
7335 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
7336     enum uio_seg segflg)
7337 {
7338         vnode_t vp, dvp;
7339         int error;
7340         struct nameidata nd;
7341         char     *path = NULL;
7342         int       len=0;
7343         int has_listeners = 0;
7344         int need_event = 0;
7345         int truncated = 0;
7346 #if CONFIG_FSE
7347         struct vnode_attr va;
7348 #endif /* CONFIG_FSE */
7349         struct vnode_attr *vap = NULL;
7350         int restart_count = 0;
7351         int batched;
7352
7353         int restart_flag;
7354
7355         /*
7356          * This loop exists to restart rmdir in the unlikely case that two
7357          * processes are simultaneously trying to remove the same directory
7358          * containing orphaned appleDouble files.
7359          */
7360         do {
7361                 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
7362                     segflg, dirpath, ctx);
7363                 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
7364 continue_lookup:
7365                 restart_flag = 0;
7366                 vap = NULL;
7367
7368                 error = nameiat(&nd, fd);
7369                 if (error)
7370                         return (error);
7371
7372                 dvp = nd.ni_dvp;
7373                 vp = nd.ni_vp;
7374
7375                 if (vp) {
7376                         batched = vnode_compound_rmdir_available(vp);
7377
7378                         if (vp->v_flag & VROOT) {
7379                                 /*
7380                                  * The root of a mounted filesystem cannot be deleted.
7381                                  */
7382                                 error = EBUSY;
7383                                 goto out;
7384                         }
7385
7386                         /*
7387                          * Removed a check here; we used to abort if vp's vid
7388                          * was not the same as what we'd seen the last time around.
7389                          * I do not think that check was valid, because if we retry
7390                          * and all dirents are gone, the directory could legitimately
7391                          * be recycled but still be present in a situation where we would
7392                          * have had permission to delete.  Therefore, we won't make
7393                          * an effort to preserve that check now that we may not have a
7394                          * vp here.
7395                          */
7396
7397                         if (!batched) {
7398                                 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
7399                                 if (error) {
7400                                         if (error == ENOENT) {
7401                                                 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7402                                                 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7403                                                         restart_flag = 1;
7404                                                         restart_count += 1;
7405                                                 }
7406                                         }
7407                                         goto out;
7408                                 }
7409                         }
7410                 } else {
7411                         batched = 1;
7412
7413                         if (!vnode_compound_rmdir_available(dvp)) {
7414                                 panic("No error, but no compound rmdir?");
7415                         }
7416                 }
7417
7418 #if CONFIG_FSE
7419                 fse_info  finfo;
7420
7421                 need_event = need_fsevent(FSE_DELETE, dvp);
7422                 if (need_event) {
7423                         if (!batched) {
7424                                 get_fse_info(vp, &finfo, ctx);
7425                         } else {
7426                                 error = vfs_get_notify_attributes(&va);
7427                                 if (error) {
7428                                         goto out;
7429                                 }
7430
7431                                 vap = &va;
7432                         }
7433                 }
7434 #endif
7435                 has_listeners = kauth_authorize_fileop_has_listeners();
7436                 if (need_event || has_listeners) {
7437                         if (path == NULL) {
7438                                 GET_PATH(path);
7439                                 if (path == NULL) {
7440                                         error = ENOMEM;
7441                                         goto out;
7442                                 }
7443                         }
7444
7445                         len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
7446 #if CONFIG_FSE
7447                         if (truncated) {
7448                                 finfo.mode |= FSE_TRUNCATED_PATH;
7449                         }
7450 #endif
7451                 }
7452
7453                 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
7454                 nd.ni_vp = vp;
7455                 if (vp == NULLVP) {
7456                         /* Couldn't find a vnode */
7457                         goto out;
7458                 }
7459
7460                 if (error == EKEEPLOOKING) {
7461                         goto continue_lookup;
7462                 } else if (batched && error == ENOENT) {
7463                         assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7464                         if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7465                                 /*
7466                                  * For compound VNOPs, the authorization callback
7467                                  * may return ENOENT in case of racing hard link lookups
7468                                  * redrive the lookup.
7469                                  */
7470                                 restart_flag = 1;
7471                                 restart_count += 1;
7472                                 goto out;
7473                         }
7474                 }
7475 #if CONFIG_APPLEDOUBLE
7476                 /*
7477                  * Special case to remove orphaned AppleDouble
7478                  * files. I don't like putting this in the kernel,
7479                  * but carbon does not like putting this in carbon either,
7480                  * so here we are.
7481                  */
7482                 if (error == ENOTEMPTY) {
7483                         error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
7484                         if (error == EBUSY) {
7485                                 goto out;
7486                         }
7487
7488
7489                         /*
7490                          * Assuming everything went well, we will try the RMDIR again
7491                          */
7492                         if (!error)
7493                                 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
7494                 }
7495 #endif /* CONFIG_APPLEDOUBLE */
7496                 /*
7497                  * Call out to allow 3rd party notification of delete.
7498                  * Ignore result of kauth_authorize_fileop call.
7499                  */
7500                 if (!error) {
7501                         if (has_listeners) {
7502                                 kauth_authorize_fileop(vfs_context_ucred(ctx),
7503                                                 KAUTH_FILEOP_DELETE,
7504                                                 (uintptr_t)vp,
7505                                                 (uintptr_t)path);
7506                         }
7507
7508                         if (vp->v_flag & VISHARDLINK) {
7509                                 // see the comment in unlink1() about why we update
7510                                 // the parent of a hard link when it is removed
7511                                 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
7512                         }
7513
7514 #if CONFIG_FSE
7515                         if (need_event) {
7516                                 if (vap) {
7517                                         vnode_get_fse_info_from_vap(vp, &finfo, vap);
7518                                 }
7519                                 add_fsevent(FSE_DELETE, ctx,
7520                                                 FSE_ARG_STRING, len, path,
7521                                                 FSE_ARG_FINFO, &finfo,
7522                                                 FSE_ARG_DONE);
7523                         }
7524 #endif
7525                 }
7526
7527 out:
7528                 if (path != NULL) {
7529                         RELEASE_PATH(path);
7530                         path = NULL;
7531                 }
7532                 /*
7533                  * nameidone has to happen before we vnode_put(dvp)
7534                  * since it may need to release the fs_nodelock on the dvp
7535                  */
7536                 nameidone(&nd);
7537                 vnode_put(dvp);
7538
7539                 if (vp)
7540                         vnode_put(vp);
7541
7542                 if (restart_flag == 0) {
7543                         wakeup_one((caddr_t)vp);
7544                         return (error);
7545                 }
7546                 tsleep(vp, PVFS, "rm AD", 1);
7547
7548         } while (restart_flag != 0);
7549
7550         return (error);
7551
7552 }
7553
7554 /*
7555  * Remove a directory file.
7556  */
7557 /* ARGSUSED */
7558 int
7559 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
7560 {
7561         return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
7562             CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
7563 }
7564
/*
 * Size of a struct direntry holding a name of 'namlen' bytes: the full
 * structure size less (MAXPATHLEN - 1), plus the name length, rounded up
 * to a multiple of 8.
 */
#define DIRENT64_LEN(namlen) \
	(((sizeof(struct direntry) - (MAXPATHLEN-1)) + (namlen) + 7) & ~7)
7568
7569 errno_t
7570 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
7571                 int *numdirent, vfs_context_t ctxp)
7572 {
7573         /* Check if fs natively supports VNODE_READDIR_EXTENDED */
7574         if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
7575                    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0))  {
7576                 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
7577         } else {
7578                 size_t bufsize;
7579                 void * bufptr;
7580                 uio_t auio;
7581                 struct direntry *entry64;
7582                 struct dirent *dep;
7583                 int bytesread;
7584                 int error;
7585
7586                 /*
7587                  * Our kernel buffer needs to be smaller since re-packing
7588                  * will expand each dirent.  The worse case (when the name
7589                  * length is 3) corresponds to a struct direntry size of 32
7590                  * bytes (8-byte aligned) and a struct dirent size of 12 bytes
7591                  * (4-byte aligned).  So having a buffer that is 3/8 the size
7592                  * will prevent us from reading more than we can pack.
7593                  *
7594                  * Since this buffer is wired memory, we will limit the
7595                  * buffer size to a maximum of 32K. We would really like to
7596                  * use 32K in the MIN(), but we use magic number 87371 to
7597                  * prevent uio_resid() * 3 / 8 from overflowing.
7598                  */
7599                 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
7600                 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
7601                 if (bufptr == NULL) {
7602                         return ENOMEM;
7603                 }
7604
7605                 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
7606                 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
7607                 auio->uio_offset = uio->uio_offset;
7608
7609                 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
7610
7611                 dep = (struct dirent *)bufptr;
7612                 bytesread = bufsize - uio_resid(auio);
7613
7614                 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
7615                        M_TEMP, M_WAITOK);
7616                 /*
7617                  * Convert all the entries and copy them out to user's buffer.
7618                  */
7619                 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
7620                         size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
7621
7622                         bzero(entry64, enbufsize);
7623                         /* Convert a dirent to a dirent64. */
7624                         entry64->d_ino = dep->d_ino;
7625                         entry64->d_seekoff = 0;
7626                         entry64->d_reclen = enbufsize;
7627                         entry64->d_namlen = dep->d_namlen;
7628                         entry64->d_type = dep->d_type;
7629                         bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
7630
7631                         /* Move to next entry. */
7632                         dep = (struct dirent *)((char *)dep + dep->d_reclen);
7633
7634                         /* Copy entry64 to user's buffer. */
7635                         error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
7636                 }
7637
7638                 /* Update the real offset using the offset we got from VNOP_READDIR. */
7639                 if (error == 0) {
7640                         uio->uio_offset = auio->uio_offset;
7641                 }
7642                 uio_free(auio);
7643                 FREE(bufptr, M_TEMP);
7644                 FREE(entry64, M_TEMP);
7645                 return (error);
7646         }
7647 }
7648
7649 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
7650
7651 /*
7652  * Read a block of directory entries in a file system independent format.
      *
      * Common back end of getdirentries() and getdirentries64().
      *
      * Parameters:    fd              Descriptor to read from; must be open
      *                                for reading and refer to a directory
      *                bufp            User buffer that receives the entries
      *                bufsize         Size of bufp; silently clamped to
      *                                GETDIRENTRIES_MAXBUFSIZE
      *                bytesread       Out: number of bytes stored in bufp
      *                                (written on success only)
      *                offset          Out, may be NULL: descriptor offset at
      *                                which this read started
      *                flags           VNODE_READDIR_EXTENDED routes the read
      *                                through vnode_readdir64(); otherwise
      *                                VNOP_READDIR() is called with flags 0
      *
      * Returns:       0               Success
      *                EBADF           Descriptor is not open for reading
      *                EINVAL          Vnode is not a directory
      *                !0              Error from fp_getfvp(), the MAC hooks,
      *                                vnode_getwithref(), the readdir call
      *                                or the union_dircheckp hook
      *
      * Notes:         The descriptor offset (fg_offset) is updated from the
      *                uio after the readdir call, whether or not it failed.
      *                When the read transfers nothing, the union hooks get a
      *                chance to switch to another directory vnode and the
      *                read is retried from the unionread label.
      *                The fileproc obtained by fp_getfvp() is released with
      *                file_drop() on every exit after a successful lookup.
7653  */
7654 static int
7655 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
7656                      off_t *offset, int flags)
7657 {
7658         vnode_t vp;
7659         struct vfs_context context = *vfs_context_current();    /* local copy */
7660         struct fileproc *fp;
7661         uio_t auio;
7662         int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
7663         off_t loff;
7664         int error, eofflag, numdirent;
7665         char uio_buf[ UIO_SIZEOF(1) ];
7666
7667         error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
7668         if (error) {
7669                 return (error);
7670         }
7671         if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7672                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7673                 error = EBADF;
7674                 goto out;
7675         }
7676
             /* Bound the transfer no matter how large a buffer the caller offers. */
7677         if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
7678                 bufsize = GETDIRENTRIES_MAXBUFSIZE;
7679
7680 #if CONFIG_MACF
7681         error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
7682         if (error)
7683                 goto out;
7684 #endif
             /* From here on every exit must pair this with vnode_put(). */
7685         if ( (error = vnode_getwithref(vp)) ) {
7686                 goto out;
7687         }
7688         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7689
7690 unionread:
7691         if (vp->v_type != VDIR) {
7692                 (void)vnode_put(vp);
7693                 error = EINVAL;
7694                 goto out;
7695         }
7696
7697 #if CONFIG_MACF
7698         error = mac_vnode_check_readdir(&context, vp);
7699         if (error != 0) {
7700                 (void)vnode_put(vp);
7701                 goto out;
7702         }
7703 #endif /* MAC */
7704
             /* Remember where this read starts; that is what *offset reports. */
7705         loff = fp->f_fglob->fg_offset;
7706         auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
7707         uio_addiov(auio, bufp, bufsize);
7708
7709         if (flags & VNODE_READDIR_EXTENDED) {
7710                 error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
7711                 fp->f_fglob->fg_offset = uio_offset(auio);
7712         } else {
7713                 error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
7714                 fp->f_fglob->fg_offset = uio_offset(auio);
7715         }
7716         if (error) {
7717                 (void)vnode_put(vp);
7718                 goto out;
7719         }
7720
             /* Residual equal to the buffer size means nothing was transferred. */
7721         if ((user_ssize_t)bufsize == uio_resid(auio)){
7722                 if (union_dircheckp) {
7723                         error = union_dircheckp(&vp, fp, &context);
7724                         if (error == -1)
7725                                 goto unionread;
7726                         if (error) {
                                     /*
                                      * Drop the vnode reference before leaving,
                                      * as every other error exit taken after
                                      * vnode_getwithref() does; skipping it
                                      * leaks the reference.
                                      */
                                     (void)vnode_put(vp);
7727                                 goto out;
                             }
7728                 }
7729
7730                 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
7731                         struct vnode *tvp = vp;
7732                         if (lookup_traverse_union(tvp, &vp, &context) == 0) {
                                     /*
                                      * Re-point the descriptor at the vnode
                                      * returned by the traversal, restart at
                                      * offset 0, and release the old vnode.
                                      */
7733                                 vnode_ref(vp);
7734                                 fp->f_fglob->fg_data = (caddr_t) vp;
7735                                 fp->f_fglob->fg_offset = 0;
7736                                 vnode_rele(tvp);
7737                                 vnode_put(tvp);
7738                                 goto unionread;
7739                         }
                             /* Traversal failed: keep using the original vnode. */
7740                         vp = tvp;
7741                 }
7742         }
7743
7744         vnode_put(vp);
7745         if (offset) {
7746                 *offset = loff;
7747         }
7748
7749         *bytesread = bufsize - uio_resid(auio);
7750 out:
7751         file_drop(fd);
7752         return (error);
7753 }
7754
7755
     /*
      * getdirentries: read directory entries through the common back end
      * and report where the read started.
      *
      * Parameters:    p               Calling process; selects the width
      *                                of the value stored at uap->basep
      *                uap             User argument descriptor (see below)
      *                retval          Out: number of bytes read
      *
      * Indirect:      uap->fd         Directory descriptor
      *                uap->buf        User buffer for the entries
      *                uap->count      Size of uap->buf
      *                uap->basep      User address receiving the starting
      *                                offset (user64_long_t for 64-bit
      *                                processes, user32_long_t otherwise)
      *
      * Returns:       0               Success
      *                !0              Error from getdirentries_common() or
      *                                from copyout()
      */
7756 int
7757 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
7758 {
7759         ssize_t nread;
7760         off_t start_off;
7761         int error;
7762
7763         AUDIT_ARG(fd, uap->fd);
7764         error = getdirentries_common(uap->fd, uap->buf, uap->count, &nread, &start_off, 0);
7765         if (error != 0)
7766                 return (error);

             /* Narrow the offset to the caller's notion of "long" before copying it out. */
7767         if (proc_is64bit(p)) {
7768                 user64_long_t base64 = (user64_long_t)start_off;
7769                 error = copyout((caddr_t)&base64, uap->basep, sizeof(base64));
7770         } else {
7771                 user32_long_t base32 = (user32_long_t)start_off;
7772                 error = copyout((caddr_t)&base32, uap->basep, sizeof(base32));
7773         }
             /* The byte count is stored even when the copyout above failed. */
7774         *retval = nread;
7776         return (error);
7777 }
7778
     /*
      * getdirentries64: extended-format variant of getdirentries.
      *
      * Parameters:    p               Calling process (not consulted here)
      *                uap             User argument descriptor (see below)
      *                retval          Out: number of bytes read
      *
      * Indirect:      uap->fd         Directory descriptor
      *                uap->buf        User buffer for the entries
      *                uap->bufsize    Size of uap->buf
      *                uap->position   User address receiving the starting
      *                                offset as an off_t
      *
      * Returns:       0               Success
      *                !0              Error from getdirentries_common() or
      *                                from copyout()
      */
7779 int
7780 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
7781 {
7782         ssize_t nread;
7783         off_t start_off;
7784         int error;
7785
7786         AUDIT_ARG(fd, uap->fd);
7787         error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &nread, &start_off,
                 VNODE_READDIR_EXTENDED);
7788         if (error != 0)
7789                 return (error);

             /* Publish the byte count first, then report the starting offset. */
7790         *retval = nread;
7791         return (copyout((caddr_t)&start_off, uap->position, sizeof(start_off)));
7794 }
7795
7796
7797 /*
7798  * umask1: install a new file creation mode mask for a process.
7799  * XXX implement xsecurity
      *
      * Parameters:    p               Process whose mask is changed
      *                newmask         Requested mask; only the ALLPERMS
      *                                bits are kept
      *                fsec            Extended security information;
      *                                currently unused
      *                retval          Out: the mask that was in effect
      *                                before this call
      *
      * Returns:       0               Always
      *
      * Notes:         The old mask is read and the new one stored while
      *                the process fd lock is held.
7800  */
7801 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
7802 static int
7803 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
7804 {
7805         struct filedesc *fd_table;
7806
7807         AUDIT_ARG(mask, newmask);

7808         proc_fdlock(p);
7809         fd_table = p->p_fd;
7810         *retval = fd_table->fd_cmask;
7811         fd_table->fd_cmask = newmask & ALLPERMS;
7812         proc_fdunlock(p);

7813         return (0);
7814 }
7815
7816 /*
7817  * umask_extended: umask system call variant that also accepts extended
      * security information (ACL).
7818  *
7819  * Parameters:    p                       Process whose umask is being set
7820  *                uap                     User argument descriptor (see below)
7821  *                retval                  Out: previous umask of process p
7822  *
7823  * Indirect:      uap->newmask            New umask
7824  *                uap->xsecurity          User address of the ACL, or
      *                                        USER_ADDR_NULL for none
7825  *
7826  * Returns:        0                      Success
7827  *                !0                      Error from kauth_copyinfilesec()
      *                                        or from umask1()
7828  *
      * Notes:         A copied-in filesec is handed to umask1() and freed
      *                again before returning.
7829  */
7830 int
7831 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
7832 {
7833         kauth_filesec_t fsec = KAUTH_FILESEC_NONE;
7834         int error;
7835
             /* Bring the caller's filesec into the kernel only when one was supplied. */
7837         if (uap->xsecurity != USER_ADDR_NULL) {
7838                 error = kauth_copyinfilesec(uap->xsecurity, &fsec);
                     if (error != 0)
7839                         return error;
7842         }
7843
7844         error = umask1(p, uap->newmask, fsec, retval);
7845
7846         if (fsec != KAUTH_FILESEC_NONE)
7847                 kauth_filesec_free(fsec);
7848         return error;
7849 }
7850
     /*
      * umask: plain umask system call.
      *
      * Passes uap->newmask to umask1() together with UMASK_NOXSECURITY and
      * returns umask1()'s result; the previous mask comes back in *retval.
      */
7851 int
7852 umask(proc_t p, struct umask_args *uap, int32_t *retval)
7853 {
             int error;

7854         error = umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
             return (error);
7855 }
7856
7857 /*
7858  * revoke: void all references to a file by ripping the underlying
7859  * filesystem away from its vnode.
      *
      * Parameters:    p               Calling process; its p_acflag is
      *                                passed to suser()
      *                uap             User argument descriptor
      *                retval          Unused
      *
      * Indirect:      uap->path       User-space path of the node
      *
      * Returns:       0               Success
      *                ENOTSUP         Node is neither a character nor a
      *                                block device
      *                EBUSY           Block device that is mounted on
      *                !0              Error from namei(), the MAC hook,
      *                                vnode_getattr() or suser()
      *
      * Notes:         The caller must either own the node (credential uid
      *                equals va_uid) or pass the suser() check.
      *                The return value of VNOP_REVOKE() is not propagated.
7860  */
7861 /* ARGSUSED */
7862 int
7863 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
7864 {
7867         vfs_context_t ctx = vfs_context_current();
7869         struct nameidata lookup;
7866         struct vnode_attr attrs;
7865         vnode_t vp;
7868         int error;
7870
7871         NDINIT(&lookup, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
7872                uap->path, ctx);
7873         if ((error = namei(&lookup)) != 0)
7875                 return (error);
7876         vp = lookup.ni_vp;
7878         nameidone(&lookup);
7879
             /* Only device nodes can be revoked. */
7880         if (!vnode_ischr(vp) && !vnode_isblk(vp)) {
7881                 error = ENOTSUP;
7882                 goto out;
7883         }
7884
7885         if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
7886                 error = EBUSY;
7887                 goto out;
7888         }
7889
7890 #if CONFIG_MACF
7891         error = mac_vnode_check_revoke(ctx, vp);
7892         if (error)
7893                 goto out;
7894 #endif
7895
             /* Fetch the owner so it can be compared with the caller's uid. */
7896         VATTR_INIT(&attrs);
7897         VATTR_WANTED(&attrs, va_uid);
7898         error = vnode_getattr(vp, &attrs, ctx);
             if (error)
7899                 goto out;

             /* Non-owners need superuser privilege. */
7900         if (kauth_cred_getuid(vfs_context_ucred(ctx)) != attrs.va_uid) {
7901                 error = suser(vfs_context_ucred(ctx), &p->p_acflag);
                     if (error)
7902                         goto out;
             }

7903         if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
7904                 VNOP_REVOKE(vp, REVOKEALL, ctx);
7905 out:
7906         vnode_put(vp);
7907         return (error);
7908 }
7909
7910
7911 /*
7912  *  HFS/HFS Plus SPECIFIC SYSTEM CALLS
7913  *  The following system calls are designed to support features
7914  *  which are specific to the HFS & HFS Plus volume formats
7915  */
7916
7917
7918 /*
7919  * Obtain attribute information on objects in a directory while enumerating
7920  * the directory.
      *
      * Parameters:    p               Calling process
      *                uap             User argument descriptor (see below)
      *                retval          Out: the eofflag reported by the
      *                                file system (cleared when a union
      *                                mount still has a lower layer to read)
      *
      * Indirect:      uap->fd         Directory descriptor, open for reading
      *                uap->alist      User address of the struct attrlist
      *                                describing the wanted attributes
      *                uap->buffer     User buffer receiving the results
      *                uap->buffersize Size of uap->buffer
      *                uap->count      In: entry count passed to
      *                                VNOP_READDIRATTR(); out: count it
      *                                returned
      *                uap->basep      Out: descriptor offset at which the
      *                                read started, as a 32-bit value
      *                uap->newstate   Out: state value returned by
      *                                VNOP_READDIRATTR()
      *                uap->options    Only the low 32 bits are passed on
      *
      * Returns:       0               Success
      *                EBADF           Descriptor is not open for reading
      *                EINVAL          Vnode is not a directory
      *                !0              Error from copyin()/copyout(),
      *                                fp_getfvp(), the MAC hooks,
      *                                vnode_getwithref(), vnode_authorize()
      *                                or VNOP_READDIRATTR()
7921  */
7922 /* ARGSUSED */
7923 int
7924 getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
7925 {
7926         vnode_t vp;
7927         struct fileproc *fp;
7928         uio_t auio = NULL;
7929         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
             /*
              * newstate and eofflag are filled in by the file system and then
              * handed back to user space.  Start them at zero so a file system
              * that returns success without storing them cannot make us copy
              * out uninitialized stack contents.
              */
7930         uint32_t count = 0, savecount = 0;
7931         uint32_t newstate = 0;
7932         int error, eofflag = 0;
7933         uint32_t loff = 0;
7934         struct attrlist attributelist;
7935         vfs_context_t ctx = vfs_context_current();
7936         int fd = uap->fd;
7937         char uio_buf[ UIO_SIZEOF(1) ];
7938         kauth_action_t action;
7939
7940         AUDIT_ARG(fd, fd);
7941
7942         /* Get the attributes into kernel space */
7943         if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
7944                 return(error);
7945         }
7946         if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
7947                 return(error);
7948         }
             /* Kept so the request can be restarted on a lower union layer. */
7949         savecount = count;
7950         if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
7951                 return (error);
7952         }
7953         if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7954                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7955                 error = EBADF;
7956                 goto out;
7957         }
7958
7959
7960 #if CONFIG_MACF
7961         error = mac_file_check_change_offset(vfs_context_ucred(ctx),
7962             fp->f_fglob);
7963         if (error)
7964                 goto out;
7965 #endif
7966
7967
             /* From here on every exit must pair this with vnode_put(). */
7968         if ( (error = vnode_getwithref(vp)) )
7969                 goto out;
7970
7971         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7972
7973 unionread:
7974         if (vp->v_type != VDIR) {
7975                 (void)vnode_put(vp);
7976                 error = EINVAL;
7977                 goto out;
7978         }
7979
7980 #if CONFIG_MACF
7981         error = mac_vnode_check_readdir(ctx, vp);
7982         if (error != 0) {
7983                 (void)vnode_put(vp);
7984                 goto out;
7985         }
7986 #endif /* MAC */
7987
7988         /* set up the uio structure which will contain the users return buffer */
             /* NOTE: fg_offset is narrowed to 32 bits here; uap->basep receives a 32-bit value. */
7989         loff = fp->f_fglob->fg_offset;
7990         auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
7991         uio_addiov(auio, uap->buffer, uap->buffersize);
7992
7993         /*
7994          * If the only item requested is file names, we can let that past with
7995          * just LIST_DIRECTORY.  If they want any other attributes, that means
7996          * they need SEARCH as well.
7997          */
7998         action = KAUTH_VNODE_LIST_DIRECTORY;
7999         if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
8000             attributelist.fileattr || attributelist.dirattr)
8001                 action |= KAUTH_VNODE_SEARCH;
8002
8003         if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
8004
8005                 /* Believe it or not, uap->options only has 32-bits of valid
8006                  * info, so truncate before extending again */
8007
8008                 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
8009                                 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
8010         }
8011
8012         if (error) {
8013                 (void) vnode_put(vp);
8014                 goto out;
8015         }
8016
8017         /*
8018          * If we've got the last entry of a directory in a union mount
8019          * then reset the eofflag and pretend there's still more to come.
8020          * The next call will again set eofflag and the buffer will be empty,
8021          * so traverse to the underlying directory and do the directory
8022          * read there.
8023          */
8024         if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
8025                 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
8026                         eofflag = 0;
8027                 } else {                                                // Empty buffer
8028                         struct vnode *tvp = vp;
8029                         if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
8030                                 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
8031                                 fp->f_fglob->fg_data = (caddr_t) vp;
8032                                 fp->f_fglob->fg_offset = 0; // reset index for new dir
8033                                 count = savecount;
8034                                 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
8035                                 vnode_put(tvp);
8036                                 goto unionread;
8037                         }
                             /* Traversal failed: keep using the original vnode. */
8038                         vp = tvp;
8039                 }
8040         }
8041
8042         (void)vnode_put(vp);
8043
             /*
              * error is known to be 0 here (the failure case left through the
              * check above), so no second test is needed before committing.
              */
8046         fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
8047
8048         if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
8049                 goto out;
8050         if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
8051                 goto out;
8052         if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
8053                 goto out;
8054
8055         *retval = eofflag;  /* similar to getdirentries */
8056         error = 0;
8057 out:
8058         file_drop(fd);
8059         return (error); /* return error earlier, an retval of 0 or 1 now */
8060
8061 } /* end of getdirentriesattr system call */
8062
8063 /*
8064 * Exchange data between two files
      *
      * Parameters:    p               Unused
      *                uap             User argument descriptor (see below)
      *                retval          Unused
      *
      * Indirect:      uap->path1      User-space path of the first file
      *                uap->path2      User-space path of the second file
      *                uap->options    FSOPT_NOFOLLOW suppresses following
      *                                symbolic links during both lookups
      *
      * Returns:       0               Success
      *                EINVAL          Both paths name the same vnode, or
      *                                either one is not a regular file
      *                EXDEV           The files are on different mounts
      *                ENOMEM          A path buffer could not be obtained
      *                !0              Error from namei(), the MAC hook,
      *                                vnode_authorize() or VNOP_EXCHANGE()
      *
      * Notes:         Both vnodes must pass a READ_DATA | WRITE_DATA
      *                authorization check.  After a successful exchange
      *                the v_name and v_parent fields of the two vnodes
      *                are swapped under the name cache lock.
8065 */
8066
8067 /* ARGSUSED */
8068 int
8069 exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
8070 {
8071
8072         struct nameidata fnd, snd;
8073         vfs_context_t ctx = vfs_context_current();
8074         vnode_t fvp;
8075         vnode_t svp;
8076         int error;
8077         u_int32_t nameiflags;
             /* Path buffers; stay NULL unless a listener needs them (see below). */
8078         char *fpath = NULL;
8079         char *spath = NULL;
8080         int   flen=0, slen=0;
8081         int from_truncated=0, to_truncated=0;
8082 #if CONFIG_FSE
8083         fse_info f_finfo, s_finfo;
8084 #endif
8085
             /* Follow symbolic links unless the caller asked not to. */
8086         nameiflags = 0;
8087         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8088
8089         NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
8090                UIO_USERSPACE, uap->path1, ctx);
8091
8092         error = namei(&fnd);
8093         if (error)
8094                 goto out2;
8095
8096         nameidone(&fnd);
             /* fvp is released with vnode_put() on every exit below. */
8097         fvp = fnd.ni_vp;
8098
8099         NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
8100                UIO_USERSPACE, uap->path2, ctx);
8101
8102         error = namei(&snd);
8103         if (error) {
                     /* Second lookup failed: only the first vnode needs releasing. */
8104                 vnode_put(fvp);
8105                 goto out2;
8106         }
8107         nameidone(&snd);
8108         svp = snd.ni_vp;
8109
8110         /*
8111          * if the files are the same, return an inval error
8112          */
8113         if (svp == fvp) {
8114                 error = EINVAL;
8115                 goto out;
8116         }
8117
8118         /*
8119          * if the files are on different volumes, return an error
8120          */
8121         if (svp->v_mount != fvp->v_mount) {
8122                 error = EXDEV;
8123                 goto out;
8124         }
8125
8126         /* If they're not files, return an error */
8127         if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
8128                 error = EINVAL;
8129                 goto out;
8130         }
8131
8132 #if CONFIG_MACF
8133         error = mac_vnode_check_exchangedata(ctx,
8134             fvp, svp);
8135         if (error)
8136                 goto out;
8137 #endif
             /* The caller needs both read and write access to the data of each file. */
8138         if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
8139             ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
8140                 goto out;
8141
             /*
              * Gather the two paths only when something will consume them:
              * an FSE_EXCHANGE event (CONFIG_FSE builds) or a kauth fileop
              * listener.
              */
8142         if (
8143 #if CONFIG_FSE
8144         need_fsevent(FSE_EXCHANGE, fvp) ||
8145 #endif
8146         kauth_authorize_fileop_has_listeners()) {
8147                 GET_PATH(fpath);
8148                 GET_PATH(spath);
8149                 if (fpath == NULL || spath == NULL) {
                             /* Whichever buffer was obtained is released at out. */
8150                         error = ENOMEM;
8151                         goto out;
8152                 }
8153
8154                 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
8155                 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
8156
8157 #if CONFIG_FSE
8158                 get_fse_info(fvp, &f_finfo, ctx);
8159                 get_fse_info(svp, &s_finfo, ctx);
8160                 if (from_truncated || to_truncated) {
8161                         // set it here since only the f_finfo gets reported up to user space
8162                         f_finfo.mode |= FSE_TRUNCATED_PATH;
8163                 }
8164 #endif
8165         }
8166         /* Ok, make the call */
8167         error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
8168
8169         if (error == 0) {
8170             const char *tmpname;
8171
8172             if (fpath != NULL && spath != NULL) {
8173                     /* call out to allow 3rd party notification of exchangedata.
8174                      * Ignore result of kauth_authorize_fileop call.
8175                      */
8176                     kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
8177                                            (uintptr_t)fpath, (uintptr_t)spath);
8178             }
                 /* Swap the v_name and v_parent fields of the two vnodes under the name cache lock. */
8179             name_cache_lock();
8180
8181             tmpname     = fvp->v_name;
8182             fvp->v_name = svp->v_name;
8183             svp->v_name = tmpname;
8184
8185             if (fvp->v_parent != svp->v_parent) {
8186                 vnode_t tmp;
8187
8188                 tmp           = fvp->v_parent;
8189                 fvp->v_parent = svp->v_parent;
8190                 svp->v_parent = tmp;
8191             }
8192             name_cache_unlock();
8193
8194 #if CONFIG_FSE
                 /* f_finfo/s_finfo were filled in exactly when both paths were gathered. */
8195             if (fpath != NULL && spath != NULL) {
8196                     add_fsevent(FSE_EXCHANGE, ctx,
8197                                 FSE_ARG_STRING, flen, fpath,
8198                                 FSE_ARG_FINFO, &f_finfo,
8199                                 FSE_ARG_STRING, slen, spath,
8200                                 FSE_ARG_FINFO, &s_finfo,
8201                                 FSE_ARG_DONE);
8202             }
8203 #endif
8204         }
8205
             /* out: both vnodes are held; out2: nothing is left to release. */
8206 out:
8207         if (fpath != NULL)
8208                 RELEASE_PATH(fpath);
8209         if (spath != NULL)
8210                 RELEASE_PATH(spath);
8211         vnode_put(svp);
8212         vnode_put(fvp);
8213 out2:
8214         return (error);
8215 }
8216
8217 /*
8218  * freespace_mb: free space, in megabytes (units of 2^20 bytes), available
      * on the volume that holds the given vnode.
      *
      * The mount's vfsstat is refreshed first; f_bavail is then multiplied by
      * f_bsize in 64-bit arithmetic and the byte count shifted down to
      * megabytes.  The result is narrowed to uint32_t on return.
8219  */
8220 uint32_t freespace_mb(vnode_t vp);
8221
8222 uint32_t
8223 freespace_mb(vnode_t vp)
8224 {
             uint64_t free_bytes;

8225         vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
8226         free_bytes = (uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
8227             vp->v_mount->mnt_vfsstat.f_bsize;
             return (free_bytes >> 20);
8228 }
8229
8230 #if CONFIG_SEARCHFS
8231
8232 /* ARGSUSED */
8233
8234 int
8235 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
8236 {
8237         vnode_t vp, tvp;
8238         int i, error=0;
8239         int fserror = 0;
8240         struct nameidata nd;
8241         struct user64_fssearchblock searchblock;
8242         struct searchstate *state;
8243         struct attrlist *returnattrs;
8244         struct timeval timelimit;
8245         void *searchparams1,*searchparams2;
8246         uio_t auio = NULL;
8247         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8248         uint32_t nummatches;
8249         int mallocsize;
8250         uint32_t nameiflags;
8251         vfs_context_t ctx = vfs_context_current();
8252         char uio_buf[ UIO_SIZEOF(1) ];
8253
8254         /* Start by copying in fsearchblock parameter list */
8255     if (IS_64BIT_PROCESS(p)) {
8256         error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
8257         timelimit.tv_sec = searchblock.timelimit.tv_sec;
8258         timelimit.tv_usec = searchblock.timelimit.tv_usec;
8259     }
8260     else {
8261         struct user32_fssearchblock tmp_searchblock;
8262
8263         error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
8264         // munge into 64-bit version
8265         searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
8266         searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
8267         searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
8268         searchblock.maxmatches = tmp_searchblock.maxmatches;
8269                 /*
8270                  * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
8271                  * from a 32 bit long, and tv_usec is already a signed 32 bit int.
8272                  */
8273         timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
8274         timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
8275         searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
8276         searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
8277         searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
8278         searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
8279         searchblock.searchattrs = tmp_searchblock.searchattrs;
8280     }
8281         if (error)
8282                 return(error);
8283
8284         /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
8285          */
8286         if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
8287                 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
8288                 return(EINVAL);
8289
8290         /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
8291         /* It all has to do into local memory and it's not that big so we might as well  put it all together. */
8292         /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
8293         /* block.                                                                                             */
8294         /*                                                                                                    */
8295         /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
8296         /*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
8297         /*       assumes the size is still 556 bytes it will continue to work                                 */
8298
8299         mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
8300                 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
8301
8302         MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
8303
8304         /* Now set up the various pointers to the correct place in our newly allocated memory */
8305
8306         searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
8307         returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
8308         state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
8309
8310         /* Now copy in the stuff given our local variables. */
8311
8312         if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
8313                 goto freeandexit;
8314
8315         if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
8316                 goto freeandexit;
8317
8318         if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
8319                 goto freeandexit;
8320
8321         if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
8322                 goto freeandexit;
8323
8324         /*
8325          * When searching a union mount, need to set the
8326          * start flag at the first call on each layer to
8327          * reset state for the new volume.
8328          */
8329         if (uap->options & SRCHFS_START)
8330                 state->ss_union_layer = 0;
8331         else
8332                 uap->options |= state->ss_union_flags;
8333         state->ss_union_flags = 0;
8334
8335         /*
8336          * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
8337          * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
8338          * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
8339          * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
8340          * validate the user-supplied data offset of the attrreference_t, we'll do it here.
8341          */
8342
8343         if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
8344                 attrreference_t* string_ref;
8345                 u_int32_t* start_length;
8346                 user64_size_t param_length;
8347
8348                 /* validate searchparams1 */
8349                 param_length = searchblock.sizeofsearchparams1;
8350                 /* skip the word that specifies length of the buffer */
8351                 start_length= (u_int32_t*) searchparams1;
8352                 start_length= start_length+1;
8353                 string_ref= (attrreference_t*) start_length;
8354
8355                 /* ensure no negative offsets or too big offsets */
8356                 if (string_ref->attr_dataoffset < 0 ) {
8357                         error = EINVAL;
8358                         goto freeandexit;
8359                 }
8360                 if (string_ref->attr_length > MAXPATHLEN) {
8361                         error = EINVAL;
8362                         goto freeandexit;
8363                 }
8364
8365                 /* Check for pointer overflow in the string ref */
8366                 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
8367                         error = EINVAL;
8368                         goto freeandexit;
8369                 }
8370
8371                 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
8372                         error = EINVAL;
8373                         goto freeandexit;
8374                 }
8375                 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
8376                         error = EINVAL;
8377                         goto freeandexit;
8378                 }
8379         }
8380
8381         /* set up the uio structure which will contain the users return buffer */
8382         auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8383         uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
8384
8385         nameiflags = 0;
8386         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8387         NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
8388                UIO_USERSPACE, uap->path, ctx);
8389
8390         error = namei(&nd);
8391         if (error)
8392                 goto freeandexit;
8393         vp = nd.ni_vp;
8394         nameidone(&nd);
8395
8396         /*
8397          * Switch to the root vnode for the volume
8398          */
8399         error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
8400         vnode_put(vp);
8401         if (error)
8402                 goto freeandexit;
8403         vp = tvp;
8404
8405         /*
8406          * If it's a union mount, the path lookup takes
8407          * us to the top layer. But we may need to descend
8408          * to a lower layer. For non-union mounts the layer
8409          * is always zero.
8410          */
8411         for (i = 0; i < (int) state->ss_union_layer; i++) {
8412                 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
8413                         break;
8414                 tvp = vp;
8415                 vp = vp->v_mount->mnt_vnodecovered;
8416                 if (vp == NULL) {
8417                         vnode_put(tvp);
8418                         error = ENOENT;
8419                         goto freeandexit;
8420                 }
8421                 vnode_getwithref(vp);
8422                 vnode_put(tvp);
8423         }
8424
8425 #if CONFIG_MACF
8426         error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
8427         if (error) {
8428                 vnode_put(vp);
8429                 goto freeandexit;
8430         }
8431 #endif
8432
8433
8434         /*
8435          * If searchblock.maxmatches == 0, then skip the search. This has happened
8436          * before and sometimes the underlying code doesn't deal with it well.
8437          */
8438          if (searchblock.maxmatches == 0) {
8439                 nummatches = 0;
8440                 goto saveandexit;
8441          }
8442
8443         /*
8444          * All right, we have everything we need, so let's make that call.
8445          *
8446          * We keep special track of the return value from the file system:
8447          * EAGAIN is an acceptable error condition that shouldn't keep us
8448          * from copying out any results...
8449          */
8450
8451         fserror = VNOP_SEARCHFS(vp,
8452                 searchparams1,
8453                 searchparams2,
8454                 &searchblock.searchattrs,
8455                 (u_long)searchblock.maxmatches,
8456                 &timelimit,
8457                 returnattrs,
8458                 &nummatches,
8459                 (u_long)uap->scriptcode,
8460                 (u_long)uap->options,
8461                 auio,
8462                 (struct searchstate *) &state->ss_fsstate,
8463                 ctx);
8464
8465         /*
8466          * If it's a union mount we need to be called again
8467          * to search the mounted-on filesystem.
8468          */
8469         if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
8470                 state->ss_union_flags = SRCHFS_START;
8471                 state->ss_union_layer++;        // search next layer down
8472                 fserror = EAGAIN;
8473         }
8474
8475 saveandexit:
8476
8477         vnode_put(vp);
8478
8479         /* Now copy out the stuff that needs copying out. That means the number of matches and the
8480            search state.  Everything was already put into the return buffer by the vop call. */
8481
8482         if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
8483                 goto freeandexit;
8484
8485         if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
8486                 goto freeandexit;
8487
8488         error = fserror;
8489
8490 freeandexit:
8491
8492         FREE(searchparams1,M_TEMP);
8493
8494         return(error);
8495
8496
8497 } /* end of searchfs system call */
8498
8499 #else /* CONFIG_SEARCHFS */
8500
8501 int
8502 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
8503 {
8504         return (ENOTSUP);
8505 }
8506
8507 #endif /* CONFIG_SEARCHFS */
8508
8509
8510 lck_grp_attr_t *  nspace_group_attr;
8511 lck_attr_t *      nspace_lock_attr;
8512 lck_grp_t *       nspace_mutex_group;
8513
8514 lck_mtx_t         nspace_handler_lock;
8515 lck_mtx_t         nspace_handler_exclusion_lock;
8516
8517 time_t snapshot_timestamp=0;
8518 int nspace_allow_virtual_devs=0;
8519
8520 void nspace_handler_init(void);
8521
/*
 * One pending namespace/snapshot event.  A slot whose flags field is zero
 * is free for re-use.
 */
typedef struct nspace_item_info {
	struct vnode *vp;       /* vnode the event refers to; &vp is also the sleep/wakeup channel for waiters */
	void *arg;              /* optional uio forwarded to the handler, else NULL */
	uint64_t op;            /* operation that triggered the event */
	uint32_t vid;           /* vnode_vid(vp) recorded when the item was queued */
	uint32_t flags;         /* NSPACE_ITEM_* state and event-type bits */
	uint32_t token;         /* id assigned from nspace_token_id when a handler picks the item up */
	uint32_t refcount;      /* number of threads waiting on this item */
} nspace_item_info;

#define MAX_NSPACE_ITEMS   128
nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
/* also used as the sleep/wakeup rendezvous address */
uint32_t nspace_item_idx = 0;
/* last token handed out; its address is the channel slept on while waiting for a free slot */
uint32_t nspace_token_id = 0;
/* how long a waiter sleeps before giving up, in seconds */
uint32_t nspace_handler_timeout = 15;
8537
/* State bits kept in nspace_item_info.flags */
#define NSPACE_ITEM_NEW            0x0001  /* queued, not yet picked up by a handler */
#define NSPACE_ITEM_PROCESSING     0x0002  /* a handler has been handed this item */
#define NSPACE_ITEM_DEAD           0x0004
#define NSPACE_ITEM_CANCELLED      0x0008  /* waiter returns the item's token as its error */
#define NSPACE_ITEM_DONE           0x0010  /* finished; waiter returns 0 */
#define NSPACE_ITEM_RESET_TIMER    0x0020  /* waiter restarts its timeout instead of failing */

/* Event-type bits: which kind of handler the item is meant for */
#define NSPACE_ITEM_NSPACE_EVENT   0x0040
#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080

#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
8549
8550 //#pragma optimization_level 0
8551
/* Kinds of namespace handler; the values index nspace_handlers[]. */
typedef enum {
	NSPACE_HANDLER_NSPACE = 0,
	NSPACE_HANDLER_SNAPSHOT = 1,

	NSPACE_HANDLER_COUNT,   /* number of handler kinds, not a real kind */
} nspace_type_t;

/* Registration record for one handler kind. */
typedef struct {
	uint64_t handler_tid;           /* thread id recorded when the handler registered */
	struct proc *handler_proc;      /* registered handler process, NULL when there is none */
	int handler_busy;               /* non-zero while a thread is waiting for events of this kind */
} nspace_handler_t;

nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
8566
8567 /* namespace fsctl functions */
8568 static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
8569 static int nspace_item_flags_for_type(nspace_type_t nspace_type);
8570 static int nspace_open_flags_for_type(nspace_type_t nspace_type);
8571 static nspace_type_t nspace_type_for_op(uint64_t op);
8572 static int nspace_is_special_process(struct proc *proc);
8573 static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
8574 static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
8575 static int validate_namespace_args (int is64bit, int size);
8576 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
8577
8578
8579 static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
8580 {
8581         switch(nspace_type) {
8582                 case NSPACE_HANDLER_NSPACE:
8583                         return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
8584                 case NSPACE_HANDLER_SNAPSHOT:
8585                         return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
8586                 default:
8587                         printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
8588                         return 0;
8589         }
8590 }
8591
8592 static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
8593 {
8594         switch(nspace_type) {
8595                 case NSPACE_HANDLER_NSPACE:
8596                         return NSPACE_ITEM_NSPACE_EVENT;
8597                 case NSPACE_HANDLER_SNAPSHOT:
8598                         return NSPACE_ITEM_SNAPSHOT_EVENT;
8599                 default:
8600                         printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
8601                         return 0;
8602         }
8603 }
8604
8605 static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
8606 {
8607         switch(nspace_type) {
8608                 case NSPACE_HANDLER_NSPACE:
8609                         return FREAD | FWRITE | O_EVTONLY;
8610                 case NSPACE_HANDLER_SNAPSHOT:
8611                         return FREAD | O_EVTONLY;
8612                 default:
8613                         printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
8614                         return 0;
8615         }
8616 }
8617
8618 static inline nspace_type_t nspace_type_for_op(uint64_t op)
8619 {
8620         switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
8621                 case NAMESPACE_HANDLER_NSPACE_EVENT:
8622                         return NSPACE_HANDLER_NSPACE;
8623                 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
8624                         return NSPACE_HANDLER_SNAPSHOT;
8625                 default:
8626                         printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
8627                         return NSPACE_HANDLER_NSPACE;
8628         }
8629 }
8630
8631 static inline int nspace_is_special_process(struct proc *proc)
8632 {
8633         int i;
8634         for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
8635                 if (proc == nspace_handlers[i].handler_proc)
8636                         return 1;
8637         }
8638         return 0;
8639 }
8640
8641 void
8642 nspace_handler_init(void)
8643 {
8644         nspace_lock_attr    = lck_attr_alloc_init();
8645         nspace_group_attr   = lck_grp_attr_alloc_init();
8646         nspace_mutex_group  = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
8647         lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
8648         lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
8649         memset(&nspace_items[0], 0, sizeof(nspace_items));
8650 }
8651
8652 void
8653 nspace_proc_exit(struct proc *p)
8654 {
8655         int i, event_mask = 0;
8656
8657         for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
8658                 if (p == nspace_handlers[i].handler_proc) {
8659                         event_mask |= nspace_item_flags_for_type(i);
8660                         nspace_handlers[i].handler_tid = 0;
8661                         nspace_handlers[i].handler_proc = NULL;
8662                 }
8663         }
8664
8665         if (event_mask == 0) {
8666                 return;
8667         }
8668
8669         if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
8670                 // if this process was the snapshot handler, zero snapshot_timeout
8671                 snapshot_timestamp = 0;
8672         }
8673
8674         //
8675         // unblock anyone that's waiting for the handler that died
8676         //
8677         lck_mtx_lock(&nspace_handler_lock);
8678         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8679                 if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
8680
8681                         if ( nspace_items[i].flags & event_mask ) {
8682
8683                                 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
8684                                         vnode_lock_spin(nspace_items[i].vp);
8685                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
8686                                         vnode_unlock(nspace_items[i].vp);
8687                                 }
8688                                 nspace_items[i].vp = NULL;
8689                                 nspace_items[i].vid = 0;
8690                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
8691                                 nspace_items[i].token = 0;
8692
8693                                 wakeup((caddr_t)&(nspace_items[i].vp));
8694                         }
8695                 }
8696         }
8697
8698         wakeup((caddr_t)&nspace_item_idx);
8699         lck_mtx_unlock(&nspace_handler_lock);
8700 }
8701
8702
/*
 * Convenience form of resolve_nspace_item_ext() for callers that have no
 * extra argument to pass: the arg parameter is NULL.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	int error;

	error = resolve_nspace_item_ext(vp, op, NULL);
	return error;
}
8708
8709 int
8710 resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
8711 {
8712         int i, error, keep_waiting;
8713         struct timespec ts;
8714         nspace_type_t nspace_type = nspace_type_for_op(op);
8715
8716         // only allow namespace events on regular files, directories and symlinks.
8717         if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
8718                 return 0;
8719         }
8720
8721         //
8722         // if this is a snapshot event and the vnode is on a
8723         // disk image just pretend nothing happened since any
8724         // change to the disk image will cause the disk image
8725         // itself to get backed up and this avoids multi-way
8726         // deadlocks between the snapshot handler and the ever
8727         // popular diskimages-helper process.  the variable
8728         // nspace_allow_virtual_devs allows this behavior to
8729         // be overridden (for use by the Mobile TimeMachine
8730         // testing infrastructure which uses disk images)
8731         //
8732         if (   (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
8733             && (vp->v_mount != NULL)
8734             && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
8735             && !nspace_allow_virtual_devs) {
8736
8737                 return 0;
8738         }
8739
8740         // if (thread_tid(current_thread()) == namespace_handler_tid) {
8741         if (nspace_handlers[nspace_type].handler_proc == NULL) {
8742                 return 0;
8743         }
8744
8745         if (nspace_is_special_process(current_proc())) {
8746                 return EDEADLK;
8747         }
8748
8749         lck_mtx_lock(&nspace_handler_lock);
8750
8751 retry:
8752         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8753                 if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
8754                         break;
8755                 }
8756         }
8757
8758         if (i >= MAX_NSPACE_ITEMS) {
8759                 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8760                         if (nspace_items[i].flags == 0) {
8761                                 break;
8762                         }
8763                 }
8764         } else {
8765                 nspace_items[i].refcount++;
8766         }
8767
8768         if (i >= MAX_NSPACE_ITEMS) {
8769                 ts.tv_sec = nspace_handler_timeout;
8770                 ts.tv_nsec = 0;
8771
8772                 error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
8773                 if (error == 0) {
8774                         // an entry got free'd up, go see if we can get a slot
8775                         goto retry;
8776                 } else {
8777                         lck_mtx_unlock(&nspace_handler_lock);
8778                         return error;
8779                 }
8780         }
8781
8782         //
8783         // if it didn't already exist, add it.  if it did exist
8784         // we'll get woken up when someone does a wakeup() on
8785         // the slot in the nspace_items table.
8786         //
8787         if (vp != nspace_items[i].vp) {
8788                 nspace_items[i].vp = vp;
8789                 nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg;  // arg is {NULL, true, uio *} - only pass uio thru to the user
8790                 nspace_items[i].op = op;
8791                 nspace_items[i].vid = vnode_vid(vp);
8792                 nspace_items[i].flags = NSPACE_ITEM_NEW;
8793                 nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
8794                 if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
8795                         if (arg) {
8796                                 vnode_lock_spin(vp);
8797                                 vp->v_flag |= VNEEDSSNAPSHOT;
8798                                 vnode_unlock(vp);
8799                         }
8800                 }
8801
8802                 nspace_items[i].token = 0;
8803                 nspace_items[i].refcount = 1;
8804
8805                 wakeup((caddr_t)&nspace_item_idx);
8806         }
8807
8808         //
8809         // Now go to sleep until the handler does a wakeup on this
8810         // slot in the nspace_items table (or we timeout).
8811         //
8812         keep_waiting = 1;
8813         while(keep_waiting) {
8814                 ts.tv_sec = nspace_handler_timeout;
8815                 ts.tv_nsec = 0;
8816                 error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);
8817
8818                 if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
8819                         error = 0;
8820                 } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
8821                         error = nspace_items[i].token;
8822                 } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
8823                         if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
8824                                 nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
8825                                 continue;
8826                         } else {
8827                                 error = ETIMEDOUT;
8828                         }
8829                 } else if (error == 0) {
8830                         // hmmm, why did we get woken up?
8831                         printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
8832                                nspace_items[i].token);
8833                 }
8834
8835                 if (--nspace_items[i].refcount == 0) {
8836                         nspace_items[i].vp = NULL;     // clear this so that no one will match on it again
8837                         nspace_items[i].arg = NULL;
8838                         nspace_items[i].token = 0;     // clear this so that the handler will not find it anymore
8839                         nspace_items[i].flags = 0;     // this clears it for re-use
8840                 }
8841                 wakeup(&nspace_token_id);
8842                 keep_waiting = 0;
8843         }
8844
8845         lck_mtx_unlock(&nspace_handler_lock);
8846
8847         return error;
8848 }
8849
8850
8851 int
8852 get_nspace_item_status(struct vnode *vp, int32_t *status)
8853 {
8854         int i;
8855
8856         lck_mtx_lock(&nspace_handler_lock);
8857         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
8858                 if (nspace_items[i].vp == vp) {
8859                         break;
8860                 }
8861         }
8862
8863         if (i >= MAX_NSPACE_ITEMS) {
8864                 lck_mtx_unlock(&nspace_handler_lock);
8865                 return ENOENT;
8866         }
8867
8868         *status = nspace_items[i].flags;
8869         lck_mtx_unlock(&nspace_handler_lock);
8870         return 0;
8871 }
8872
8873
8874 #if 0
8875 static int
8876 build_volfs_path(struct vnode *vp, char *path, int *len)
8877 {
8878         struct vnode_attr va;
8879         int ret;
8880
8881         VATTR_INIT(&va);
8882         VATTR_WANTED(&va, va_fsid);
8883         VATTR_WANTED(&va, va_fileid);
8884
8885         if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
8886                 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
8887                 ret = -1;
8888         } else {
8889                 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
8890                 ret = 0;
8891         }
8892
8893         return ret;
8894 }
8895 #endif
8896
8897 //
8898 // Note: this function does NOT check permissions on all of the
8899 // parent directories leading to this vnode.  It should only be
8900 // called on behalf of a root process.  Otherwise a process may
8901 // get access to a file because the file itself is readable even
8902 // though its parent directories would prevent access.
8903 //
8904 static int
8905 vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
8906 {
8907         int error, action;
8908
8909         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
8910                 return error;
8911         }
8912
8913 #if CONFIG_MACF
8914         error = mac_vnode_check_open(ctx, vp, fmode);
8915         if (error)
8916                 return error;
8917 #endif
8918
8919         /* compute action to be authorized */
8920         action = 0;
8921         if (fmode & FREAD) {
8922                 action |= KAUTH_VNODE_READ_DATA;
8923         }
8924         if (fmode & (FWRITE | O_TRUNC)) {
8925                 /*
8926                  * If we are writing, appending, and not truncating,
8927                  * indicate that we are appending so that if the
8928                  * UF_APPEND or SF_APPEND bits are set, we do not deny
8929                  * the open.
8930                  */
8931                 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
8932                         action |= KAUTH_VNODE_APPEND_DATA;
8933                 } else {
8934                         action |= KAUTH_VNODE_WRITE_DATA;
8935                 }
8936         }
8937
8938         if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
8939                 return error;
8940
8941
8942         //
8943         // if the vnode is tagged VOPENEVT and the current process
8944         // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
8945         // flag to the open mode so that this open won't count against
8946         // the vnode when carbon delete() does a vnode_isinuse() to see
8947         // if a file is currently in use.  this allows spotlight
8948         // importers to not interfere with carbon apps that depend on
8949         // the no-delete-if-busy semantics of carbon delete().
8950         //
8951         if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
8952                 fmode |= O_EVTONLY;
8953         }
8954
8955         if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
8956                 return error;
8957         }
8958         if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
8959                 VNOP_CLOSE(vp, fmode, ctx);
8960                 return error;
8961         }
8962
8963         /* Call out to allow 3rd party notification of open.
8964          * Ignore result of kauth_authorize_fileop call.
8965          */
8966 #if CONFIG_MACF
8967         mac_vnode_notify_open(ctx, vp, fmode);
8968 #endif
8969         kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
8970                                (uintptr_t)vp, 0);
8971
8972
8973         return 0;
8974 }
8975
8976 static int
8977 wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
8978 {
8979         int i, error=0, unblock=0;
8980         task_t curtask;
8981
8982         lck_mtx_lock(&nspace_handler_exclusion_lock);
8983         if (nspace_handlers[nspace_type].handler_busy) {
8984                 lck_mtx_unlock(&nspace_handler_exclusion_lock);
8985                 return EBUSY;
8986         }
8987         nspace_handlers[nspace_type].handler_busy = 1;
8988         lck_mtx_unlock(&nspace_handler_exclusion_lock);
8989
8990         /*
8991          * Any process that gets here will be one of the namespace handlers.
8992          * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
8993          * as we can cause deadlocks to occur, because the namespace handler may prevent
8994          * VNOP_INACTIVE from proceeding.  Mark the current task as a P_DEPENDENCY_CAPABLE
8995          * process.
8996          */
8997         curtask = current_task();
8998         bsd_set_dependency_capable (curtask);
8999
9000         lck_mtx_lock(&nspace_handler_lock);
9001         if (nspace_handlers[nspace_type].handler_proc == NULL) {
9002                 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
9003                 nspace_handlers[nspace_type].handler_proc = current_proc();
9004         }
9005
9006         while (error == 0) {
9007
9008                 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9009                         if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9010                                 if (!nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9011                                         continue;
9012                                 }
9013                                 break;
9014                         }
9015                 }
9016
9017                 if (i < MAX_NSPACE_ITEMS) {
9018                         nspace_items[i].flags  &= ~NSPACE_ITEM_NEW;
9019                         nspace_items[i].flags  |= NSPACE_ITEM_PROCESSING;
9020                         nspace_items[i].token  = ++nspace_token_id;
9021
9022                         if (nspace_items[i].vp) {
9023                                 struct fileproc *fp;
9024                                 int32_t indx, fmode;
9025                                 struct proc *p = current_proc();
9026                                 vfs_context_t ctx = vfs_context_current();
9027                                 struct vnode_attr va;
9028
9029
9030                                 /*
9031                                  * Use vnode pointer to acquire a file descriptor for
9032                                  * hand-off to userland
9033                                  */
9034                                 fmode = nspace_open_flags_for_type(nspace_type);
9035                                 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
9036                                 if (error) {
9037                                         unblock = 1;
9038                                         break;
9039                                 }
9040                                 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
9041                                 if (error) {
9042                                         unblock = 1;
9043                                         vnode_put(nspace_items[i].vp);
9044                                         break;
9045                                 }
9046
9047                                 if ((error = falloc(p, &fp, &indx, ctx))) {
9048                                         vn_close(nspace_items[i].vp, fmode, ctx);
9049                                         vnode_put(nspace_items[i].vp);
9050                                         unblock = 1;
9051                                         break;
9052                                 }
9053
9054                                 fp->f_fglob->fg_flag = fmode;
9055                                 fp->f_fglob->fg_ops = &vnops;
9056                                 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
9057
9058                                 proc_fdlock(p);
9059                                 procfdtbl_releasefd(p, indx, NULL);
9060                                 fp_drop(p, indx, fp, 1);
9061                                 proc_fdunlock(p);
9062
9063                                 /*
9064                                  * All variants of the namespace handler struct support these three fields:
9065                                  * token, flags, and the FD pointer
9066                                  */
9067                                 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
9068                                 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
9069                                 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
9070
9071                                 /*
9072                                  * Handle optional fields:
9073                                  * extended version support an info ptr (offset, length), and the
9074                                  *
9075                                  * namedata version supports a unique per-link object ID
9076                                  *
9077                                  */
9078                                 if (nhd->infoptr) {
9079                                         uio_t uio = (uio_t)nspace_items[i].arg;
9080                                         uint64_t u_offset, u_length;
9081
9082                                         if (uio) {
9083                                                 u_offset = uio_offset(uio);
9084                                                 u_length = uio_resid(uio);
9085                                         } else {
9086                                                 u_offset = 0;
9087                                                 u_length = 0;
9088                                         }
9089                                         error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
9090                                         error = copyout(&u_length, nhd->infoptr+sizeof(uint64_t), sizeof(uint64_t));
9091                                 }
9092
9093                                 if (nhd->objid) {
9094                                         VATTR_INIT(&va);
9095                                         VATTR_WANTED(&va, va_linkid);
9096                                         error = vnode_getattr(nspace_items[i].vp, &va, ctx);
9097                                         if (error == 0 ) {
9098                                                 uint64_t linkid = 0;
9099                                                 if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
9100                                                         linkid = (uint64_t)va.va_linkid;
9101                                                 }
9102                                                 error = copyout (&linkid, nhd->objid, sizeof(uint64_t));
9103                                         }
9104                                 }
9105
9106                                 if (error) {
9107                                         vn_close(nspace_items[i].vp, fmode, ctx);
9108                                         fp_free(p, indx, fp);
9109                                         unblock = 1;
9110                                 }
9111
9112                                 vnode_put(nspace_items[i].vp);
9113
9114                                 break;
9115                         } else {
9116                                 printf("wait_for_nspace_event: failed (nspace_items[%d] == %p error %d, name %s)\n",
9117                                        i, nspace_items[i].vp, error, nspace_items[i].vp->v_name);
9118                         }
9119
9120                 } else {
9121                         error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
9122                         if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9123                                 error = EINVAL;
9124                                 break;
9125                         }
9126
9127                 }
9128         }
9129
9130         if (unblock) {
9131                 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9132                         vnode_lock_spin(nspace_items[i].vp);
9133                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9134                         vnode_unlock(nspace_items[i].vp);
9135                 }
9136                 nspace_items[i].vp = NULL;
9137                 nspace_items[i].vid = 0;
9138                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9139                 nspace_items[i].token = 0;
9140
9141                 wakeup((caddr_t)&(nspace_items[i].vp));
9142         }
9143
9144         if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
9145                 // just go through every snapshot event and unblock it immediately.
9146                 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9147                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9148                                 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9149                                         if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9150                                                 nspace_items[i].vp = NULL;
9151                                                 nspace_items[i].vid = 0;
9152                                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9153                                                 nspace_items[i].token = 0;
9154
9155                                                 wakeup((caddr_t)&(nspace_items[i].vp));
9156                                         }
9157                                 }
9158                         }
9159                 }
9160         }
9161
9162         lck_mtx_unlock(&nspace_handler_lock);
9163
9164         lck_mtx_lock(&nspace_handler_exclusion_lock);
9165         nspace_handlers[nspace_type].handler_busy = 0;
9166         lck_mtx_unlock(&nspace_handler_exclusion_lock);
9167
9168         return error;
9169 }
9170
9171 static inline int validate_namespace_args (int is64bit, int size) {
9172
9173         if (is64bit) {
9174                 /* Must be one of these */
9175                 if (size == sizeof(user64_namespace_handler_info)) {
9176                         goto sizeok;
9177                 }
9178                 if (size == sizeof(user64_namespace_handler_info_ext)) {
9179                         goto sizeok;
9180                 }
9181                 if (size == sizeof(user64_namespace_handler_data)) {
9182                         goto sizeok;
9183                 }
9184                 return EINVAL;
9185         }
9186         else {
9187                 /* 32 bit -- must be one of these */
9188                 if (size == sizeof(user32_namespace_handler_info)) {
9189                         goto sizeok;
9190                 }
9191                 if (size == sizeof(user32_namespace_handler_info_ext)) {
9192                         goto sizeok;
9193                 }
9194                 if (size == sizeof(user32_namespace_handler_data)) {
9195                         goto sizeok;
9196                 }
9197                 return EINVAL;
9198         }
9199
9200 sizeok:
9201
9202         return 0;
9203
9204 }
9205
9206 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
9207 {
9208         int error = 0;
9209         namespace_handler_data nhd;
9210
9211         bzero (&nhd, sizeof(namespace_handler_data));
9212
9213         if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
9214                         (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9215                 return EINVAL;
9216         }
9217
9218         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9219                 return error;
9220         }
9221
9222         error = validate_namespace_args (is64bit, size);
9223         if (error) {
9224                 return error;
9225         }
9226
9227         /* Copy in the userland pointers into our kernel-only struct */
9228
9229         if (is64bit) {
9230                 /* 64 bit userland structures */
9231                 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
9232                 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
9233                 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
9234
9235                 /* If the size is greater than the standard info struct, add in extra fields */
9236                 if (size > (sizeof(user64_namespace_handler_info))) {
9237                         if (size >= (sizeof(user64_namespace_handler_info_ext))) {
9238                                 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
9239                         }
9240                         if (size == (sizeof(user64_namespace_handler_data))) {
9241                                 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
9242                         }
9243                         /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9244                 }
9245         }
9246         else {
9247                 /* 32 bit userland structures */
9248                 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
9249                 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
9250                 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
9251
9252                 if (size > (sizeof(user32_namespace_handler_info))) {
9253                         if (size >= (sizeof(user32_namespace_handler_info_ext))) {
9254                                 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
9255                         }
9256                         if (size == (sizeof(user32_namespace_handler_data))) {
9257                                 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
9258                         }
9259                         /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9260                 }
9261         }
9262
9263         return wait_for_namespace_event(&nhd, nspace_type);
9264 }
9265
9266 /*
9267  * Make a filesystem-specific control call:
9268  */
9269 /* ARGSUSED */
9270 static int
9271 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
9272 {
9273         int error=0;
9274         boolean_t is64bit;
9275         u_int size;
9276 #define STK_PARAMS 128
9277         char stkbuf[STK_PARAMS];
9278         caddr_t data, memp;
9279         vnode_t vp = *arg_vp;
9280
9281         size = IOCPARM_LEN(cmd);
9282         if (size > IOCPARM_MAX) return (EINVAL);
9283
9284         is64bit = proc_is64bit(p);
9285
9286         memp = NULL;
9287
9288
9289         /*
9290          * ensure the buffer is large enough for underlying calls
9291          */
9292 #ifndef HFSIOC_GETPATH
9293         typedef char pn_t[MAXPATHLEN];
9294 #define HFSIOC_GETPATH  _IOWR('h', 13, pn_t)
9295 #endif
9296
9297 #ifndef HFS_GETPATH
9298 #define HFS_GETPATH  IOCBASECMD(HFSIOC_GETPATH)
9299 #endif
9300         if (IOCBASECMD(cmd) == HFS_GETPATH) {
9301                 /* Round up to MAXPATHLEN regardless of user input */
9302                 size = MAXPATHLEN;
9303         }
9304
9305         if (size > sizeof (stkbuf)) {
9306                 if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
9307                 data = memp;
9308         } else {
9309                 data = &stkbuf[0];
9310         };
9311
9312         if (cmd & IOC_IN) {
9313                 if (size) {
9314                         error = copyin(udata, data, size);
9315                         if (error) {
9316                                 if (memp) {
9317                                         kfree (memp, size);
9318                                 }
9319                                 return error;
9320                         }
9321                 } else {
9322                         if (is64bit) {
9323                                 *(user_addr_t *)data = udata;
9324                         }
9325                         else {
9326                                 *(uint32_t *)data = (uint32_t)udata;
9327                         }
9328                 };
9329         } else if ((cmd & IOC_OUT) && size) {
9330                 /*
9331                  * Zero the buffer so the user always
9332                  * gets back something deterministic.
9333                  */
9334                 bzero(data, size);
9335         } else if (cmd & IOC_VOID) {
9336                 if (is64bit) {
9337                         *(user_addr_t *)data = udata;
9338                 }
9339                 else {
9340                         *(uint32_t *)data = (uint32_t)udata;
9341                 }
9342         }
9343
9344         /* Check to see if it's a generic command */
9345         switch (IOCBASECMD(cmd)) {
9346
9347                 case FSCTL_SYNC_VOLUME: {
9348                         mount_t mp = vp->v_mount;
9349                         int arg = *(uint32_t*)data;
9350
9351                         /* record vid of vp so we can drop it below. */
9352                         uint32_t vvid = vp->v_id;
9353
9354                         /*
9355                          * Then grab mount_iterref so that we can release the vnode.
9356                          * Without this, a thread may call vnode_iterate_prepare then
9357                          * get into a deadlock because we've never released the root vp
9358                          */
9359                         error = mount_iterref (mp, 0);
9360                         if (error)  {
9361                                 break;
9362                         }
9363                         vnode_put(vp);
9364
9365                         /* issue the sync for this volume */
9366                         (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
9367
9368                         /*
9369                          * Then release the mount_iterref once we're done syncing; it's not
9370                          * needed for the VNOP_IOCTL below
9371                          */
9372                         mount_iterdrop(mp);
9373
9374                         if (arg & FSCTL_SYNC_FULLSYNC) {
9375                                 /* re-obtain vnode iocount on the root vp, if possible */
9376                                 error = vnode_getwithvid (vp, vvid);
9377                                 if (error == 0) {
9378                                         error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
9379                                         vnode_put (vp);
9380                                 }
9381                         }
9382                         /* mark the argument VP as having been released */
9383                         *arg_vp = NULL;
9384                 }
9385                 break;
9386
9387                 case FSCTL_ROUTEFS_SETROUTEID: {
9388 #if ROUTEFS
9389                         char routepath[MAXPATHLEN];
9390                         size_t len = 0;
9391
9392                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9393                                 break;
9394                         }
9395                         bzero(routepath, MAXPATHLEN);
9396                         error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
9397                         if (error) {
9398                                 break;
9399                         }
9400                         error = routefs_kernel_mount(routepath);
9401                         if (error) {
9402                                 break;
9403                         }
9404 #endif
9405                 }
9406                 break;
9407
9408                 case FSCTL_SET_PACKAGE_EXTS: {
9409                         user_addr_t ext_strings;
9410                         uint32_t    num_entries;
9411                         uint32_t    max_width;
9412
9413                         if (   (is64bit && size != sizeof(user64_package_ext_info))
9414                                         || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
9415
9416                                 // either you're 64-bit and passed a 64-bit struct or
9417                                 // you're 32-bit and passed a 32-bit struct.  otherwise
9418                                 // it's not ok.
9419                                 error = EINVAL;
9420                                 break;
9421                         }
9422
9423                         if (is64bit) {
9424                                 ext_strings = ((user64_package_ext_info *)data)->strings;
9425                                 num_entries = ((user64_package_ext_info *)data)->num_entries;
9426                                 max_width   = ((user64_package_ext_info *)data)->max_width;
9427                         } else {
9428                                 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
9429                                 num_entries = ((user32_package_ext_info *)data)->num_entries;
9430                                 max_width   = ((user32_package_ext_info *)data)->max_width;
9431                         }
9432                         error = set_package_extensions_table(ext_strings, num_entries, max_width);
9433                 }
9434                 break;
9435
9436                 /* namespace handlers */
9437                 case FSCTL_NAMESPACE_HANDLER_GET: {
9438                         error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
9439                 }
9440                 break;
9441
9442                 /* Snapshot handlers */
9443                 case FSCTL_OLD_SNAPSHOT_HANDLER_GET: {
9444                         error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
9445                 }
9446                 break;
9447
9448                 case FSCTL_SNAPSHOT_HANDLER_GET_EXT: {
9449                         error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
9450                 }
9451                 break;
9452
9453                 case FSCTL_NAMESPACE_HANDLER_UPDATE: {
9454                         uint32_t token, val;
9455                         int i;
9456
9457                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
9458                                 break;
9459                         }
9460
9461                         if (!nspace_is_special_process(p)) {
9462                                 error = EINVAL;
9463                                 break;
9464                         }
9465
9466                         token = ((uint32_t *)data)[0];
9467                         val   = ((uint32_t *)data)[1];
9468
9469                         lck_mtx_lock(&nspace_handler_lock);
9470
9471                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9472                                 if (nspace_items[i].token == token) {
9473                                         break;  /* exit for loop, not case stmt */
9474                                 }
9475                         }
9476
9477                         if (i >= MAX_NSPACE_ITEMS) {
9478                                 error = ENOENT;
9479                         } else {
9480                                 //
9481                                 // if this bit is set, when resolve_nspace_item() times out
9482                                 // it will loop and go back to sleep.
9483                                 //
9484                                 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
9485                         }
9486
9487                         lck_mtx_unlock(&nspace_handler_lock);
9488
9489                         if (error) {
9490                                 printf("nspace-handler-update: did not find token %u\n", token);
9491                         }
9492                 }
9493                 break;
9494
9495                 case FSCTL_NAMESPACE_HANDLER_UNBLOCK: {
9496                         uint32_t token, val;
9497                         int i;
9498
9499                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
9500                                 break;
9501                         }
9502
9503                         if (!nspace_is_special_process(p)) {
9504                                 error = EINVAL;
9505                                 break;
9506                         }
9507
9508                         token = ((uint32_t *)data)[0];
9509                         val   = ((uint32_t *)data)[1];
9510
9511                         lck_mtx_lock(&nspace_handler_lock);
9512
9513                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9514                                 if (nspace_items[i].token == token) {
9515                                         break; /* exit for loop, not case statement */
9516                                 }
9517                         }
9518
9519                         if (i >= MAX_NSPACE_ITEMS) {
9520                                 printf("nspace-handler-unblock: did not find token %u\n", token);
9521                                 error = ENOENT;
9522                         } else {
9523                                 if (val == 0 && nspace_items[i].vp) {
9524                                         vnode_lock_spin(nspace_items[i].vp);
9525                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9526                                         vnode_unlock(nspace_items[i].vp);
9527                                 }
9528
9529                                 nspace_items[i].vp = NULL;
9530                                 nspace_items[i].arg = NULL;
9531                                 nspace_items[i].op = 0;
9532                                 nspace_items[i].vid = 0;
9533                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9534                                 nspace_items[i].token = 0;
9535
9536                                 wakeup((caddr_t)&(nspace_items[i].vp));
9537                         }
9538
9539                         lck_mtx_unlock(&nspace_handler_lock);
9540                 }
9541                 break;
9542
9543                 case FSCTL_NAMESPACE_HANDLER_CANCEL: {
9544                         uint32_t token, val;
9545                         int i;
9546
9547                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
9548                                 break;
9549                         }
9550
9551                         if (!nspace_is_special_process(p)) {
9552                                 error = EINVAL;
9553                                 break;
9554                         }
9555
9556                         token = ((uint32_t *)data)[0];
9557                         val   = ((uint32_t *)data)[1];
9558
9559                         lck_mtx_lock(&nspace_handler_lock);
9560
9561                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9562                                 if (nspace_items[i].token == token) {
9563                                         break;  /* exit for loop, not case stmt */
9564                                 }
9565                         }
9566
9567                         if (i >= MAX_NSPACE_ITEMS) {
9568                                 printf("nspace-handler-cancel: did not find token %u\n", token);
9569                                 error = ENOENT;
9570                         } else {
9571                                 if (nspace_items[i].vp) {
9572                                         vnode_lock_spin(nspace_items[i].vp);
9573                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9574                                         vnode_unlock(nspace_items[i].vp);
9575                                 }
9576
9577                                 nspace_items[i].vp = NULL;
9578                                 nspace_items[i].arg = NULL;
9579                                 nspace_items[i].vid = 0;
9580                                 nspace_items[i].token = val;
9581                                 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
9582                                 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
9583
9584                                 wakeup((caddr_t)&(nspace_items[i].vp));
9585                         }
9586
9587                         lck_mtx_unlock(&nspace_handler_lock);
9588                 }
9589                 break;
9590
9591                 case FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
9592                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9593                                 break;
9594                         }
9595
9596                         // we explicitly do not do the namespace_handler_proc check here
9597
9598                         lck_mtx_lock(&nspace_handler_lock);
9599                         snapshot_timestamp = ((uint32_t *)data)[0];
9600                         wakeup(&nspace_item_idx);
9601                         lck_mtx_unlock(&nspace_handler_lock);
9602                         printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
9603
9604                 }
9605                 break;
9606
9607                 case FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
9608                 {
9609                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9610                                 break;
9611                         }
9612
9613                         lck_mtx_lock(&nspace_handler_lock);
9614                         nspace_allow_virtual_devs = ((uint32_t *)data)[0];
9615                         lck_mtx_unlock(&nspace_handler_lock);
9616                         printf("nspace-snapshot-handler will%s allow events on disk-images\n",
9617                                         nspace_allow_virtual_devs ? "" : " NOT");
9618                         error = 0;
9619
9620                 }
9621                 break;
9622
9623                 case FSCTL_SET_FSTYPENAME_OVERRIDE:
9624                 {
9625                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9626                                 break;
9627                         }
9628                         if (vp->v_mount) {
9629                                 mount_lock(vp->v_mount);
9630                                 if (data[0] != 0) {
9631                                         strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
9632                                         vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
9633                                         if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
9634                                                 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
9635                                                 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
9636                                         }
9637                                 } else {
9638                                         if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
9639                                                 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
9640                                         }
9641                                         vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
9642                                         vp->v_mount->fstypename_override[0] = '\0';
9643                                 }
9644                                 mount_unlock(vp->v_mount);
9645                         }
9646                 }
9647                 break;
9648
9649                 default: {
9650                         /* Invoke the filesystem-specific code */
9651                         error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx);
9652                 }
9653
9654         } /* end switch stmt */
9655
9656         /*
9657          * if no errors, copy any data to user. Size was
9658          * already set and checked above.
9659          */
9660         if (error == 0 && (cmd & IOC_OUT) && size)
9661                 error = copyout(data, udata, size);
9662
9663         if (memp) {
9664                 kfree(memp, size);
9665         }
9666
9667         return error;
9668 }
9669
9670 /* ARGSUSED */
9671 int
9672 fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
9673 {
9674         int error;
9675         struct nameidata nd;
9676         u_long nameiflags;
9677         vnode_t vp = NULL;
9678         vfs_context_t ctx = vfs_context_current();
9679
9680         AUDIT_ARG(cmd, uap->cmd);
9681         AUDIT_ARG(value32, uap->options);
9682         /* Get the vnode for the file we are getting info on:  */
9683         nameiflags = 0;
9684         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
9685         NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
9686                UIO_USERSPACE, uap->path, ctx);
9687         if ((error = namei(&nd))) goto done;
9688         vp = nd.ni_vp;
9689         nameidone(&nd);
9690
9691 #if CONFIG_MACF
9692         error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
9693         if (error) {
9694                 goto done;
9695         }
9696 #endif
9697
9698         error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
9699
9700 done:
9701         if (vp)
9702                 vnode_put(vp);
9703         return error;
9704 }
9705 /* ARGSUSED */
9706 int
9707 ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
9708 {
9709         int error;
9710         vnode_t vp = NULL;
9711         vfs_context_t ctx = vfs_context_current();
9712         int fd = -1;
9713
9714         AUDIT_ARG(fd, uap->fd);
9715         AUDIT_ARG(cmd, uap->cmd);
9716         AUDIT_ARG(value32, uap->options);
9717
9718         /* Get the vnode for the file we are getting info on:  */
9719         if ((error = file_vnode(uap->fd, &vp)))
9720                 return error;
9721         fd = uap->fd;
9722         if ((error = vnode_getwithref(vp))) {
9723                 file_drop(fd);
9724                 return error;
9725         }
9726
9727 #if CONFIG_MACF
9728         if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
9729                 file_drop(fd);
9730                 vnode_put(vp);
9731                 return error;
9732         }
9733 #endif
9734
9735         error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
9736
9737         file_drop(fd);
9738
9739         /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
9740         if (vp) {
9741                 vnode_put(vp);
9742         }
9743
9744         return error;
9745 }
9746 /* end of fsctl system call */
9747
9748 /*
9749  *  Retrieve the data of an extended attribute.
9750  */
9751 int
9752 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
9753 {
9754         vnode_t vp;
9755         struct nameidata nd;
9756         char attrname[XATTR_MAXNAMELEN+1];
9757         vfs_context_t ctx = vfs_context_current();
9758         uio_t auio = NULL;
9759         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9760         size_t attrsize = 0;
9761         size_t namelen;
9762         u_int32_t nameiflags;
9763         int error;
9764         char uio_buf[ UIO_SIZEOF(1) ];
9765
9766         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9767                 return (EINVAL);
9768
9769         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9770         NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
9771         if ((error = namei(&nd))) {
9772                 return (error);
9773         }
9774         vp = nd.ni_vp;
9775         nameidone(&nd);
9776
9777         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9778                 goto out;
9779         }
9780         if (xattr_protected(attrname)) {
9781                 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
9782                         error = EPERM;
9783                         goto out;
9784                 }
9785         }
9786         /*
9787          * the specific check for 0xffffffff is a hack to preserve
9788          * binaray compatibilty in K64 with applications that discovered
9789          * that passing in a buf pointer and a size of -1 resulted in
9790          * just the size of the indicated extended attribute being returned.
9791          * this isn't part of the documented behavior, but because of the
9792          * original implemtation's check for "uap->size > 0", this behavior
9793          * was allowed. In K32 that check turned into a signed comparison
9794          * even though uap->size is unsigned...  in K64, we blow by that
9795          * check because uap->size is unsigned and doesn't get sign smeared
9796          * in the munger for a 32 bit user app.  we also need to add a
9797          * check to limit the maximum size of the buffer being passed in...
9798          * unfortunately, the underlying fileystems seem to just malloc
9799          * the requested size even if the actual extended attribute is tiny.
9800          * because that malloc is for kernel wired memory, we have to put a
9801          * sane limit on it.
9802          *
9803          * U32 running on K64 will yield 0x00000000ffffffff for uap->size
9804          * U64 running on K64 will yield -1 (64 bits wide)
9805          * U32/U64 running on K32 will yield -1 (32 bits wide)
9806          */
9807         if (uap->size == 0xffffffff || uap->size == (size_t)-1)
9808                 goto no_uio;
9809
9810         if (uap->value) {
9811                 if (uap->size > (size_t)XATTR_MAXSIZE)
9812                         uap->size = XATTR_MAXSIZE;
9813
9814                 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9815                                             &uio_buf[0], sizeof(uio_buf));
9816                 uio_addiov(auio, uap->value, uap->size);
9817         }
9818 no_uio:
9819         error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
9820 out:
9821         vnode_put(vp);
9822
9823         if (auio) {
9824                 *retval = uap->size - uio_resid(auio);
9825         } else {
9826                 *retval = (user_ssize_t)attrsize;
9827         }
9828
9829         return (error);
9830 }
9831
9832 /*
9833  * Retrieve the data of an extended attribute.
9834  */
9835 int
9836 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
9837 {
9838         vnode_t vp;
9839         char attrname[XATTR_MAXNAMELEN+1];
9840         uio_t auio = NULL;
9841         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9842         size_t attrsize = 0;
9843         size_t namelen;
9844         int error;
9845         char uio_buf[ UIO_SIZEOF(1) ];
9846
9847         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9848                 return (EINVAL);
9849
9850         if ( (error = file_vnode(uap->fd, &vp)) ) {
9851                 return (error);
9852         }
9853         if ( (error = vnode_getwithref(vp)) ) {
9854                 file_drop(uap->fd);
9855                 return(error);
9856         }
9857         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9858                 goto out;
9859         }
9860         if (xattr_protected(attrname)) {
9861                 error = EPERM;
9862                 goto out;
9863         }
9864         if (uap->value && uap->size > 0) {
9865                 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
9866                                             &uio_buf[0], sizeof(uio_buf));
9867                 uio_addiov(auio, uap->value, uap->size);
9868         }
9869
9870         error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
9871 out:
9872         (void)vnode_put(vp);
9873         file_drop(uap->fd);
9874
9875         if (auio) {
9876                 *retval = uap->size - uio_resid(auio);
9877         } else {
9878                 *retval = (user_ssize_t)attrsize;
9879         }
9880         return (error);
9881 }
9882
9883 /*
9884  * Set the data of an extended attribute.
9885  */
9886 int
9887 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
9888 {
9889         vnode_t vp;
9890         struct nameidata nd;
9891         char attrname[XATTR_MAXNAMELEN+1];
9892         vfs_context_t ctx = vfs_context_current();
9893         uio_t auio = NULL;
9894         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9895         size_t namelen;
9896         u_int32_t nameiflags;
9897         int error;
9898         char uio_buf[ UIO_SIZEOF(1) ];
9899
9900         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
9901                 return (EINVAL);
9902
9903         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9904                 if (error == EPERM) {
9905                         /* if the string won't fit in attrname, copyinstr emits EPERM */
9906                         return (ENAMETOOLONG);
9907                 }
9908                 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
9909                 return error;
9910         }
9911         if (xattr_protected(attrname))
9912                 return(EPERM);
9913         if (uap->size != 0 && uap->value == 0) {
9914                 return (EINVAL);
9915         }
9916
9917         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
9918         NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
9919         if ((error = namei(&nd))) {
9920                 return (error);
9921         }
9922         vp = nd.ni_vp;
9923         nameidone(&nd);
9924
9925         auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9926                                     &uio_buf[0], sizeof(uio_buf));
9927         uio_addiov(auio, uap->value, uap->size);
9928
9929         error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
9930 #if CONFIG_FSE
9931         if (error == 0) {
9932                 add_fsevent(FSE_XATTR_MODIFIED, ctx,
9933                     FSE_ARG_VNODE, vp,
9934                     FSE_ARG_DONE);
9935         }
9936 #endif
9937         vnode_put(vp);
9938         *retval = 0;
9939         return (error);
9940 }
9941
9942 /*
9943  * Set the data of an extended attribute.
9944  */
9945 int
9946 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
9947 {
9948         vnode_t vp;
9949         char attrname[XATTR_MAXNAMELEN+1];
9950         uio_t auio = NULL;
9951         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9952         size_t namelen;
9953         int error;
9954         char uio_buf[ UIO_SIZEOF(1) ];
9955 #if CONFIG_FSE
9956         vfs_context_t ctx = vfs_context_current();
9957 #endif
9958
9959         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
9960                 return (EINVAL);
9961
9962         if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
9963                 if (error == EPERM) {
9964                         /* if the string won't fit in attrname, copyinstr emits EPERM */
9965                         return (ENAMETOOLONG);
9966                 }
9967                 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
9968                 return error;
9969         }
9970         if (xattr_protected(attrname))
9971                 return(EPERM);
9972         if (uap->size != 0 && uap->value == 0) {
9973                 return (EINVAL);
9974         }
9975         if ( (error = file_vnode(uap->fd, &vp)) ) {
9976                 return (error);
9977         }
9978         if ( (error = vnode_getwithref(vp)) ) {
9979                 file_drop(uap->fd);
9980                 return(error);
9981         }
9982         auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
9983                                     &uio_buf[0], sizeof(uio_buf));
9984         uio_addiov(auio, uap->value, uap->size);
9985
9986         error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
9987 #if CONFIG_FSE
9988         if (error == 0) {
9989                 add_fsevent(FSE_XATTR_MODIFIED, ctx,
9990                     FSE_ARG_VNODE, vp,
9991                     FSE_ARG_DONE);
9992         }
9993 #endif
9994         vnode_put(vp);
9995         file_drop(uap->fd);
9996         *retval = 0;
9997         return (error);
9998 }
9999
10000 /*
10001  * Remove an extended attribute.
10002  * XXX Code duplication here.
10003  */
10004 int
10005 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
10006 {
10007         vnode_t vp;
10008         struct nameidata nd;
10009         char attrname[XATTR_MAXNAMELEN+1];
10010         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10011         vfs_context_t ctx = vfs_context_current();
10012         size_t namelen;
10013         u_int32_t nameiflags;
10014         int error;
10015
10016         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10017                 return (EINVAL);
10018
10019         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10020         if (error != 0) {
10021                 return (error);
10022         }
10023         if (xattr_protected(attrname))
10024                 return(EPERM);
10025         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10026         NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
10027         if ((error = namei(&nd))) {
10028                 return (error);
10029         }
10030         vp = nd.ni_vp;
10031         nameidone(&nd);
10032
10033         error = vn_removexattr(vp, attrname, uap->options, ctx);
10034 #if CONFIG_FSE
10035         if (error == 0) {
10036                 add_fsevent(FSE_XATTR_REMOVED, ctx,
10037                     FSE_ARG_VNODE, vp,
10038                     FSE_ARG_DONE);
10039         }
10040 #endif
10041         vnode_put(vp);
10042         *retval = 0;
10043         return (error);
10044 }
10045
10046 /*
10047  * Remove an extended attribute.
10048  * XXX Code duplication here.
10049  */
10050 int
10051 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
10052 {
10053         vnode_t vp;
10054         char attrname[XATTR_MAXNAMELEN+1];
10055         size_t namelen;
10056         int error;
10057 #if CONFIG_FSE
10058         vfs_context_t ctx = vfs_context_current();
10059 #endif
10060
10061         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10062                 return (EINVAL);
10063
10064         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10065         if (error != 0) {
10066                 return (error);
10067         }
10068         if (xattr_protected(attrname))
10069                 return(EPERM);
10070         if ( (error = file_vnode(uap->fd, &vp)) ) {
10071                 return (error);
10072         }
10073         if ( (error = vnode_getwithref(vp)) ) {
10074                 file_drop(uap->fd);
10075                 return(error);
10076         }
10077
10078         error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
10079 #if CONFIG_FSE
10080         if (error == 0) {
10081                 add_fsevent(FSE_XATTR_REMOVED, ctx,
10082                     FSE_ARG_VNODE, vp,
10083                     FSE_ARG_DONE);
10084         }
10085 #endif
10086         vnode_put(vp);
10087         file_drop(uap->fd);
10088         *retval = 0;
10089         return (error);
10090 }
10091
10092 /*
10093  * Retrieve the list of extended attribute names.
10094  * XXX Code duplication here.
10095  */
10096 int
10097 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
10098 {
10099         vnode_t vp;
10100         struct nameidata nd;
10101         vfs_context_t ctx = vfs_context_current();
10102         uio_t auio = NULL;
10103         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10104         size_t attrsize = 0;
10105         u_int32_t nameiflags;
10106         int error;
10107         char uio_buf[ UIO_SIZEOF(1) ];
10108
10109         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10110                 return (EINVAL);
10111
10112         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10113         NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
10114         if ((error = namei(&nd))) {
10115                 return (error);
10116         }
10117         vp = nd.ni_vp;
10118         nameidone(&nd);
10119         if (uap->namebuf != 0 && uap->bufsize > 0) {
10120                 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
10121                                             &uio_buf[0], sizeof(uio_buf));
10122                 uio_addiov(auio, uap->namebuf, uap->bufsize);
10123         }
10124
10125         error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
10126
10127         vnode_put(vp);
10128         if (auio) {
10129                 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10130         } else {
10131                 *retval = (user_ssize_t)attrsize;
10132         }
10133         return (error);
10134 }
10135
10136 /*
10137  * Retrieve the list of extended attribute names.
10138  * XXX Code duplication here.
10139  */
10140 int
10141 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
10142 {
10143         vnode_t vp;
10144         uio_t auio = NULL;
10145         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10146         size_t attrsize = 0;
10147         int error;
10148         char uio_buf[ UIO_SIZEOF(1) ];
10149
10150         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10151                 return (EINVAL);
10152
10153         if ( (error = file_vnode(uap->fd, &vp)) ) {
10154                 return (error);
10155         }
10156         if ( (error = vnode_getwithref(vp)) ) {
10157                 file_drop(uap->fd);
10158                 return(error);
10159         }
10160         if (uap->namebuf != 0 && uap->bufsize > 0) {
10161                 auio = uio_createwithbuffer(1, 0, spacetype,
10162                                                                           UIO_READ, &uio_buf[0], sizeof(uio_buf));
10163                 uio_addiov(auio, uap->namebuf, uap->bufsize);
10164         }
10165
10166         error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
10167
10168         vnode_put(vp);
10169         file_drop(uap->fd);
10170         if (auio) {
10171                 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10172         } else {
10173                 *retval = (user_ssize_t)attrsize;
10174         }
10175         return (error);
10176 }
10177
10178 static int fsgetpath_internal(
10179         vfs_context_t ctx, int volfs_id, uint64_t objid,
10180         vm_size_t bufsize, caddr_t buf, int *pathlen)
10181 {
10182         int error;
10183         struct mount *mp = NULL;
10184         vnode_t vp;
10185         int length;
10186         int bpflags;
10187
10188         if (bufsize > PAGE_SIZE) {
10189                 return (EINVAL);
10190         }
10191
10192         if (buf == NULL) {
10193                 return (ENOMEM);
10194         }
10195
10196         if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
10197                 error = ENOTSUP;  /* unexpected failure */
10198                 return ENOTSUP;
10199         }
10200
10201 unionget:
10202         if (objid == 2) {
10203                 error = VFS_ROOT(mp, &vp, ctx);
10204         } else {
10205                 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
10206         }
10207
10208         if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
10209                 /*
10210                  * If the fileid isn't found and we're in a union
10211                  * mount volume, then see if the fileid is in the
10212                  * mounted-on volume.
10213                  */
10214                 struct mount *tmp = mp;
10215                 mp = vnode_mount(tmp->mnt_vnodecovered);
10216                 vfs_unbusy(tmp);
10217                 if (vfs_busy(mp, LK_NOWAIT) == 0)
10218                         goto unionget;
10219         } else {
10220                 vfs_unbusy(mp);
10221         }
10222
10223         if (error) {
10224                 return error;
10225         }
10226
10227 #if CONFIG_MACF
10228         error = mac_vnode_check_fsgetpath(ctx, vp);
10229         if (error) {
10230                 vnode_put(vp);
10231                 return error;
10232         }
10233 #endif
10234
10235         /* Obtain the absolute path to this vnode. */
10236         bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
10237         bpflags |= BUILDPATH_CHECK_MOVED;
10238         error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
10239         vnode_put(vp);
10240
10241         if (error) {
10242                 goto out;
10243         }
10244
10245         AUDIT_ARG(text, buf);
10246
10247         if (kdebug_enable) {
10248                 long dbg_parms[NUMPARMS];
10249                 int  dbg_namelen;
10250
10251                 dbg_namelen = (int)sizeof(dbg_parms);
10252
10253         if (length < dbg_namelen) {
10254                         memcpy((char *)dbg_parms, buf, length);
10255                         memset((char *)dbg_parms + length, 0, dbg_namelen - length);
10256
10257                         dbg_namelen = length;
10258                 } else {
10259                         memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
10260                 }
10261
10262                 kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
10263         }
10264
10265         *pathlen = (user_ssize_t)length; /* may be superseded by error */
10266
10267 out:
10268         return (error);
10269 }
10270
10271 /*
10272  * Obtain the full pathname of a file system object by id.
10273  *
10274  * This is a private SPI used by the File Manager.
10275  */
10276 __private_extern__
10277 int
10278 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
10279 {
10280         vfs_context_t ctx = vfs_context_current();
10281         fsid_t fsid;
10282         char *realpath;
10283         int length;
10284         int error;
10285
10286         if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
10287                 return (error);
10288         }
10289         AUDIT_ARG(value32, fsid.val[0]);
10290         AUDIT_ARG(value64, uap->objid);
10291         /* Restrict output buffer size for now. */
10292
10293         if (uap->bufsize > PAGE_SIZE) {
10294                 return (EINVAL);
10295         }
10296         MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
10297         if (realpath == NULL) {
10298                 return (ENOMEM);
10299         }
10300
10301         error = fsgetpath_internal(
10302                 ctx, fsid.val[0], uap->objid,
10303                 uap->bufsize, realpath, &length);
10304
10305         if (error) {
10306                 goto out;
10307         }
10308
10309         error = copyout((caddr_t)realpath, uap->buf, length);
10310
10311         *retval = (user_ssize_t)length; /* may be superseded by error */
10312 out:
10313         if (realpath) {
10314                 FREE(realpath, M_TEMP);
10315         }
10316         return (error);
10317 }
10318
10319 /*
10320  * Common routine to handle various flavors of statfs data heading out
10321  *      to user space.
10322  *
10323  * Returns:     0                       Success
10324  *              EFAULT
10325  */
10326 static int
10327 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
10328     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
10329     boolean_t partial_copy)
10330 {
10331         int             error;
10332         int             my_size, copy_size;
10333
10334         if (is_64_bit) {
10335                 struct user64_statfs sfs;
10336                 my_size = copy_size = sizeof(sfs);
10337                 bzero(&sfs, my_size);
10338                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
10339                 sfs.f_type = mp->mnt_vtable->vfc_typenum;
10340                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
10341                 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
10342                 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
10343                 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
10344                 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
10345                 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
10346                 sfs.f_files = (user64_long_t)sfsp->f_files;
10347                 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
10348                 sfs.f_fsid = sfsp->f_fsid;
10349                 sfs.f_owner = sfsp->f_owner;
10350                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
10351                         strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
10352                 } else {
10353                         strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
10354                 }
10355                 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
10356                 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
10357
10358                 if (partial_copy) {
10359                         copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
10360                 }
10361                 error = copyout((caddr_t)&sfs, bufp, copy_size);
10362         }
10363         else {
10364                 struct user32_statfs sfs;
10365
10366                 my_size = copy_size = sizeof(sfs);
10367                 bzero(&sfs, my_size);
10368
10369                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
10370                 sfs.f_type = mp->mnt_vtable->vfc_typenum;
10371                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
10372
10373                 /*
10374                  * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
10375                  * have to fudge the numbers here in that case.   We inflate the blocksize in order
10376                  * to reflect the filesystem size as best we can.
10377                  */
10378                 if ((sfsp->f_blocks > INT_MAX)
10379                         /* Hack for 4061702 . I think the real fix is for Carbon to
10380                          * look for some volume capability and not depend on hidden
10381                          * semantics agreed between a FS and carbon.
10382                          * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
10383                          * for Carbon to set bNoVolumeSizes volume attribute.
10384                          * Without this the webdavfs files cannot be copied onto
10385                          * disk as they look huge. This change should not affect
10386                          * XSAN as they should not setting these to -1..
10387                          */
10388                          && (sfsp->f_blocks != 0xffffffffffffffffULL)
10389                          && (sfsp->f_bfree != 0xffffffffffffffffULL)
10390                          && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
10391                         int             shift;
10392
10393                         /*
10394                          * Work out how far we have to shift the block count down to make it fit.
10395                          * Note that it's possible to have to shift so far that the resulting
10396                          * blocksize would be unreportably large.  At that point, we will clip
10397                          * any values that don't fit.
10398                          *
10399                          * For safety's sake, we also ensure that f_iosize is never reported as
10400                          * being smaller than f_bsize.
10401                          */
10402                         for (shift = 0; shift < 32; shift++) {
10403                                 if ((sfsp->f_blocks >> shift) <= INT_MAX)
10404                                         break;
10405                                 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
10406                                         break;
10407                         }
10408 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
10409                         sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
10410                         sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
10411                         sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
10412 #undef __SHIFT_OR_CLIP
10413                         sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
10414                         sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
10415                 } else {
10416                         /* filesystem is small enough to be reported honestly */
10417                         sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
10418                         sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
10419                         sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
10420                         sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
10421                         sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
10422                 }
10423                 sfs.f_files = (user32_long_t)sfsp->f_files;
10424                 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
10425                 sfs.f_fsid = sfsp->f_fsid;
10426                 sfs.f_owner = sfsp->f_owner;
10427                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
10428                         strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
10429                 } else {
10430                         strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
10431                 }
10432                 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
10433                 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
10434
10435                 if (partial_copy) {
10436                         copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
10437                 }
10438                 error = copyout((caddr_t)&sfs, bufp, copy_size);
10439         }
10440
10441         if (sizep != NULL) {
10442                 *sizep = my_size;
10443         }
10444         return(error);
10445 }
10446
10447 /*
10448  * copy stat structure into user_stat structure.
10449  */
10450 void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
10451 {
10452         bzero(usbp, sizeof(*usbp));
10453
10454         usbp->st_dev = sbp->st_dev;
10455         usbp->st_ino = sbp->st_ino;
10456         usbp->st_mode = sbp->st_mode;
10457         usbp->st_nlink = sbp->st_nlink;
10458         usbp->st_uid = sbp->st_uid;
10459         usbp->st_gid = sbp->st_gid;
10460         usbp->st_rdev = sbp->st_rdev;
10461 #ifndef _POSIX_C_SOURCE
10462         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10463         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10464         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10465         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10466         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10467         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10468 #else
10469         usbp->st_atime = sbp->st_atime;
10470         usbp->st_atimensec = sbp->st_atimensec;
10471         usbp->st_mtime = sbp->st_mtime;
10472         usbp->st_mtimensec = sbp->st_mtimensec;
10473         usbp->st_ctime = sbp->st_ctime;
10474         usbp->st_ctimensec = sbp->st_ctimensec;
10475 #endif
10476         usbp->st_size = sbp->st_size;
10477         usbp->st_blocks = sbp->st_blocks;
10478         usbp->st_blksize = sbp->st_blksize;
10479         usbp->st_flags = sbp->st_flags;
10480         usbp->st_gen = sbp->st_gen;
10481         usbp->st_lspare = sbp->st_lspare;
10482         usbp->st_qspare[0] = sbp->st_qspare[0];
10483         usbp->st_qspare[1] = sbp->st_qspare[1];
10484 }
10485
10486 void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
10487 {
10488         bzero(usbp, sizeof(*usbp));
10489
10490         usbp->st_dev = sbp->st_dev;
10491         usbp->st_ino = sbp->st_ino;
10492         usbp->st_mode = sbp->st_mode;
10493         usbp->st_nlink = sbp->st_nlink;
10494         usbp->st_uid = sbp->st_uid;
10495         usbp->st_gid = sbp->st_gid;
10496         usbp->st_rdev = sbp->st_rdev;
10497 #ifndef _POSIX_C_SOURCE
10498         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10499         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10500         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10501         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10502         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10503         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10504 #else
10505         usbp->st_atime = sbp->st_atime;
10506         usbp->st_atimensec = sbp->st_atimensec;
10507         usbp->st_mtime = sbp->st_mtime;
10508         usbp->st_mtimensec = sbp->st_mtimensec;
10509         usbp->st_ctime = sbp->st_ctime;
10510         usbp->st_ctimensec = sbp->st_ctimensec;
10511 #endif
10512         usbp->st_size = sbp->st_size;
10513         usbp->st_blocks = sbp->st_blocks;
10514         usbp->st_blksize = sbp->st_blksize;
10515         usbp->st_flags = sbp->st_flags;
10516         usbp->st_gen = sbp->st_gen;
10517         usbp->st_lspare = sbp->st_lspare;
10518         usbp->st_qspare[0] = sbp->st_qspare[0];
10519         usbp->st_qspare[1] = sbp->st_qspare[1];
10520 }
10521
10522 /*
10523  * copy stat64 structure into user_stat64 structure.
10524  */
10525 void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
10526 {
10527         bzero(usbp, sizeof(*usbp));
10528
10529         usbp->st_dev = sbp->st_dev;
10530         usbp->st_ino = sbp->st_ino;
10531         usbp->st_mode = sbp->st_mode;
10532         usbp->st_nlink = sbp->st_nlink;
10533         usbp->st_uid = sbp->st_uid;
10534         usbp->st_gid = sbp->st_gid;
10535         usbp->st_rdev = sbp->st_rdev;
10536 #ifndef _POSIX_C_SOURCE
10537         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10538         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10539         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10540         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10541         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10542         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10543         usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
10544         usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
10545 #else
10546         usbp->st_atime = sbp->st_atime;
10547         usbp->st_atimensec = sbp->st_atimensec;
10548         usbp->st_mtime = sbp->st_mtime;
10549         usbp->st_mtimensec = sbp->st_mtimensec;
10550         usbp->st_ctime = sbp->st_ctime;
10551         usbp->st_ctimensec = sbp->st_ctimensec;
10552         usbp->st_birthtime = sbp->st_birthtime;
10553         usbp->st_birthtimensec = sbp->st_birthtimensec;
10554 #endif
10555         usbp->st_size = sbp->st_size;
10556         usbp->st_blocks = sbp->st_blocks;
10557         usbp->st_blksize = sbp->st_blksize;
10558         usbp->st_flags = sbp->st_flags;
10559         usbp->st_gen = sbp->st_gen;
10560         usbp->st_lspare = sbp->st_lspare;
10561         usbp->st_qspare[0] = sbp->st_qspare[0];
10562         usbp->st_qspare[1] = sbp->st_qspare[1];
10563 }
10564
10565 void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
10566 {
10567         bzero(usbp, sizeof(*usbp));
10568
10569         usbp->st_dev = sbp->st_dev;
10570         usbp->st_ino = sbp->st_ino;
10571         usbp->st_mode = sbp->st_mode;
10572         usbp->st_nlink = sbp->st_nlink;
10573         usbp->st_uid = sbp->st_uid;
10574         usbp->st_gid = sbp->st_gid;
10575         usbp->st_rdev = sbp->st_rdev;
10576 #ifndef _POSIX_C_SOURCE
10577         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
10578         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
10579         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
10580         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
10581         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
10582         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
10583         usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
10584         usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
10585 #else
10586         usbp->st_atime = sbp->st_atime;
10587         usbp->st_atimensec = sbp->st_atimensec;
10588         usbp->st_mtime = sbp->st_mtime;
10589         usbp->st_mtimensec = sbp->st_mtimensec;
10590         usbp->st_ctime = sbp->st_ctime;
10591         usbp->st_ctimensec = sbp->st_ctimensec;
10592         usbp->st_birthtime = sbp->st_birthtime;
10593         usbp->st_birthtimensec = sbp->st_birthtimensec;
10594 #endif
10595         usbp->st_size = sbp->st_size;
10596         usbp->st_blocks = sbp->st_blocks;
10597         usbp->st_blksize = sbp->st_blksize;
10598         usbp->st_flags = sbp->st_flags;
10599         usbp->st_gen = sbp->st_gen;
10600         usbp->st_lspare = sbp->st_lspare;
10601         usbp->st_qspare[0] = sbp->st_qspare[0];
10602         usbp->st_qspare[1] = sbp->st_qspare[1];
10603 }
10604
10605 /*
10606  * Purge buffer cache for simulating cold starts
10607  */
10608 static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
10609 {
10610         ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
10611
10612         return VNODE_RETURNED;
10613 }
10614
10615 static int vfs_purge_callback(mount_t mp, __unused void * arg)
10616 {
10617         vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
10618
10619         return VFS_RETURNED;
10620 }
10621
10622 int
10623 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
10624 {
10625         if (!kauth_cred_issuser(kauth_cred_get()))
10626                 return EPERM;
10627
10628         vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
10629
10630         return 0;
10631 }
10632