/*
 * bsd/vfs/vfs_syscalls.c — Apple XNU source (xnu-6153.141.1).
 */
1 /*
2 * Copyright (c) 1995-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <pexpert/pexpert.h>
127 #include <IOKit/IOBSD.h>
128
129 // deps for MIG call
130 #include <kern/host.h>
131 #include <kern/ipc_misc.h>
132 #include <mach/host_priv.h>
133 #include <mach/vfs_nspace.h>
134 #include <os/log.h>
135
136 #include <nfs/nfs_conf.h>
137
138 #if ROUTEFS
139 #include <miscfs/routefs/routefs.h>
140 #endif /* ROUTEFS */
141
142 #if CONFIG_MACF
143 #include <security/mac.h>
144 #include <security/mac_framework.h>
145 #endif
146
147 #if CONFIG_FSE
148 #define GET_PATH(x) \
149 (x) = get_pathbuff();
150 #define RELEASE_PATH(x) \
151 release_pathbuff(x);
152 #else
153 #define GET_PATH(x) \
154 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
155 #define RELEASE_PATH(x) \
156 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
157 #endif /* CONFIG_FSE */
158
159 #ifndef HFS_GET_BOOT_INFO
160 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
161 #endif
162
163 #ifndef HFS_SET_BOOT_INFO
164 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
165 #endif
166
167 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
168 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
169 #endif
170
171 extern void disk_conditioner_unmount(mount_t mp);
172
173 /* struct for checkdirs iteration */
174 struct cdirargs {
175 vnode_t olddp;
176 vnode_t newdp;
177 };
178 /* callback for checkdirs iteration */
179 static int checkdirs_callback(proc_t p, void * arg);
180
181 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
182 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
183 void enablequotas(struct mount *mp, vfs_context_t ctx);
184 static int getfsstat_callback(mount_t mp, void * arg);
185 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
186 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
187 static int sync_callback(mount_t, void *);
188 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
189 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
190 boolean_t partial_copy);
191 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
192 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
193 struct componentname *cnp, user_addr_t fsmountargs,
194 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
195 vfs_context_t ctx);
196 void vfs_notify_mount(vnode_t pdvp);
197
198 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
199
200 struct fd_vn_data * fg_vn_data_alloc(void);
201
202 /*
203 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
204 * Concurrent lookups (or lookups by ids) on hard links can cause the
205 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
206 * does) to return ENOENT as the path cannot be returned from the name cache
207 * alone. We have no option but to retry and hope to get one namei->reverse path
208 * generation done without an intervening lookup, lookup by id on the hard link
209 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
210 * which currently are the MAC hooks for rename, unlink and rmdir.
211 */
212 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
213
214 /* Max retry limit for rename due to vnode recycling. */
215 #define MAX_RENAME_ERECYCLE_RETRIES 1024
216
217 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
218 int unlink_flags);
219
220 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
221
222 #ifdef CONFIG_IMGSRC_ACCESS
223 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
224 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
225 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
226 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
227 static void mount_end_update(mount_t mp);
228 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
229 #endif /* CONFIG_IMGSRC_ACCESS */
230
231 #if CONFIG_LOCKERBOOT
232 int mount_locker_protoboot(const char *fsname, const char *mntpoint,
233 const char *pbdevpath);
234 #endif
235
236 //snapshot functions
237 #if CONFIG_MNT_ROOTSNAP
238 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
239 #else
240 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
241 #endif
242
243 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
244
245 __private_extern__
246 int sync_internal(void);
247
248 __private_extern__
249 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
250
251 extern lck_grp_t *fd_vn_lck_grp;
252 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
253 extern lck_attr_t *fd_vn_lck_attr;
254
255 /*
256 * incremented each time a mount or unmount operation occurs
257 * used to invalidate the cached value of the rootvp in the
258 * mount structure utilized by cache_lookup_path
259 */
260 uint32_t mount_generation = 0;
261
262 /* counts number of mount and unmount operations */
263 unsigned int vfs_nummntops = 0;
264
265 extern const struct fileops vnops;
266 #if CONFIG_APPLEDOUBLE
267 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
268 #endif /* CONFIG_APPLEDOUBLE */
269
270 /*
271 * Virtual File System System Calls
272 */
273
274 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
275 /*
276 * Private in-kernel mounting spi (NFS only, not exported)
277 */
278 __private_extern__
279 boolean_t
280 vfs_iskernelmount(mount_t mp)
281 {
282 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
283 }
284
/*
 * kernel_mount
 *
 * Private in-kernel mount entry point (NFS / devfs / routefs only).
 *
 * If 'vp' is NULLVP, 'path' is looked up to find the vnode to cover and
 * its parent; namei() then holds iocounts on both, which are dropped
 * here after mount_common() returns.  If the caller supplies 'vp' (and
 * 'pvp'), the caller retains ownership of those references and 'path'
 * is only used to fill in the componentname passed down.
 *
 * Returns: 0 on success, errno on failure.
 */
__private_extern__
int
kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/* path is a kernel pointer, hence UIO_SYSSPACE + CAST_USER_ADDR_T */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log the failure for boot-critical role mounts only */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_DATAVOL)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller provided the covered vnode; just hand the path to
		 * mount_common() through the componentname.
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
		did_namei = FALSE;
	}

	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, TRUE, ctx);

	/* Release the iocounts namei() took (WANTPARENT => both vp and pvp) */
	if (did_namei) {
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
#endif /* CONFIG_NFS_CLIENT || DEVFS || ROUTEFS */
331
332 /*
333 * Mount a file system.
334 */
335 /* ARGSUSED */
336 int
337 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
338 {
339 struct __mac_mount_args muap;
340
341 muap.type = uap->type;
342 muap.path = uap->path;
343 muap.flags = uap->flags;
344 muap.data = uap->data;
345 muap.mac_p = USER_ADDR_NULL;
346 return __mac_mount(p, &muap, retval);
347 }
348
349 int
350 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
351 {
352 struct componentname cn;
353 vfs_context_t ctx = vfs_context_current();
354 size_t dummy = 0;
355 int error;
356 int flags = uap->flags;
357 char fstypename[MFSNAMELEN];
358 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
359 vnode_t pvp;
360 vnode_t vp;
361
362 AUDIT_ARG(fd, uap->fd);
363 AUDIT_ARG(fflags, flags);
364 /* fstypename will get audited by mount_common */
365
366 /* Sanity check the flags */
367 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
368 return ENOTSUP;
369 }
370
371 if (flags & MNT_UNION) {
372 return EPERM;
373 }
374
375 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
376 if (error) {
377 return error;
378 }
379
380 if ((error = file_vnode(uap->fd, &vp)) != 0) {
381 return error;
382 }
383
384 if ((error = vnode_getwithref(vp)) != 0) {
385 file_drop(uap->fd);
386 return error;
387 }
388
389 pvp = vnode_getparent(vp);
390 if (pvp == NULL) {
391 vnode_put(vp);
392 file_drop(uap->fd);
393 return EINVAL;
394 }
395
396 memset(&cn, 0, sizeof(struct componentname));
397 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
398 cn.cn_pnlen = MAXPATHLEN;
399
400 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
401 FREE(cn.cn_pnbuf, M_TEMP);
402 vnode_put(pvp);
403 vnode_put(vp);
404 file_drop(uap->fd);
405 return error;
406 }
407
408 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
409
410 FREE(cn.cn_pnbuf, M_TEMP);
411 vnode_put(pvp);
412 vnode_put(vp);
413 file_drop(uap->fd);
414
415 return error;
416 }
417
/*
 * vfs_notify_mount
 *
 * Announce a completed mount: broadcast a VQ_MOUNT vfs event (no specific
 * mount attached) and post a NOTE_WRITE knote on the parent directory of
 * the covered vnode so watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
424
425 /*
426 * __mac_mount:
427 * Mount a file system taking into account MAC label behavior.
428 * See mount(2) man page for more information
429 *
430 * Parameters: p Process requesting the mount
431 * uap User argument descriptor (see below)
432 * retval (ignored)
433 *
434 * Indirect: uap->type Filesystem type
435 * uap->path Path to mount
436 * uap->data Mount arguments
437 * uap->mac_p MAC info
438 * uap->flags Mount flags
439 *
440 *
441 * Returns: 0 Success
442 * !0 Not success
443 */
444 boolean_t root_fs_upgrade_try = FALSE;
445
int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered.  WANTPARENT gives us iocounts on both
	 * vp and pvp; both are released at "out:".
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	/*
	 * NOTE(review): exact-equality test — the imgsrc path is taken only
	 * when MNT_IMGSRC_BY_INDEX is the sole flag set, so the by_index
	 * argument below is always TRUE on this path.
	 */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.  The user_mac
	 * struct layout differs between 32- and 64-bit processes, so copy in
	 * the matching variant and normalize into 'mac'.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = mac64.m_buflen;
			mac.m_string = mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* Bound the label buffer; < 2 cannot hold even "" + NUL sanely */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if SECURE_KERNEL
	if (flags & MNT_UNION) {
		/* No union mounts on release kernels */
		error = EPERM;
		goto out;
	}
#endif

	/* Mounting over the root of the root filesystem */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		if (!(flags & MNT_UNION)) {
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif
		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, FALSE, ctx);

out:

#if CONFIG_MACF
	if (labelstr) {
		FREE(labelstr, M_MACTEMP);
	}
#endif /* CONFIG_MACF */

	/* Drop the iocounts/state acquired by namei(), if any */
	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
599
600 /*
601 * common mount implementation (final stage of mounting)
602 *
603 * Arguments:
604 * fstypename file system type (ie it's vfs name)
605 * pvp parent of covered vnode
606 * vp covered vnode
607 * cnp component name (ie path) of covered vnode
608 * flags generic mount flags
609 * fsmountargs file system specific data
610 * labelstr optional MAC label
611 * kernelmount TRUE for mounts initiated from inside the kernel
612 * ctx caller's context
613 */
614 static int
615 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
616 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
617 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
618 {
619 #if !CONFIG_MACF
620 #pragma unused(labelstr)
621 #endif
622 struct vnode *devvp = NULLVP;
623 struct vnode *device_vnode = NULLVP;
624 #if CONFIG_MACF
625 struct vnode *rvp;
626 #endif
627 struct mount *mp;
628 struct vfstable *vfsp = (struct vfstable *)0;
629 struct proc *p = vfs_context_proc(ctx);
630 int error, flag = 0;
631 user_addr_t devpath = USER_ADDR_NULL;
632 int ronly = 0;
633 int mntalloc = 0;
634 boolean_t vfsp_ref = FALSE;
635 boolean_t is_rwlock_locked = FALSE;
636 boolean_t did_rele = FALSE;
637 boolean_t have_usecount = FALSE;
638
639 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM
640 /* Check for mutually-exclusive flag bits */
641 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL));
642 int bitcount = 0;
643 while (checkflags != 0) {
644 checkflags &= (checkflags - 1);
645 bitcount++;
646 }
647
648 if (bitcount > 1) {
649 //not allowed to request multiple mount-by-role flags
650 error = EINVAL;
651 goto out1;
652 }
653 #endif
654
655 /*
656 * Process an update for an existing mount
657 */
658 if (flags & MNT_UPDATE) {
659 if ((vp->v_flag & VROOT) == 0) {
660 error = EINVAL;
661 goto out1;
662 }
663 mp = vp->v_mount;
664
665 /* unmount in progress return error */
666 mount_lock_spin(mp);
667 if (mp->mnt_lflag & MNT_LUNMOUNT) {
668 mount_unlock(mp);
669 error = EBUSY;
670 goto out1;
671 }
672 mount_unlock(mp);
673 lck_rw_lock_exclusive(&mp->mnt_rwlock);
674 is_rwlock_locked = TRUE;
675 /*
676 * We only allow the filesystem to be reloaded if it
677 * is currently mounted read-only.
678 */
679 if ((flags & MNT_RELOAD) &&
680 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
681 error = ENOTSUP;
682 goto out1;
683 }
684
685 /*
686 * If content protection is enabled, update mounts are not
687 * allowed to turn it off.
688 */
689 if ((mp->mnt_flag & MNT_CPROTECT) &&
690 ((flags & MNT_CPROTECT) == 0)) {
691 error = EINVAL;
692 goto out1;
693 }
694
695 /*
696 * can't turn off MNT_REMOVABLE either but it may be an unexpected
697 * failure to return an error for this so we'll just silently
698 * add it if it is not passed in.
699 */
700 if ((mp->mnt_flag & MNT_REMOVABLE) &&
701 ((flags & MNT_REMOVABLE) == 0)) {
702 flags |= MNT_REMOVABLE;
703 }
704
705 #ifdef CONFIG_IMGSRC_ACCESS
706 /* Can't downgrade the backer of the root FS */
707 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
708 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
709 error = ENOTSUP;
710 goto out1;
711 }
712 #endif /* CONFIG_IMGSRC_ACCESS */
713
714 /*
715 * Only root, or the user that did the original mount is
716 * permitted to update it.
717 */
718 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
719 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
720 goto out1;
721 }
722 #if CONFIG_MACF
723 error = mac_mount_check_remount(ctx, mp);
724 if (error != 0) {
725 goto out1;
726 }
727 #endif
728 /*
729 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
730 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
731 */
732 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
733 flags |= MNT_NOSUID | MNT_NODEV;
734 if (mp->mnt_flag & MNT_NOEXEC) {
735 flags |= MNT_NOEXEC;
736 }
737 }
738 flag = mp->mnt_flag;
739
740
741
742 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
743
744 vfsp = mp->mnt_vtable;
745 goto update;
746 } // MNT_UPDATE
747
748 /*
749 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
750 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
751 */
752 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
753 flags |= MNT_NOSUID | MNT_NODEV;
754 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
755 flags |= MNT_NOEXEC;
756 }
757 }
758
759 /* XXXAUDIT: Should we capture the type on the error path as well? */
760 AUDIT_ARG(text, fstypename);
761 mount_list_lock();
762 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
763 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
764 vfsp->vfc_refcount++;
765 vfsp_ref = TRUE;
766 break;
767 }
768 }
769 mount_list_unlock();
770 if (vfsp == NULL) {
771 error = ENODEV;
772 goto out1;
773 }
774
775 /*
776 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
777 * except in ROSV configs.
778 */
779 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
780 ((internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)) == 0)) {
781 error = EINVAL; /* unsupported request */
782 goto out1;
783 }
784
785 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
786 if (error != 0) {
787 goto out1;
788 }
789
790 /*
791 * Allocate and initialize the filesystem (mount_t)
792 */
793 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
794 M_MOUNT, M_WAITOK);
795 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
796 mntalloc = 1;
797
798 /* Initialize the default IO constraints */
799 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
800 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
801 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
802 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
803 mp->mnt_devblocksize = DEV_BSIZE;
804 mp->mnt_alignmentmask = PAGE_MASK;
805 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
806 mp->mnt_ioscale = 1;
807 mp->mnt_ioflags = 0;
808 mp->mnt_realrootvp = NULLVP;
809 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
810
811 TAILQ_INIT(&mp->mnt_vnodelist);
812 TAILQ_INIT(&mp->mnt_workerqueue);
813 TAILQ_INIT(&mp->mnt_newvnodes);
814 mount_lock_init(mp);
815 lck_rw_lock_exclusive(&mp->mnt_rwlock);
816 is_rwlock_locked = TRUE;
817 mp->mnt_op = vfsp->vfc_vfsops;
818 mp->mnt_vtable = vfsp;
819 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
820 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
821 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
822 do {
823 int pathlen = MAXPATHLEN;
824
825 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
826 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
827 }
828 } while (0);
829 mp->mnt_vnodecovered = vp;
830 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
831 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
832 mp->mnt_devbsdunit = 0;
833
834 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
835 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
836
837 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
838 if (kernelmount) {
839 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
840 }
841 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
842 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
843 }
844 #endif /* CONFIG_NFS_CLIENT || DEVFS */
845
846 update:
847
848 /*
849 * Set the mount level flags.
850 */
851 if (flags & MNT_RDONLY) {
852 mp->mnt_flag |= MNT_RDONLY;
853 } else if (mp->mnt_flag & MNT_RDONLY) {
854 // disallow read/write upgrades of file systems that
855 // had the TYPENAME_OVERRIDE feature set.
856 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
857 error = EPERM;
858 goto out1;
859 }
860 mp->mnt_kern_flag |= MNTK_WANTRDWR;
861 }
862 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
863 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
864 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
865 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
866 MNT_QUARANTINE | MNT_CPROTECT);
867
868 #if SECURE_KERNEL
869 #if !CONFIG_MNT_SUID
870 /*
871 * On release builds of iOS based platforms, always enforce NOSUID on
872 * all mounts. We do this here because we can catch update mounts as well as
873 * non-update mounts in this case.
874 */
875 mp->mnt_flag |= (MNT_NOSUID);
876 #endif
877 #endif
878
879 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
880 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
881 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
882 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
883 MNT_QUARANTINE | MNT_CPROTECT);
884
885 #if CONFIG_MACF
886 if (flags & MNT_MULTILABEL) {
887 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
888 error = EINVAL;
889 goto out1;
890 }
891 mp->mnt_flag |= MNT_MULTILABEL;
892 }
893 #endif
894 /*
895 * Process device path for local file systems if requested
896 */
897 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
898 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL))) {
899 //snapshot, vm, datavolume mounts are special
900 if (vfs_context_is64bit(ctx)) {
901 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
902 goto out1;
903 }
904 fsmountargs += sizeof(devpath);
905 } else {
906 user32_addr_t tmp;
907 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
908 goto out1;
909 }
910 /* munge into LP64 addr */
911 devpath = CAST_USER_ADDR_T(tmp);
912 fsmountargs += sizeof(tmp);
913 }
914
915 /* Lookup device and authorize access to it */
916 if ((devpath)) {
917 struct nameidata nd;
918
919 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
920 if ((error = namei(&nd))) {
921 goto out1;
922 }
923
924 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
925 devvp = nd.ni_vp;
926
927 nameidone(&nd);
928
929 if (devvp->v_type != VBLK) {
930 error = ENOTBLK;
931 goto out2;
932 }
933 if (major(devvp->v_rdev) >= nblkdev) {
934 error = ENXIO;
935 goto out2;
936 }
937 /*
938 * If mount by non-root, then verify that user has necessary
939 * permissions on the device.
940 */
941 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
942 mode_t accessmode = KAUTH_VNODE_READ_DATA;
943
944 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
945 accessmode |= KAUTH_VNODE_WRITE_DATA;
946 }
947 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
948 goto out2;
949 }
950 }
951 }
952 /* On first mount, preflight and open device */
953 if (devpath && ((flags & MNT_UPDATE) == 0)) {
954 if ((error = vnode_ref(devvp))) {
955 goto out2;
956 }
957 /*
958 * Disallow multiple mounts of the same device.
959 * Disallow mounting of a device that is currently in use
960 * (except for root, which might share swap device for miniroot).
961 * Flush out any old buffers remaining from a previous use.
962 */
963 if ((error = vfs_mountedon(devvp))) {
964 goto out3;
965 }
966
967 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
968 error = EBUSY;
969 goto out3;
970 }
971 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
972 error = ENOTBLK;
973 goto out3;
974 }
975 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
976 goto out3;
977 }
978
979 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
980 #if CONFIG_MACF
981 error = mac_vnode_check_open(ctx,
982 devvp,
983 ronly ? FREAD : FREAD | FWRITE);
984 if (error) {
985 goto out3;
986 }
987 #endif /* MAC */
988 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
989 goto out3;
990 }
991
992 mp->mnt_devvp = devvp;
993 device_vnode = devvp;
994 } else if ((mp->mnt_flag & MNT_RDONLY) &&
995 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
996 (device_vnode = mp->mnt_devvp)) {
997 dev_t dev;
998 int maj;
999 /*
1000 * If upgrade to read-write by non-root, then verify
1001 * that user has necessary permissions on the device.
1002 */
1003 vnode_getalways(device_vnode);
1004
1005 if (suser(vfs_context_ucred(ctx), NULL) &&
1006 (error = vnode_authorize(device_vnode, NULL,
1007 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1008 ctx)) != 0) {
1009 vnode_put(device_vnode);
1010 goto out2;
1011 }
1012
1013 /* Tell the device that we're upgrading */
1014 dev = (dev_t)device_vnode->v_rdev;
1015 maj = major(dev);
1016
1017 if ((u_int)maj >= (u_int)nblkdev) {
1018 panic("Volume mounted on a device with invalid major number.");
1019 }
1020
1021 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1022 vnode_put(device_vnode);
1023 device_vnode = NULLVP;
1024 if (error != 0) {
1025 goto out2;
1026 }
1027 }
1028 } // localargs && !(snapshot | data | vm)
1029
1030 #if CONFIG_MACF
1031 if ((flags & MNT_UPDATE) == 0) {
1032 mac_mount_label_init(mp);
1033 mac_mount_label_associate(ctx, mp);
1034 }
1035 if (labelstr) {
1036 if ((flags & MNT_UPDATE) != 0) {
1037 error = mac_mount_check_label_update(ctx, mp);
1038 if (error != 0) {
1039 goto out3;
1040 }
1041 }
1042 }
1043 #endif
1044 /*
1045 * Mount the filesystem. We already asserted that internal_flags
1046 * cannot have more than one mount-by-role bit set.
1047 */
1048 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1049 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1050 (caddr_t)fsmountargs, 0, ctx);
1051 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1052 #if CONFIG_ROSV_STARTUP
1053 struct mount *origin_mp = (struct mount*)fsmountargs;
1054 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1055 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1056 if (error) {
1057 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1058 } else {
1059 /* Mark volume associated with system volume */
1060 mp->mnt_kern_flag |= MNTK_SYSTEM;
1061
1062 /* Attempt to acquire the mnt_devvp and set it up */
1063 struct vnode *mp_devvp = NULL;
1064 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1065 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1066 0, &mp_devvp, vfs_context_kernel());
1067 if (!lerr) {
1068 mp->mnt_devvp = mp_devvp;
1069 //vnode_lookup took an iocount, need to drop it.
1070 vnode_put(mp_devvp);
1071 // now set `device_vnode` to the devvp that was acquired.
1072 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1073 // note that though the iocount above was dropped, the mount acquires
1074 // an implicit reference against the device.
1075 device_vnode = mp_devvp;
1076 }
1077 }
1078 }
1079 #else
1080 error = EINVAL;
1081 #endif
1082 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1083 #if CONFIG_MOUNT_VM
1084 struct mount *origin_mp = (struct mount*)fsmountargs;
1085 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1086 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1087 if (error) {
1088 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1089 } else {
1090 /* Mark volume associated with system volume and a swap mount */
1091 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1092 /* Attempt to acquire the mnt_devvp and set it up */
1093 struct vnode *mp_devvp = NULL;
1094 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1095 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1096 0, &mp_devvp, vfs_context_kernel());
1097 if (!lerr) {
1098 mp->mnt_devvp = mp_devvp;
1099 //vnode_lookup took an iocount, need to drop it.
1100 vnode_put(mp_devvp);
1101
1102 // now set `device_vnode` to the devvp that was acquired.
1103 // note that though the iocount above was dropped, the mount acquires
1104 // an implicit reference against the device.
1105 device_vnode = mp_devvp;
1106 }
1107 }
1108 }
1109 #else
1110 error = EINVAL;
1111 #endif
1112 } else {
1113 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1114 }
1115
1116 if (flags & MNT_UPDATE) {
1117 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1118 mp->mnt_flag &= ~MNT_RDONLY;
1119 }
1120 mp->mnt_flag &= ~
1121 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1122 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1123 if (error) {
1124 mp->mnt_flag = flag; /* restore flag value */
1125 }
1126 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1127 lck_rw_done(&mp->mnt_rwlock);
1128 is_rwlock_locked = FALSE;
1129 if (!error) {
1130 enablequotas(mp, ctx);
1131 }
1132 goto exit;
1133 }
1134
1135 /*
1136 * Put the new filesystem on the mount list after root.
1137 */
1138 if (error == 0) {
1139 struct vfs_attr vfsattr;
1140 #if CONFIG_MACF
1141 error = mac_mount_check_mount_late(ctx, mp);
1142 if (error != 0) {
1143 goto out3;
1144 }
1145
1146 if (vfs_flags(mp) & MNT_MULTILABEL) {
1147 error = VFS_ROOT(mp, &rvp, ctx);
1148 if (error) {
1149 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1150 goto out3;
1151 }
1152 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1153 /*
1154 * drop reference provided by VFS_ROOT
1155 */
1156 vnode_put(rvp);
1157
1158 if (error) {
1159 goto out3;
1160 }
1161 }
1162 #endif /* MAC */
1163
1164 vnode_lock_spin(vp);
1165 CLR(vp->v_flag, VMOUNT);
1166 vp->v_mountedhere = mp;
1167 vnode_unlock(vp);
1168
1169 /*
1170 * taking the name_cache_lock exclusively will
1171 * insure that everyone is out of the fast path who
1172 * might be trying to use a now stale copy of
1173 * vp->v_mountedhere->mnt_realrootvp
1174 * bumping mount_generation causes the cached values
1175 * to be invalidated
1176 */
1177 name_cache_lock();
1178 mount_generation++;
1179 name_cache_unlock();
1180
1181 error = vnode_ref(vp);
1182 if (error != 0) {
1183 goto out4;
1184 }
1185
1186 have_usecount = TRUE;
1187
1188 error = checkdirs(vp, ctx);
1189 if (error != 0) {
1190 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1191 goto out4;
1192 }
1193 /*
1194 * there is no cleanup code here so I have made it void
1195 * we need to revisit this
1196 */
1197 (void)VFS_START(mp, 0, ctx);
1198
1199 if (mount_list_add(mp) != 0) {
1200 /*
1201 * The system is shutting down trying to umount
1202 * everything, so fail with a plausible errno.
1203 */
1204 error = EBUSY;
1205 goto out4;
1206 }
1207 lck_rw_done(&mp->mnt_rwlock);
1208 is_rwlock_locked = FALSE;
1209
1210 /* Check if this mounted file system supports EAs or named streams. */
1211 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1212 VFSATTR_INIT(&vfsattr);
1213 VFSATTR_WANTED(&vfsattr, f_capabilities);
1214 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1215 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1216 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1217 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1218 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1219 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1220 }
1221 #if NAMEDSTREAMS
1222 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1223 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1224 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1225 }
1226 #endif
1227 /* Check if this file system supports path from id lookups. */
1228 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1229 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1230 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1231 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1232 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1233 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1234 }
1235
1236 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1237 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1238 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1239 }
1240 }
1241 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1242 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1243 }
1244 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1245 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1246 }
1247 /* increment the operations count */
1248 OSAddAtomic(1, &vfs_nummntops);
1249 enablequotas(mp, ctx);
1250
1251 if (device_vnode) {
1252 device_vnode->v_specflags |= SI_MOUNTEDON;
1253
1254 /*
1255 * cache the IO attributes for the underlying physical media...
1256 * an error return indicates the underlying driver doesn't
1257 * support all the queries necessary... however, reasonable
1258 * defaults will have been set, so no reason to bail or care
1259 */
1260 vfs_init_io_attributes(device_vnode, mp);
1261 }
1262
1263 /* Now that mount is setup, notify the listeners */
1264 vfs_notify_mount(pvp);
1265 IOBSDMountChange(mp, kIOMountChangeMount);
1266 } else {
1267 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1268 if (mp->mnt_vnodelist.tqh_first != NULL) {
1269 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1270 mp->mnt_vtable->vfc_name, error);
1271 }
1272
1273 vnode_lock_spin(vp);
1274 CLR(vp->v_flag, VMOUNT);
1275 vnode_unlock(vp);
1276 mount_list_lock();
1277 mp->mnt_vtable->vfc_refcount--;
1278 mount_list_unlock();
1279
1280 if (device_vnode) {
1281 vnode_rele(device_vnode);
1282 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1283 }
1284 lck_rw_done(&mp->mnt_rwlock);
1285 is_rwlock_locked = FALSE;
1286
1287 /*
1288 * if we get here, we have a mount structure that needs to be freed,
1289 * but since the coveredvp hasn't yet been updated to point at it,
1290 * no need to worry about other threads holding a crossref on this mp
1291 * so it's ok to just free it
1292 */
1293 mount_lock_destroy(mp);
1294 #if CONFIG_MACF
1295 mac_mount_label_destroy(mp);
1296 #endif
1297 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1298 }
1299 exit:
1300 /*
1301 * drop I/O count on the device vp if there was one
1302 */
1303 if (devpath && devvp) {
1304 vnode_put(devvp);
1305 }
1306
1307 return error;
1308
1309 /* Error condition exits */
1310 out4:
1311 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1312
1313 /*
1314 * If the mount has been placed on the covered vp,
1315 * it may have been discovered by now, so we have
1316 * to treat this just like an unmount
1317 */
1318 mount_lock_spin(mp);
1319 mp->mnt_lflag |= MNT_LDEAD;
1320 mount_unlock(mp);
1321
1322 if (device_vnode != NULLVP) {
1323 vnode_rele(device_vnode);
1324 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1325 ctx);
1326 did_rele = TRUE;
1327 }
1328
1329 vnode_lock_spin(vp);
1330
1331 mp->mnt_crossref++;
1332 vp->v_mountedhere = (mount_t) 0;
1333
1334 vnode_unlock(vp);
1335
1336 if (have_usecount) {
1337 vnode_rele(vp);
1338 }
1339 out3:
1340 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1341 vnode_rele(devvp);
1342 }
1343 out2:
1344 if (devpath && devvp) {
1345 vnode_put(devvp);
1346 }
1347 out1:
1348 /* Release mnt_rwlock only when it was taken */
1349 if (is_rwlock_locked == TRUE) {
1350 lck_rw_done(&mp->mnt_rwlock);
1351 }
1352
1353 if (mntalloc) {
1354 if (mp->mnt_crossref) {
1355 mount_dropcrossref(mp, vp, 0);
1356 } else {
1357 mount_lock_destroy(mp);
1358 #if CONFIG_MACF
1359 mac_mount_label_destroy(mp);
1360 #endif
1361 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1362 }
1363 }
1364 if (vfsp_ref) {
1365 mount_list_lock();
1366 vfsp->vfc_refcount--;
1367 mount_list_unlock();
1368 }
1369
1370 return error;
1371 }
1372
/*
 * Prepare `vp' to serve as the covered vnode for an upcoming mount:
 * flush its in-core data, check for competing mount attempts,
 * and set VMOUNT so that concurrent mount attempts back off.
 *
 * Returns 0 with VMOUNT set on success, or an errno.  On success the
 * caller is responsible for clearing VMOUNT if the mount does not
 * ultimately complete.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 * (Any vnode_getattr() failure is deliberately folded into
		 * EPERM below.)
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push any dirty data to disk before the mount covers this vnode */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* ... and toss any buffers still cached against it */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories may be covered by a mount */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/* Another mount attempt is in progress, or a mount is already here */
	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		goto out;
	}
#endif

	/* Claim the vnode for this mount attempt */
	vnode_lock_spin(vp);
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

out:
	return error;
}
1434
1435 #if CONFIG_IMGSRC_ACCESS
1436
1437 #define DEBUG_IMGSRC 0
1438
1439 #if DEBUG_IMGSRC
1440 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1441 #else
1442 #define IMGSRC_DEBUG(args...) do { } while(0)
1443 #endif
1444
/*
 * For an imageboot relocation: resolve `devpath', verify that it names
 * the same block device that currently backs `mp', check the caller's
 * access rights to it, and rewrite the mount's f_mntfromname to the
 * resolved path.
 *
 * On success, *devvpp holds the device vnode with an iocount that the
 * caller must drop via vnode_put().  On failure the iocount taken by
 * namei() is dropped here.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	mode_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* A kernel-originated path lives in system space */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	/* Take an iocount on the mount's current device vnode */
	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must refer to the very same device */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	/* Record the user-visible "mounted from" name */
	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: hand the iocounted device vnode to the caller */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On failure, drop the iocount namei() took on vp */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
1522
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Attach the mount to the covered vnode; VMOUNT is no longer needed */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the life of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/*
	 * NOTE(review): on failure, v_mountedhere is left pointing at mp
	 * here (only mnt_vnodecovered is cleared); the caller appears to
	 * be responsible for further cleanup — verify against callers.
	 */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1572
1573 static void
1574 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1575 {
1576 vnode_rele(vp);
1577 vnode_lock_spin(vp);
1578 vp->v_mountedhere = (mount_t)NULL;
1579 vnode_unlock(vp);
1580
1581 mp->mnt_vnodecovered = NULLVP;
1582 }
1583
1584 static int
1585 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1586 {
1587 int error;
1588
1589 /* unmount in progress return error */
1590 mount_lock_spin(mp);
1591 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1592 mount_unlock(mp);
1593 return EBUSY;
1594 }
1595 mount_unlock(mp);
1596 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1597
1598 /*
1599 * We only allow the filesystem to be reloaded if it
1600 * is currently mounted read-only.
1601 */
1602 if ((flags & MNT_RELOAD) &&
1603 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1604 error = ENOTSUP;
1605 goto out;
1606 }
1607
1608 /*
1609 * Only root, or the user that did the original mount is
1610 * permitted to update it.
1611 */
1612 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1613 (!vfs_context_issuser(ctx))) {
1614 error = EPERM;
1615 goto out;
1616 }
1617 #if CONFIG_MACF
1618 error = mac_mount_check_remount(ctx, mp);
1619 if (error != 0) {
1620 goto out;
1621 }
1622 #endif
1623
1624 out:
1625 if (error) {
1626 lck_rw_done(&mp->mnt_rwlock);
1627 }
1628
1629 return error;
1630 }
1631
/*
 * Counterpart to a successful mount_begin_update(): release the
 * exclusive mount rwlock.
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1637
1638 static int
1639 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1640 {
1641 vnode_t vp;
1642
1643 if (height >= MAX_IMAGEBOOT_NESTING) {
1644 return EINVAL;
1645 }
1646
1647 vp = imgsrc_rootvnodes[height];
1648 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1649 *rvpp = vp;
1650 return 0;
1651 } else {
1652 return ENOENT;
1653 }
1654 }
1655
1656 static int
1657 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1658 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1659 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1660 {
1661 int error;
1662 mount_t mp;
1663 boolean_t placed = FALSE;
1664 struct vfstable *vfsp;
1665 user_addr_t devpath;
1666 char *old_mntonname;
1667 vnode_t rvp;
1668 vnode_t devvp;
1669 uint32_t height;
1670 uint32_t flags;
1671
1672 /* If we didn't imageboot, nothing to move */
1673 if (imgsrc_rootvnodes[0] == NULLVP) {
1674 return EINVAL;
1675 }
1676
1677 /* Only root can do this */
1678 if (!vfs_context_issuser(ctx)) {
1679 return EPERM;
1680 }
1681
1682 IMGSRC_DEBUG("looking for root vnode.\n");
1683
1684 /*
1685 * Get root vnode of filesystem we're moving.
1686 */
1687 if (by_index) {
1688 if (is64bit) {
1689 struct user64_mnt_imgsrc_args mia64;
1690 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1691 if (error != 0) {
1692 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1693 return error;
1694 }
1695
1696 height = mia64.mi_height;
1697 flags = mia64.mi_flags;
1698 devpath = mia64.mi_devpath;
1699 } else {
1700 struct user32_mnt_imgsrc_args mia32;
1701 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1702 if (error != 0) {
1703 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1704 return error;
1705 }
1706
1707 height = mia32.mi_height;
1708 flags = mia32.mi_flags;
1709 devpath = mia32.mi_devpath;
1710 }
1711 } else {
1712 /*
1713 * For binary compatibility--assumes one level of nesting.
1714 */
1715 if (is64bit) {
1716 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1717 return error;
1718 }
1719 } else {
1720 user32_addr_t tmp;
1721 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1722 return error;
1723 }
1724
1725 /* munge into LP64 addr */
1726 devpath = CAST_USER_ADDR_T(tmp);
1727 }
1728
1729 height = 0;
1730 flags = 0;
1731 }
1732
1733 if (flags != 0) {
1734 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1735 return EINVAL;
1736 }
1737
1738 error = get_imgsrc_rootvnode(height, &rvp);
1739 if (error != 0) {
1740 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1741 return error;
1742 }
1743
1744 IMGSRC_DEBUG("got old root vnode\n");
1745
1746 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1747
1748 /* Can only move once */
1749 mp = vnode_mount(rvp);
1750 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1751 IMGSRC_DEBUG("Already moved.\n");
1752 error = EBUSY;
1753 goto out0;
1754 }
1755
1756 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1757 IMGSRC_DEBUG("Starting updated.\n");
1758
1759 /* Get exclusive rwlock on mount, authorize update on mp */
1760 error = mount_begin_update(mp, ctx, 0);
1761 if (error != 0) {
1762 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1763 goto out0;
1764 }
1765
1766 /*
1767 * It can only be moved once. Flag is set under the rwlock,
1768 * so we're now safe to proceed.
1769 */
1770 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1771 IMGSRC_DEBUG("Already moved [2]\n");
1772 goto out1;
1773 }
1774
1775 IMGSRC_DEBUG("Preparing coveredvp.\n");
1776
1777 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1778 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1779 if (error != 0) {
1780 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1781 goto out1;
1782 }
1783
1784 IMGSRC_DEBUG("Covered vp OK.\n");
1785
1786 /* Sanity check the name caller has provided */
1787 vfsp = mp->mnt_vtable;
1788 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1789 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1790 vfsp->vfc_name, fsname);
1791 error = EINVAL;
1792 goto out2;
1793 }
1794
1795 /* Check the device vnode and update mount-from name, for local filesystems */
1796 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1797 IMGSRC_DEBUG("Local, doing device validation.\n");
1798
1799 if (devpath != USER_ADDR_NULL) {
1800 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1801 if (error) {
1802 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1803 goto out2;
1804 }
1805
1806 vnode_put(devvp);
1807 }
1808 }
1809
1810 /*
1811 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1812 * and increment the name cache's mount generation
1813 */
1814
1815 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1816 error = place_mount_and_checkdirs(mp, vp, ctx);
1817 if (error != 0) {
1818 goto out2;
1819 }
1820
1821 placed = TRUE;
1822
1823 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1824 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1825
1826 /* Forbid future moves */
1827 mount_lock(mp);
1828 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1829 mount_unlock(mp);
1830
1831 /* Finally, add to mount list, completely ready to go */
1832 if (mount_list_add(mp) != 0) {
1833 /*
1834 * The system is shutting down trying to umount
1835 * everything, so fail with a plausible errno.
1836 */
1837 error = EBUSY;
1838 goto out3;
1839 }
1840
1841 mount_end_update(mp);
1842 vnode_put(rvp);
1843 FREE(old_mntonname, M_TEMP);
1844
1845 vfs_notify_mount(pvp);
1846
1847 return 0;
1848 out3:
1849 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1850
1851 mount_lock(mp);
1852 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1853 mount_unlock(mp);
1854
1855 out2:
1856 /*
1857 * Placing the mp on the vnode clears VMOUNT,
1858 * so cleanup is different after that point
1859 */
1860 if (placed) {
1861 /* Rele the vp, clear VMOUNT and v_mountedhere */
1862 undo_place_on_covered_vp(mp, vp);
1863 } else {
1864 vnode_lock_spin(vp);
1865 CLR(vp->v_flag, VMOUNT);
1866 vnode_unlock(vp);
1867 }
1868 out1:
1869 mount_end_update(mp);
1870
1871 out0:
1872 vnode_put(rvp);
1873 FREE(old_mntonname, M_TEMP);
1874 return error;
1875 }
1876
1877 #if CONFIG_LOCKERBOOT
/*
 * Mount the locker protoboot volume by relocating the imageboot source
 * onto `mntpoint'.  Builds a user64_mnt_imgsrc_args in kernel memory
 * and hands it to relocate_imageboot_source() under the kernel context
 * (NOTE: assumes copyin() in that path reads kernel-resident memory
 * when running under the kernel context — verify).
 */
__private_extern__
int
mount_locker_protoboot(const char *fsname, const char *mntpoint,
    const char *pbdevpath)
{
	int error = -1;
	struct nameidata nd;
	boolean_t cleanup_nd = FALSE;
	vfs_context_t ctx = vfs_context_kernel();
	boolean_t is64 = TRUE;
	boolean_t by_index = TRUE;
	struct user64_mnt_imgsrc_args mia64 = {
		.mi_height = 0,
		.mi_flags = 0,
		.mi_devpath = CAST_USER_ADDR_T(pbdevpath),
	};
	user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);

	/* Resolve the mount point; WANTPARENT also yields ni_dvp */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
	error = namei(&nd);
	if (error) {
		IMGSRC_DEBUG("namei: %d\n", error);
		goto out;
	}

	cleanup_nd = TRUE;
	error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
	    &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);

out:
	if (cleanup_nd) {
		/* Preserve the relocation result across the vnode_put() calls */
		int stashed = error;

		error = vnode_put(nd.ni_vp);
		if (error) {
			panic("vnode_put() returned non-zero: %d", error);
		}

		if (nd.ni_dvp) {
			error = vnode_put(nd.ni_dvp);
			if (error) {
				panic("vnode_put() returned non-zero: %d", error);
			}
		}
		nameidone(&nd);

		error = stashed;
	}
	return error;
}
1929 #endif /* CONFIG_LOCKERBOOT */
1930 #endif /* CONFIG_IMGSRC_ACCESS */
1931
1932 void
1933 enablequotas(struct mount *mp, vfs_context_t ctx)
1934 {
1935 struct nameidata qnd;
1936 int type;
1937 char qfpath[MAXPATHLEN];
1938 const char *qfname = QUOTAFILENAME;
1939 const char *qfopsname = QUOTAOPSNAME;
1940 const char *qfextension[] = INITQFNAMES;
1941
1942 /* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
1943 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
1944 return;
1945 }
1946 /*
1947 * Enable filesystem disk quotas if necessary.
1948 * We ignore errors as this should not interfere with final mount
1949 */
1950 for (type = 0; type < MAXQUOTAS; type++) {
1951 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1952 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1953 CAST_USER_ADDR_T(qfpath), ctx);
1954 if (namei(&qnd) != 0) {
1955 continue; /* option file to trigger quotas is not present */
1956 }
1957 vnode_put(qnd.ni_vp);
1958 nameidone(&qnd);
1959 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1960
1961 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1962 }
1963 return;
1964 }
1965
1966
/*
 * proc_iterate() callback for checkdirs(): if process `p' has its
 * current or root directory set to `olddp' (the vnode just covered by
 * a new mount), redirect it to `newdp' (the root of that new mount),
 * transferring vnode usecounts accordingly.  Always returns
 * PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs * cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp == NULL ||
	    (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp)) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		/* second ref failed; give back the first */
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp != NULL) {
		if (fdp->fd_cdir == olddp) {
			old_cvp = olddp;
			fdp->fd_cdir = newdp;
			new_cvp = NULL; /* this ref is now owned by fd_cdir */
		}
		if (fdp->fd_rdir == olddp) {
			old_rvp = olddp;
			fdp->fd_rdir = newdp;
			new_rvp = NULL; /* this ref is now owned by fd_rdir */
		}
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2051
2052
2053
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Nobody else holds a usecount on the covered vnode: nothing to fix up */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root of the freshly mounted file system (returns an iocount) */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If the system root itself was covered, repoint rootvnode at the new root */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		tvp = rootvnode;
		rootvnode = newdp;
		vnode_rele(tvp);
	}

	vnode_put(newdp);
	return 0;
}
2094
2095 /*
2096 * Unmount a file system.
2097 *
2098 * Note: unmount takes a path to the vnode mounted on as argument,
2099 * not special file (as before).
2100 */
2101 /* ARGSUSED */
2102 int
2103 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2104 {
2105 vnode_t vp;
2106 struct mount *mp;
2107 int error;
2108 struct nameidata nd;
2109 vfs_context_t ctx = vfs_context_current();
2110
2111 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2112 UIO_USERSPACE, uap->path, ctx);
2113 error = namei(&nd);
2114 if (error) {
2115 return error;
2116 }
2117 vp = nd.ni_vp;
2118 mp = vp->v_mount;
2119 nameidone(&nd);
2120
2121 #if CONFIG_MACF
2122 error = mac_mount_check_umount(ctx, mp);
2123 if (error != 0) {
2124 vnode_put(vp);
2125 return error;
2126 }
2127 #endif
2128 /*
2129 * Must be the root of the filesystem
2130 */
2131 if ((vp->v_flag & VROOT) == 0) {
2132 vnode_put(vp);
2133 return EINVAL;
2134 }
2135 mount_ref(mp, 0);
2136 vnode_put(vp);
2137 /* safedounmount consumes the mount ref */
2138 return safedounmount(mp, uap->flags, ctx);
2139 }
2140
2141 int
2142 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2143 {
2144 mount_t mp;
2145
2146 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2147 if (mp == (mount_t)0) {
2148 return ENOENT;
2149 }
2150 mount_ref(mp, 0);
2151 mount_iterdrop(mp);
2152 /* safedounmount consumes the mount ref */
2153 return safedounmount(mp, flags, ctx);
2154 }
2155
2156
2157 /*
2158 * The mount struct comes with a mount ref which will be consumed.
2159 * Do the actual file system unmount, prevent some common foot shooting.
2160 */
2161 int
2162 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2163 {
2164 int error;
2165 proc_t p = vfs_context_proc(ctx);
2166
2167 /*
2168 * If the file system is not responding and MNT_NOBLOCK
2169 * is set and not a forced unmount then return EBUSY.
2170 */
2171 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2172 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2173 error = EBUSY;
2174 goto out;
2175 }
2176
2177 /*
2178 * Skip authorization if the mount is tagged as permissive and
2179 * this is not a forced-unmount attempt.
2180 */
2181 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
2182 /*
2183 * Only root, or the user that did the original mount is
2184 * permitted to unmount this filesystem.
2185 */
2186 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2187 (error = suser(kauth_cred_get(), &p->p_acflag))) {
2188 goto out;
2189 }
2190 }
2191 /*
2192 * Don't allow unmounting the root file system (or the associated VM or DATA mounts) .
2193 */
2194 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2195 error = EBUSY; /* the root (or associated volumes) is always busy */
2196 goto out;
2197 }
2198
2199 #ifdef CONFIG_IMGSRC_ACCESS
2200 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2201 error = EBUSY;
2202 goto out;
2203 }
2204 #endif /* CONFIG_IMGSRC_ACCESS */
2205
2206 return dounmount(mp, flags, 1, ctx);
2207
2208 out:
2209 mount_drop(mp, 0);
2210 return error;
2211 }
2212
2213 /*
2214 * Do the actual file system unmount.
2215 */
2216 int
2217 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2218 {
2219 vnode_t coveredvp = (vnode_t)0;
2220 int error;
2221 int needwakeup = 0;
2222 int forcedunmount = 0;
2223 int lflags = 0;
2224 struct vnode *devvp = NULLVP;
2225 #if CONFIG_TRIGGERS
2226 proc_t p = vfs_context_proc(ctx);
2227 int did_vflush = 0;
2228 int pflags_save = 0;
2229 #endif /* CONFIG_TRIGGERS */
2230
2231 #if CONFIG_FSE
2232 if (!(flags & MNT_FORCE)) {
2233 fsevent_unmount(mp, ctx); /* has to come first! */
2234 }
2235 #endif
2236
2237 mount_lock(mp);
2238
2239 /*
2240 * If already an unmount in progress just return EBUSY.
2241 * Even a forced unmount cannot override.
2242 */
2243 if (mp->mnt_lflag & MNT_LUNMOUNT) {
2244 if (withref != 0) {
2245 mount_drop(mp, 1);
2246 }
2247 mount_unlock(mp);
2248 return EBUSY;
2249 }
2250
2251 if (flags & MNT_FORCE) {
2252 forcedunmount = 1;
2253 mp->mnt_lflag |= MNT_LFORCE;
2254 }
2255
2256 #if CONFIG_TRIGGERS
2257 if (flags & MNT_NOBLOCK && p != kernproc) {
2258 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2259 }
2260 #endif
2261
2262 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2263 mp->mnt_lflag |= MNT_LUNMOUNT;
2264 mp->mnt_flag &= ~MNT_ASYNC;
2265 /*
2266 * anyone currently in the fast path that
2267 * trips over the cached rootvp will be
2268 * dumped out and forced into the slow path
2269 * to regenerate a new cached value
2270 */
2271 mp->mnt_realrootvp = NULLVP;
2272 mount_unlock(mp);
2273
2274 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2275 /*
2276 * Force unmount any mounts in this filesystem.
2277 * If any unmounts fail - just leave them dangling.
2278 * Avoids recursion.
2279 */
2280 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2281 }
2282
2283 /*
2284 * taking the name_cache_lock exclusively will
2285 * insure that everyone is out of the fast path who
2286 * might be trying to use a now stale copy of
2287 * vp->v_mountedhere->mnt_realrootvp
2288 * bumping mount_generation causes the cached values
2289 * to be invalidated
2290 */
2291 name_cache_lock();
2292 mount_generation++;
2293 name_cache_unlock();
2294
2295
2296 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2297 if (withref != 0) {
2298 mount_drop(mp, 0);
2299 }
2300 error = 0;
2301 if (forcedunmount == 0) {
2302 ubc_umount(mp); /* release cached vnodes */
2303 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2304 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2305 if (error) {
2306 mount_lock(mp);
2307 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2308 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2309 mp->mnt_lflag &= ~MNT_LFORCE;
2310 goto out;
2311 }
2312 }
2313 }
2314
2315 IOBSDMountChange(mp, kIOMountChangeUnmount);
2316
2317 #if CONFIG_TRIGGERS
2318 vfs_nested_trigger_unmounts(mp, flags, ctx);
2319 did_vflush = 1;
2320 #endif
2321 if (forcedunmount) {
2322 lflags |= FORCECLOSE;
2323 }
2324 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2325 if ((forcedunmount == 0) && error) {
2326 mount_lock(mp);
2327 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2328 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2329 mp->mnt_lflag &= ~MNT_LFORCE;
2330 goto out;
2331 }
2332
2333 /* make sure there are no one in the mount iterations or lookup */
2334 mount_iterdrain(mp);
2335
2336 error = VFS_UNMOUNT(mp, flags, ctx);
2337 if (error) {
2338 mount_iterreset(mp);
2339 mount_lock(mp);
2340 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2341 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2342 mp->mnt_lflag &= ~MNT_LFORCE;
2343 goto out;
2344 }
2345
2346 /* increment the operations count */
2347 if (!error) {
2348 OSAddAtomic(1, &vfs_nummntops);
2349 }
2350
2351 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2352 /* hold an io reference and drop the usecount before close */
2353 devvp = mp->mnt_devvp;
2354 vnode_getalways(devvp);
2355 vnode_rele(devvp);
2356 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2357 ctx);
2358 vnode_clearmountedon(devvp);
2359 vnode_put(devvp);
2360 }
2361 lck_rw_done(&mp->mnt_rwlock);
2362 mount_list_remove(mp);
2363 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2364
2365 /* mark the mount point hook in the vp but not drop the ref yet */
2366 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2367 /*
2368 * The covered vnode needs special handling. Trying to get an
2369 * iocount must not block here as this may lead to deadlocks
2370 * if the Filesystem to which the covered vnode belongs is
2371 * undergoing forced unmounts. Since we hold a usecount, the
2372 * vnode cannot be reused (it can, however, still be terminated)
2373 */
2374 vnode_getalways(coveredvp);
2375 vnode_lock_spin(coveredvp);
2376
2377 mp->mnt_crossref++;
2378 coveredvp->v_mountedhere = (struct mount *)0;
2379 CLR(coveredvp->v_flag, VMOUNT);
2380
2381 vnode_unlock(coveredvp);
2382 vnode_put(coveredvp);
2383 }
2384
2385 mount_list_lock();
2386 mp->mnt_vtable->vfc_refcount--;
2387 mount_list_unlock();
2388
2389 cache_purgevfs(mp); /* remove cache entries for this file sys */
2390 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2391 mount_lock(mp);
2392 mp->mnt_lflag |= MNT_LDEAD;
2393
2394 if (mp->mnt_lflag & MNT_LWAIT) {
2395 /*
2396 * do the wakeup here
2397 * in case we block in mount_refdrain
2398 * which will drop the mount lock
2399 * and allow anyone blocked in vfs_busy
2400 * to wakeup and see the LDEAD state
2401 */
2402 mp->mnt_lflag &= ~MNT_LWAIT;
2403 wakeup((caddr_t)mp);
2404 }
2405 mount_refdrain(mp);
2406
2407 /* free disk_conditioner_info structure for this mount */
2408 disk_conditioner_unmount(mp);
2409
2410 out:
2411 if (mp->mnt_lflag & MNT_LWAIT) {
2412 mp->mnt_lflag &= ~MNT_LWAIT;
2413 needwakeup = 1;
2414 }
2415
2416 #if CONFIG_TRIGGERS
2417 if (flags & MNT_NOBLOCK && p != kernproc) {
2418 // Restore P_NOREMOTEHANG bit to its previous value
2419 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2420 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2421 }
2422 }
2423
2424 /*
2425 * Callback and context are set together under the mount lock, and
2426 * never cleared, so we're safe to examine them here, drop the lock,
2427 * and call out.
2428 */
2429 if (mp->mnt_triggercallback != NULL) {
2430 mount_unlock(mp);
2431 if (error == 0) {
2432 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2433 } else if (did_vflush) {
2434 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2435 }
2436 } else {
2437 mount_unlock(mp);
2438 }
2439 #else
2440 mount_unlock(mp);
2441 #endif /* CONFIG_TRIGGERS */
2442
2443 lck_rw_done(&mp->mnt_rwlock);
2444
2445 if (needwakeup) {
2446 wakeup((caddr_t)mp);
2447 }
2448
2449 if (!error) {
2450 if ((coveredvp != NULLVP)) {
2451 vnode_t pvp = NULLVP;
2452
2453 /*
2454 * The covered vnode needs special handling. Trying to
2455 * get an iocount must not block here as this may lead
2456 * to deadlocks if the Filesystem to which the covered
2457 * vnode belongs is undergoing forced unmounts. Since we
2458 * hold a usecount, the vnode cannot be reused
2459 * (it can, however, still be terminated).
2460 */
2461 vnode_getalways(coveredvp);
2462
2463 mount_dropcrossref(mp, coveredvp, 0);
2464 /*
2465 * We'll _try_ to detect if this really needs to be
2466 * done. The coveredvp can only be in termination (or
2467 * terminated) if the coveredvp's mount point is in a
2468 * forced unmount (or has been) since we still hold the
2469 * ref.
2470 */
2471 if (!vnode_isrecycled(coveredvp)) {
2472 pvp = vnode_getparent(coveredvp);
2473 #if CONFIG_TRIGGERS
2474 if (coveredvp->v_resolve) {
2475 vnode_trigger_rearm(coveredvp, ctx);
2476 }
2477 #endif
2478 }
2479
2480 vnode_rele(coveredvp);
2481 vnode_put(coveredvp);
2482 coveredvp = NULLVP;
2483
2484 if (pvp) {
2485 lock_vnode_and_post(pvp, NOTE_WRITE);
2486 vnode_put(pvp);
2487 }
2488 } else if (mp->mnt_flag & MNT_ROOTFS) {
2489 mount_lock_destroy(mp);
2490 #if CONFIG_MACF
2491 mac_mount_label_destroy(mp);
2492 #endif
2493 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2494 } else {
2495 panic("dounmount: no coveredvp");
2496 }
2497 }
2498 return error;
2499 }
2500
2501 /*
2502 * Unmount any mounts in this filesystem.
2503 */
2504 void
2505 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2506 {
2507 mount_t smp;
2508 fsid_t *fsids, fsid;
2509 int fsids_sz;
2510 int count = 0, i, m = 0;
2511 vnode_t vp;
2512
2513 mount_list_lock();
2514
2515 // Get an array to hold the submounts fsids.
2516 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2517 count++;
2518 fsids_sz = count * sizeof(fsid_t);
2519 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2520 if (fsids == NULL) {
2521 mount_list_unlock();
2522 goto out;
2523 }
2524 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2525
2526 /*
2527 * Fill the array with submount fsids.
2528 * Since mounts are always added to the tail of the mount list, the
2529 * list is always in mount order.
2530 * For each mount check if the mounted-on vnode belongs to a
2531 * mount that's already added to our array of mounts to be unmounted.
2532 */
2533 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2534 vp = smp->mnt_vnodecovered;
2535 if (vp == NULL) {
2536 continue;
2537 }
2538 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2539 for (i = 0; i <= m; i++) {
2540 if (fsids[i].val[0] == fsid.val[0] &&
2541 fsids[i].val[1] == fsid.val[1]) {
2542 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2543 break;
2544 }
2545 }
2546 }
2547 mount_list_unlock();
2548
2549 // Unmount the submounts in reverse order. Ignore errors.
2550 for (i = m; i > 0; i--) {
2551 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2552 if (smp) {
2553 mount_ref(smp, 0);
2554 mount_iterdrop(smp);
2555 (void) dounmount(smp, flags, 1, ctx);
2556 }
2557 }
2558 out:
2559 if (fsids) {
2560 FREE(fsids, M_TEMP);
2561 }
2562 }
2563
/*
 * Drop one crossref on mp taken against the covered vnode dp.
 *
 * When the count reaches zero and dp no longer points at mp via
 * v_mountedhere (i.e. the unmount has detached them), the mount
 * structure itself is destroyed and freed.  If need_put is set,
 * dp's iocount is released (while still holding its lock) on all paths.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_unlock(dp);

		/* Last reference and fully detached: free the mount. */
		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_unlock(dp);
}
2592
2593
2594 /*
2595 * Sync each mounted filesystem.
2596 */
#if DIAGNOSTIC
int syncprt = 0;        /* nonzero: sync() also dumps buffer statistics */
#endif

/* Nonzero: sync paths also report dirty VM page counts via vm_countdirtypages(). */
int print_vmpage_stat = 0;
2602
2603 /*
2604 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2605 * mounted read-write with the passed waitfor value.
2606 *
2607 * Parameters: mp mount-point descriptor per mounted file-system instance.
2608 * arg user argument (please see below)
2609 *
2610 * User argument is a pointer to 32 bit unsigned integer which describes the
2611 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
2612 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2613 * waitfor value.
2614 *
2615 * Returns: VFS_RETURNED
2616 */
2617 static int
2618 sync_callback(mount_t mp, void *arg)
2619 {
2620 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2621 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2622 unsigned waitfor = MNT_NOWAIT;
2623
2624 if (arg) {
2625 waitfor = *(uint32_t*)arg;
2626 }
2627
2628 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
2629 if (waitfor != MNT_WAIT &&
2630 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2631 waitfor != MNT_NOWAIT &&
2632 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2633 waitfor != MNT_DWAIT &&
2634 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2635 panic("Passed inappropriate waitfor %u to "
2636 "sync_callback()", waitfor);
2637 }
2638
2639 mp->mnt_flag &= ~MNT_ASYNC;
2640 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2641 if (asyncflag) {
2642 mp->mnt_flag |= MNT_ASYNC;
2643 }
2644 }
2645
2646 return VFS_RETURNED;
2647 }
2648
2649 /* ARGSUSED */
2650 int
2651 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2652 {
2653 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2654
2655 if (print_vmpage_stat) {
2656 vm_countdirtypages();
2657 }
2658
2659 #if DIAGNOSTIC
2660 if (syncprt) {
2661 vfs_bufstats();
2662 }
2663 #endif /* DIAGNOSTIC */
2664 return 0;
2665 }
2666
/*
 * Media-class selector for sync_internal_callback(): lets the sync
 * thread flush "reliable" media (local, non-virtual devices) before
 * everything else.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
2672
2673 static int
2674 sync_internal_callback(mount_t mp, void *arg)
2675 {
2676 if (arg) {
2677 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2678 (mp->mnt_flag & MNT_LOCAL);
2679 sync_type_t sync_type = *((sync_type_t *)arg);
2680
2681 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2682 return VFS_RETURNED;
2683 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2684 return VFS_RETURNED;
2685 }
2686 }
2687
2688 (void)sync_callback(mp, NULL);
2689
2690 return VFS_RETURNED;
2691 }
2692
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; protected by sync_mtx_lck */
int sync_timeout_seconds = 5;   /* max time sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN 0x0001          /* work has been posted for the sync thread */
#define SYNC_THREAD_RUNNING 0x0002      /* a sync thread currently exists */
2698
/*
 * Body of the kernel thread spawned by sync_internal().  Loops while
 * work keeps being posted (SYNC_THREAD_RUN), syncing reliable media
 * first and then unreliable media, and exits once no new work remains.
 * State transitions are serialized by sync_mtx_lck.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;

	lck_mtx_lock(sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		/* Consume the work request; drop the lock while iterating. */
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(sync_mtx_lck);

		/* Flush local, non-virtual media first, then the rest. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
	lck_mtx_unlock(sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2736
/* Rate-limits the "sync timed out" console message in sync_internal(). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2738
2739 /*
2740 * An in-kernel sync for power management to call.
2741 * This function always returns within sync_timeout seconds.
2742 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(sync_mtx_lck);
	/* Post work; spawn a sync thread only if one is not already running. */
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(sync_mtx_lck);
			printf("sync_thread failed\n");
			/* Best-effort: report success even if no thread ran. */
			return 0;
		}
		thread_created = TRUE;
	}

	/*
	 * Sleep until the sync thread signals completion or the timeout
	 * expires; PDROP releases sync_mtx_lck while we sleep.
	 */
	error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		/* Print the timeout message at most once every 120 seconds. */
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	/* A timeout is not an error: the sync thread keeps working. */
	return 0;
} /* end of sync_internal call */
2785
2786 /*
2787 * Change filesystem quotas.
2788 */
2789 #if QUOTA
2790 int
2791 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2792 {
2793 struct mount *mp;
2794 int error, quota_cmd, quota_status = 0;
2795 caddr_t datap;
2796 size_t fnamelen;
2797 struct nameidata nd;
2798 vfs_context_t ctx = vfs_context_current();
2799 struct dqblk my_dqblk = {};
2800
2801 AUDIT_ARG(uid, uap->uid);
2802 AUDIT_ARG(cmd, uap->cmd);
2803 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2804 uap->path, ctx);
2805 error = namei(&nd);
2806 if (error) {
2807 return error;
2808 }
2809 mp = nd.ni_vp->v_mount;
2810 mount_ref(mp, 0);
2811 vnode_put(nd.ni_vp);
2812 nameidone(&nd);
2813
2814 /* copyin any data we will need for downstream code */
2815 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2816
2817 switch (quota_cmd) {
2818 case Q_QUOTAON:
2819 /* uap->arg specifies a file from which to take the quotas */
2820 fnamelen = MAXPATHLEN;
2821 datap = kalloc(MAXPATHLEN);
2822 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2823 break;
2824 case Q_GETQUOTA:
2825 /* uap->arg is a pointer to a dqblk structure. */
2826 datap = (caddr_t) &my_dqblk;
2827 break;
2828 case Q_SETQUOTA:
2829 case Q_SETUSE:
2830 /* uap->arg is a pointer to a dqblk structure. */
2831 datap = (caddr_t) &my_dqblk;
2832 if (proc_is64bit(p)) {
2833 struct user_dqblk my_dqblk64;
2834 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
2835 if (error == 0) {
2836 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2837 }
2838 } else {
2839 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
2840 }
2841 break;
2842 case Q_QUOTASTAT:
2843 /* uap->arg is a pointer to an integer */
2844 datap = (caddr_t) &quota_status;
2845 break;
2846 default:
2847 datap = NULL;
2848 break;
2849 } /* switch */
2850
2851 if (error == 0) {
2852 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2853 }
2854
2855 switch (quota_cmd) {
2856 case Q_QUOTAON:
2857 if (datap != NULL) {
2858 kfree(datap, MAXPATHLEN);
2859 }
2860 break;
2861 case Q_GETQUOTA:
2862 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2863 if (error == 0) {
2864 if (proc_is64bit(p)) {
2865 struct user_dqblk my_dqblk64;
2866
2867 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
2868 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2869 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
2870 } else {
2871 error = copyout(datap, uap->arg, sizeof(struct dqblk));
2872 }
2873 }
2874 break;
2875 case Q_QUOTASTAT:
2876 /* uap->arg is a pointer to an integer */
2877 if (error == 0) {
2878 error = copyout(datap, uap->arg, sizeof(quota_status));
2879 }
2880 break;
2881 default:
2882 break;
2883 } /* switch */
2884
2885 mount_drop(mp, 0);
2886 return error;
2887 }
2888 #else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	/* Quota support compiled out (QUOTA not defined). */
	return EOPNOTSUPP;
}
2894 #endif /* QUOTA */
2895
2896 /*
2897 * Get filesystem statistics.
2898 *
2899 * Returns: 0 Success
2900 * namei:???
2901 * vfs_update_vfsstat:???
2902 * munge_statfs:EFAULT
2903 */
2904 /* ARGSUSED */
2905 int
2906 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2907 {
2908 struct mount *mp;
2909 struct vfsstatfs *sp;
2910 int error;
2911 struct nameidata nd;
2912 vfs_context_t ctx = vfs_context_current();
2913 vnode_t vp;
2914
2915 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2916 UIO_USERSPACE, uap->path, ctx);
2917 error = namei(&nd);
2918 if (error != 0) {
2919 return error;
2920 }
2921 vp = nd.ni_vp;
2922 mp = vp->v_mount;
2923 sp = &mp->mnt_vfsstat;
2924 nameidone(&nd);
2925
2926 #if CONFIG_MACF
2927 error = mac_mount_check_stat(ctx, mp);
2928 if (error != 0) {
2929 vnode_put(vp);
2930 return error;
2931 }
2932 #endif
2933
2934 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2935 if (error != 0) {
2936 vnode_put(vp);
2937 return error;
2938 }
2939
2940 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2941 vnode_put(vp);
2942 return error;
2943 }
2944
2945 /*
2946 * Get filesystem statistics.
2947 */
2948 /* ARGSUSED */
2949 int
2950 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2951 {
2952 vnode_t vp;
2953 struct mount *mp;
2954 struct vfsstatfs *sp;
2955 int error;
2956
2957 AUDIT_ARG(fd, uap->fd);
2958
2959 if ((error = file_vnode(uap->fd, &vp))) {
2960 return error;
2961 }
2962
2963 error = vnode_getwithref(vp);
2964 if (error) {
2965 file_drop(uap->fd);
2966 return error;
2967 }
2968
2969 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2970
2971 mp = vp->v_mount;
2972 if (!mp) {
2973 error = EBADF;
2974 goto out;
2975 }
2976
2977 #if CONFIG_MACF
2978 error = mac_mount_check_stat(vfs_context_current(), mp);
2979 if (error != 0) {
2980 goto out;
2981 }
2982 #endif
2983
2984 sp = &mp->mnt_vfsstat;
2985 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2986 goto out;
2987 }
2988
2989 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2990
2991 out:
2992 file_drop(uap->fd);
2993 vnode_put(vp);
2994
2995 return error;
2996 }
2997
/*
 * Fill a zeroed struct statfs64 from mp's cached vfsstatfs.  Does NOT
 * refresh the cache; callers run vfs_update_vfsstat() first if needed.
 */
void
vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
{
	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;

	bzero(sfs, sizeof(*sfs));

	sfs->f_bsize = vsfs->f_bsize;
	sfs->f_iosize = (int32_t)vsfs->f_iosize;
	sfs->f_blocks = vsfs->f_blocks;
	sfs->f_bfree = vsfs->f_bfree;
	sfs->f_bavail = vsfs->f_bavail;
	sfs->f_files = vsfs->f_files;
	sfs->f_ffree = vsfs->f_ffree;
	sfs->f_fsid = vsfs->f_fsid;
	sfs->f_owner = vsfs->f_owner;
	sfs->f_type = mp->mnt_vtable->vfc_typenum;
	/* Expose only user-visible mount flags. */
	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	sfs->f_fssubtype = vsfs->f_fssubtype;
	/* Flag the root data volume (system mount that is neither swap nor "/"). */
	sfs->f_flags_ext = ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS)) ? MNT_EXT_ROOT_DATA_VOL : 0;
	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
	} else {
		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
	}
	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
}
3026
3027 /*
3028 * Get file system statistics in 64-bit mode
3029 */
3030 int
3031 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3032 {
3033 struct mount *mp;
3034 int error;
3035 struct nameidata nd;
3036 struct statfs64 sfs;
3037 vfs_context_t ctxp = vfs_context_current();
3038 vnode_t vp;
3039
3040 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3041 UIO_USERSPACE, uap->path, ctxp);
3042 error = namei(&nd);
3043 if (error != 0) {
3044 return error;
3045 }
3046 vp = nd.ni_vp;
3047 mp = vp->v_mount;
3048 nameidone(&nd);
3049
3050 #if CONFIG_MACF
3051 error = mac_mount_check_stat(ctxp, mp);
3052 if (error != 0) {
3053 vnode_put(vp);
3054 return error;
3055 }
3056 #endif
3057
3058 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3059 if (error != 0) {
3060 vnode_put(vp);
3061 return error;
3062 }
3063
3064 vfs_get_statfs64(mp, &sfs);
3065 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3066 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3067 /* This process does not want to see a seperate data volume mountpoint */
3068 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3069 }
3070 error = copyout(&sfs, uap->buf, sizeof(sfs));
3071 vnode_put(vp);
3072
3073 return error;
3074 }
3075
3076 /*
3077 * Get file system statistics in 64-bit mode
3078 */
3079 int
3080 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3081 {
3082 struct vnode *vp;
3083 struct mount *mp;
3084 struct statfs64 sfs;
3085 int error;
3086
3087 AUDIT_ARG(fd, uap->fd);
3088
3089 if ((error = file_vnode(uap->fd, &vp))) {
3090 return error;
3091 }
3092
3093 error = vnode_getwithref(vp);
3094 if (error) {
3095 file_drop(uap->fd);
3096 return error;
3097 }
3098
3099 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3100
3101 mp = vp->v_mount;
3102 if (!mp) {
3103 error = EBADF;
3104 goto out;
3105 }
3106
3107 #if CONFIG_MACF
3108 error = mac_mount_check_stat(vfs_context_current(), mp);
3109 if (error != 0) {
3110 goto out;
3111 }
3112 #endif
3113
3114 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3115 goto out;
3116 }
3117
3118 vfs_get_statfs64(mp, &sfs);
3119 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3120 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3121 /* This process does not want to see a seperate data volume mountpoint */
3122 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3123 }
3124 error = copyout(&sfs, uap->buf, sizeof(sfs));
3125
3126 out:
3127 file_drop(uap->fd);
3128 vnode_put(vp);
3129
3130 return error;
3131 }
3132
/*
 * Accumulator shared between the getfsstat*() entry points and their
 * vfs_iterate() callbacks.
 */
struct getfsstat_struct {
	user_addr_t sfsp;       /* next user buffer slot to fill; 0 = count only */
	user_addr_t *mp;        /* optional array of user MAC-label buffers, or NULL */
	int count;              /* mounts visited so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* caller's MNT_NOWAIT / MNT_WAIT / MNT_DWAIT */
	int error;              /* first hard error encountered (sticky) */
};
3141
3142
/*
 * vfs_iterate() callback for __mac_getfsstat(): copies one mount's
 * statistics (and optionally its MAC label) to the user buffer while
 * space remains; always bumps fstp->count so the caller can report the
 * total number of mounts.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 *
		 * Dead or unmounting mounts are skipped (not copied out).
		 * NOTE(review): in the MNT_LDEAD case `error` is printed
		 * below without having been assigned — benign only because
		 * KAUTH_DEBUG is normally compiled out.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* Advance by the size munge_statfs() actually wrote. */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3196
3197 /*
3198 * Get statistics on all filesystems.
3199 */
3200 int
3201 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3202 {
3203 struct __mac_getfsstat_args muap;
3204
3205 muap.buf = uap->buf;
3206 muap.bufsize = uap->bufsize;
3207 muap.mac = USER_ADDR_NULL;
3208 muap.macsize = 0;
3209 muap.flags = uap->flags;
3210
3211 return __mac_getfsstat(p, &muap, retval);
3212 }
3213
3214 /*
3215 * __mac_getfsstat: Get MAC-related file system statistics
3216 *
3217 * Parameters: p (ignored)
3218 * uap User argument descriptor (see below)
3219 * retval Count of file system statistics (N stats)
3220 *
3221 * Indirect: uap->bufsize Buffer size
3222 * uap->macsize MAC info size
3223 * uap->buf Buffer where information will be returned
3224 * uap->mac MAC info
3225 * uap->flags File system flags
3226 *
3227 *
3228 * Returns: 0 Success
3229 * !0 Not success
3230 *
3231 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject absurd sizes before doing size_t arithmetic with them. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Entry capacity depends on the caller's statfs ABI (32 vs 64 bit). */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* One label pointer (4 or 8 bytes) is required per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			FREE(mp0, M_MACTEMP);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
		if (mp == NULL) {
			FREE(mp0, M_MACTEMP);
			return ENOMEM;
		}

		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				/* Widen each 32-bit user pointer. */
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		FREE(mp0, M_MACTEMP);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = maxcount;


	/* Visit every mount, including those mid-unmount (callback filters). */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		FREE(mp, M_MACTEMP);
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* Report at most maxcount entries written; count may exceed capacity. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3325
/*
 * vfs_iterate() callback for getfsstat64(): copies one mount's 64-bit
 * statistics to the user buffer while space remains; always bumps
 * fstp->count so the caller can report the total number of mounts.
 * Note: unlike getfsstat_callback(), this path never reads fstp->mp.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 *
		 * Dead or unmounting mounts are skipped (not copied out).
		 * NOTE(review): `sp` is assigned but the copy below goes
		 * through vfs_get_statfs64(), which reads mnt_vfsstat itself.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3370
3371 /*
3372 * Get statistics on all file systems in 64 bit mode.
3373 */
3374 int
3375 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3376 {
3377 user_addr_t sfsp;
3378 int count, maxcount;
3379 struct getfsstat_struct fst;
3380
3381 maxcount = uap->bufsize / sizeof(struct statfs64);
3382
3383 sfsp = uap->buf;
3384 count = 0;
3385
3386 fst.sfsp = sfsp;
3387 fst.flags = uap->flags;
3388 fst.count = 0;
3389 fst.error = 0;
3390 fst.maxcount = maxcount;
3391
3392 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3393
3394 if (fst.error) {
3395 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3396 return fst.error;
3397 }
3398
3399 if (fst.sfsp && fst.count > fst.maxcount) {
3400 *retval = fst.maxcount;
3401 } else {
3402 *retval = fst.count;
3403 }
3404
3405 return 0;
3406 }
3407
3408 /*
3409 * gets the associated vnode with the file descriptor passed.
3410 * as input
3411 *
3412 * INPUT
3413 * ctx - vfs context of caller
3414 * fd - file descriptor for which vnode is required.
3415 * vpp - Pointer to pointer to vnode to be returned.
3416 *
3417 * The vnode is returned with an iocount so any vnode obtained
3418 * by this call needs a vnode_put
3419 *
3420 */
3421 int
3422 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3423 {
3424 int error;
3425 vnode_t vp;
3426 struct fileproc *fp;
3427 proc_t p = vfs_context_proc(ctx);
3428
3429 *vpp = NULLVP;
3430
3431 error = fp_getfvp(p, fd, &fp, &vp);
3432 if (error) {
3433 return error;
3434 }
3435
3436 error = vnode_getwithref(vp);
3437 if (error) {
3438 (void)fp_drop(p, fd, fp, 0);
3439 return error;
3440 }
3441
3442 (void)fp_drop(p, fd, fp, 0);
3443 *vpp = vp;
3444 return error;
3445 }
3446
3447 /*
3448 * Wrapper function around namei to start lookup from a directory
3449 * specified by a file descriptor ni_dirfd.
3450 *
3451 * In addition to all the errors returned by namei, this call can
3452 * return ENOTDIR if the file descriptor does not refer to a directory.
3453 * and EBADF if the file descriptor is not valid.
3454 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only consult the directory fd when one was supplied, this is not
	 * a continued lookup, and the caller has not already pinned a
	 * starting directory via USEDVP.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Absolute paths ignore dirfd entirely (POSIX *at semantics). */
		if (c != '/') {
			vnode_t dvp_at;

			/* Returns dvp_at with an iocount that we must put. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Start the lookup at dvp_at instead of the CWD. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
3498
3499 /*
3500 * Change current working directory to a given file descriptor.
3501 */
3502 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				/* Drop the reference held on the old per-thread CWD. */
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount; released via vnode_put() below or at 'out'. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller must be able to search the new directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/* If a file system is mounted on this directory, descend to its root. */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Trade the iocount for a long-lived usecount reference. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);

	if (per_thread) {
		/* Per-thread CWD: stash the vnode in the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			return ENOENT;
		}
	} else {
		/* Lock order: dirs lock before fdlock (synchronizes with lookup). */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the reference held by the previous CWD, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}
	file_drop(uap->fd);

	return 0;
out:
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
3614
int
fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
{
	/* Process-wide chdir to the directory open on uap->fd. */
	return common_fchdir(p, uap, 0);
}
3620
int
__pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread variant; fd == -1 reverts the thread to the
	 * per-process CWD.  The cast relies on the two args structs
	 * being layout-compatible — TODO confirm against sysproto.h.
	 */
	return common_fchdir(p, (void *)uap, 1);
}
3626
3627
3628 /*
3629 * Change current working directory (".").
3630 *
3631 * Returns: 0 Success
3632 * change_dir:ENOTDIR
3633 * change_dir:???
3634 * vnode_ref:ENOENT No such file or directory
3635 */
3636 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	vnode_t tvp;

	/* On success, change_dir leaves ndp->ni_vp with an iocount held. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Take a long-lived usecount before dropping the iocount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		/* Per-thread CWD: stash the vnode in the uthread. */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Process-wide CWD; dirs lock synchronizes with lookup. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the reference held by the previous CWD, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3683
3684
3685 /*
3686 * Change current working directory (".").
3687 *
3688 * Returns: 0 Success
3689 * chdir_internal:ENOTDIR
3690 * chdir_internal:ENOENT No such file or directory
3691 * chdir_internal:???
3692 */
3693 /* ARGSUSED */
static int
common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
{
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Lookup follows symlinks and audits the resolved vnode path. */
	NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	return chdir_internal(p, ctx, &nd, per_thread);
}
3705
3706
3707 /*
3708 * chdir
3709 *
3710 * Change current working directory (".") for the entire process
3711 *
3712 * Parameters: p Process requesting the call
3713 * uap User argument descriptor (see below)
3714 * retval (ignored)
3715 *
3716 * Indirect parameters: uap->path Directory path
3717 *
3718 * Returns: 0 Success
3719 * common_chdir: ENOTDIR
3720 * common_chdir: ENOENT No such file or directory
3721 * common_chdir: ???
3722 *
3723 */
3724 int
3725 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3726 {
3727 return common_chdir(p, (void *)uap, 0);
3728 }
3729
3730 /*
3731 * __pthread_chdir
3732 *
3733 * Change current working directory (".") for a single thread
3734 *
3735 * Parameters: p Process requesting the call
3736 * uap User argument descriptor (see below)
3737 * retval (ignored)
3738 *
3739 * Indirect parameters: uap->path Directory path
3740 *
3741 * Returns: 0 Success
3742 * common_chdir: ENOTDIR
3743 * common_chdir: ENOENT No such file or directory
3744 * common_chdir: ???
3745 *
3746 */
int
__pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
{
	/*
	 * Per-thread chdir; the cast relies on the args struct being
	 * layout-compatible with struct chdir_args — TODO confirm
	 * against sysproto.h.
	 */
	return common_chdir(p, (void *)uap, 1);
}
3752
3753
3754 /*
3755 * Change notion of root (``/'') directory.
3756 */
3757 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	/* Only the superuser may change the root directory. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* On success, change_dir returns nd.ni_vp with an iocount held. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the iocount for a long-lived usecount reference. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdp->fd_flags |= FD_CHROOT;
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/* Drop the reference held on the previous root, if any. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3815
3816 /*
3817 * Common routine for chroot and chdir.
3818 *
3819 * Returns: 0 Success
3820 * ENOTDIR Not a directory
3821 * namei:??? [anything namei can return]
3822 * vnode_authorize:??? [anything vnode_authorize can return]
3823 */
3824 static int
3825 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3826 {
3827 vnode_t vp;
3828 int error;
3829
3830 if ((error = namei(ndp))) {
3831 return error;
3832 }
3833 nameidone(ndp);
3834 vp = ndp->ni_vp;
3835
3836 if (vp->v_type != VDIR) {
3837 vnode_put(vp);
3838 return ENOTDIR;
3839 }
3840
3841 #if CONFIG_MACF
3842 error = mac_vnode_check_chdir(ctx, vp);
3843 if (error) {
3844 vnode_put(vp);
3845 return error;
3846 }
3847 #endif
3848
3849 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3850 if (error) {
3851 vnode_put(vp);
3852 return error;
3853 }
3854
3855 return error;
3856 }
3857
/*
 * Allocate and initialize the per-file-descriptor vnode data
 * (for directories) associated with the file glob.
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
	    M_FD_VN_DATA, M_WAITOK | M_ZERO);
	/* M_ZERO leaves all other fields zeroed; only the mutex needs setup. */
	lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
	return fvdata;
}
3872
3873 /*
3874 * Free the vnode data (for directories) associated with the file glob.
3875 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* Release the per-fd buffer, if one was ever allocated. */
	if (fvdata->fv_buf) {
		FREE(fvdata->fv_buf, M_FD_DIRBUF);
	}
	lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
	FREE(fvdata, M_FD_VN_DATA);
}
3887
3888 /*
3889 * Check permissions, allocate an open file structure,
3890 * and call the device open routine if any.
3891 *
3892 * Returns: 0 Success
3893 * EINVAL
3894 * EINTR
3895 * falloc:ENFILE
3896 * falloc:EMFILE
3897 * falloc:ENOMEM
3898 * vn_open_auth:???
3899 * dupfdopen:???
3900 * VNOP_ADVLOCK:???
3901 * vnode_setsize:???
3902 *
3903 * XXX Need to implement uid, gid
3904 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
    int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct flock lf;
	struct vfs_context context;

	oflags = uflags;

	/* O_ACCMODE fully set (O_RDONLY|O_WRONLY|O_RDWR together) is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	/* Convert open(2) flags to in-kernel fcntl flags. */
	flags = FFLAGS(uflags);
	/* These may only be set internally by vn_open_auth, never by callers. */
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot and fileproc up front. */
	if ((error = falloc_withalloc(p,
	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
		return error;
	}
	/* Encode the reserved fd for the fdopen() dance below. */
	uu->uu_dupfd = -indx - 1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {     /* XXX from fdopen */
			/* /dev/fd open: duplicate an existing descriptor instead. */
			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
				fp_drop(p, indx, NULL, 0);
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->f_fglob->fg_ops = &vnops;
	fp->f_fglob->fg_data = (caddr_t)vp;

	/* Apply an advisory flock if O_EXLOCK/O_SHLOCK was requested. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->f_fglob->fg_flag |= FHASLOCK;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->f_fglob->fg_vn_data = NULL;
	}

	/*
	 * NOTE(review): vp is still referenced below after this iocount is
	 * dropped; presumably the open file's fg_data reference keeps the
	 * vnode valid here — confirm.
	 */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	proc_fdlock(p);
	if (flags & O_CLOEXEC) {
		*fdflags(p, indx) |= UF_EXCLOSE;
	}
	if (flags & O_CLOFORK) {
		*fdflags(p, indx) |= UF_FORKCLOSE;
	}
	/* Publish the fd: it becomes visible to the process from here on. */
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* Decide whether this file's cached pages may use the secluded pool. */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->f_fglob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
#if __arm64__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
#elif __arm__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
#else
/* not implemented... */
#endif
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
			    !strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo the open: unlock (if locked), close, and free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->f_fglob->fg_cred;

	if ((fp->f_fglob->fg_flag & FHASLOCK) &&
	    (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->f_fglob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4122
4123 /*
4124 * While most of the *at syscall handlers can call nameiat() which
4125 * is a wrapper around namei, the use of namei and initialisation
4126 * of nameidata are far removed and in different functions - namei
4127 * gets called in vn_open_auth for open1. So we'll just do here what
4128 * nameiat() does.
4129 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
    int dirfd)
{
	/* Mirror nameiat(): only use dirfd for relative paths without USEDVP. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Returns dvp_at with an iocount that we must put. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/* Make the lookup inside open1/vn_open_auth start at dvp_at. */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
			    retval);
			vnode_put(dvp_at);
			return error;
		}
	}

	return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
}
4173
4174 /*
4175 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4176 *
4177 * Parameters: p Process requesting the open
4178 * uap User argument descriptor (see below)
4179 * retval Pointer to an area to receive the
4180 * return calue from the system call
4181 *
4182 * Indirect: uap->path Path to open (same as 'open')
4183 * uap->flags Flags to open (same as 'open'
4184 * uap->uid UID to set, if creating
4185 * uap->gid GID to set, if creating
4186 * uap->mode File mode, if creating (same as 'open')
4187 * uap->xsecurity ACL to set, if creating
4188 *
4189 * Returns: 0 Success
4190 * !0 errno value
4191 *
4192 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4193 *
4194 * XXX: We should enummerate the possible errno values here, and where
4195 * in the code they originated.
4196 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the umask and strip the sticky bit from the creation mode. */
	cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    fileproc_alloc_init, NULL, retval);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4239
4240 /*
4241 * Go through the data-protected atomically controlled open (2)
4242 *
4243 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4244 */
4245 int
4246 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4247 {
4248 int flags = uap->flags;
4249 int class = uap->class;
4250 int dpflags = uap->dpflags;
4251
4252 /*
4253 * Follow the same path as normal open(2)
4254 * Look up the item if it exists, and acquire the vnode.
4255 */
4256 struct filedesc *fdp = p->p_fd;
4257 struct vnode_attr va;
4258 struct nameidata nd;
4259 int cmode;
4260 int error;
4261
4262 VATTR_INIT(&va);
4263 /* Mask off all but regular access permissions */
4264 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4265 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4266
4267 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4268 uap->path, vfs_context_current());
4269
4270 /*
4271 * Initialize the extra fields in vnode_attr to pass down our
4272 * extra fields.
4273 * 1. target cprotect class.
4274 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4275 */
4276 if (flags & O_CREAT) {
4277 /* lower level kernel code validates that the class is valid before applying it. */
4278 if (class != PROTECTION_CLASS_DEFAULT) {
4279 /*
4280 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4281 * file behave the same as open (2)
4282 */
4283 VATTR_SET(&va, va_dataprotect_class, class);
4284 }
4285 }
4286
4287 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4288 if (flags & (O_RDWR | O_WRONLY)) {
4289 /* Not allowed to write raw encrypted bytes */
4290 return EINVAL;
4291 }
4292 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4293 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4294 }
4295 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4296 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4297 }
4298 }
4299
4300 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4301 fileproc_alloc_init, NULL, retval);
4302
4303 return error;
4304 }
4305
4306 static int
4307 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4308 int fd, enum uio_seg segflg, int *retval)
4309 {
4310 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4311 struct vnode_attr va;
4312 struct nameidata nd;
4313 int cmode;
4314
4315 VATTR_INIT(&va);
4316 /* Mask off all but regular access permissions */
4317 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4318 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4319
4320 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4321 segflg, path, ctx);
4322
4323 return open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
4324 retval, fd);
4325 }
4326
int
open(proc_t p, struct open_args *uap, int32_t *retval)
{
	/* open(2) is a pthread cancellation point; check before the work. */
	__pthread_testcancel(1);
	return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
}
4333
int
open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
    int32_t *retval)
{
	/* Plain open(2): relative lookups start at the process CWD. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
}
4341
int
openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
    int32_t *retval)
{
	/* openat(2): relative lookups start at the directory open on uap->fd. */
	return openat_internal(vfs_context_current(), uap->path, uap->flags,
	    uap->mode, uap->fd, UIO_USERSPACE, retval);
}
4349
int
openat(proc_t p, struct openat_args *uap, int32_t *retval)
{
	/* openat(2) is a pthread cancellation point. */
	__pthread_testcancel(1);
	return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
}
4356
4357 /*
4358 * openbyid_np: open a file given a file system id and a file system object id
4359 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
4360 * file systems that don't support object ids it is a node id (uint64_t).
4361 *
4362 * Parameters: p Process requesting the open
4363 * uap User argument descriptor (see below)
4364 * retval Pointer to an area to receive the
4365 * return calue from the system call
4366 *
4367 * Indirect: uap->path Path to open (same as 'open')
4368 *
4369 * uap->fsid id of target file system
4370 * uap->objid id of target file system object
4371 * uap->flags Flags to open (same as 'open')
4372 *
4373 * Returns: 0 Success
4374 * !0 errno value
4375 *
4376 *
4377 * XXX: We should enummerate the possible errno values here, and where
4378 * in the code they originated.
4379 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Opening by filesystem/object id is a privileged operation. */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return error;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*resolve path from fsis, objid*/
	/* Grow the buffer by MAXPATHLEN each time the path does not fit. */
	do {
		MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			FREE(buf, M_TEMP);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	buf[pathlen] = 0;

	/* The resolved path lives in kernel memory: pass UIO_SYSSPACE. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	FREE(buf, M_TEMP);

	return error;
}
4436
4437
4438 /*
4439 * Create a special file.
4440 */
4441 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4442
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	/* Apply the process umask to the requested mode. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, uap->path, &va);
	}

	AUDIT_ARG(mode, uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Creating device nodes requires superuser privilege. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* A non-NULL vp means the target name already exists. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only character and block special files are handled here. */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4545
4546 /*
4547 * Create a named pipe.
4548 *
4549 * Returns: 0 Success
4550 * EEXIST
4551 * namei:???
4552 * vnode_authorize:???
4553 * vn_create:???
4554 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the parent directory; LOCKPARENT returns dvp held. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4597
4598
4599 /*
4600 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4601 *
4602 * Parameters: p Process requesting the open
4603 * uap User argument descriptor (see below)
4604 * retval (Ignored)
4605 *
4606 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4607 * uap->uid UID to set
4608 * uap->gid GID to set
4609 * uap->mode File mode to set (same as 'mkfifo')
4610 * uap->xsecurity ACL to set, if creating
4611 *
4612 * Returns: 0 Success
4613 * !0 errno value
4614 *
4615 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4616 *
4617 * XXX: We should enummerate the possible errno values here, and where
4618 * in the code they originated.
4619 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller-supplied ACL, if any. */
	xsecdst = KAUTH_FILESEC_NONE;
	if (uap->xsecurity != USER_ADDR_NULL) {
		if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return ciferror;
		}
	}

	VATTR_INIT(&va);
	/* Apply the process umask to the requested mode. */
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != KAUTH_FILESEC_NONE) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	ciferror = mkfifo1(vfs_context_current(), uap->path, &va);

	if (xsecdst != KAUTH_FILESEC_NONE) {
		kauth_filesec_free(xsecdst);
	}
	return ciferror;
}
4655
4656 /* ARGSUSED */
4657 int
4658 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4659 {
4660 struct vnode_attr va;
4661
4662 VATTR_INIT(&va);
4663 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4664
4665 return mkfifo1(vfs_context_current(), uap->path, &va);
4666 }
4667
4668
static char *
my_strrchr(char *p, int ch)
{
	char *last = NULL;

	/*
	 * Single pass over the string, remembering the most recent match.
	 * As with strrchr(3), the NUL terminator itself is a matchable
	 * character (ch == '\0' returns a pointer to the terminator).
	 */
	do {
		if (*p == ch) {
			last = p;
		}
	} while (*p++ != '\0');

	return last;
}
4684
4685 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4686 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4687 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4688
/*
 * Best-effort path construction for fsevents/audit/listener callouts.
 *
 * Builds a path for 'dvp' into 'path' (buffer of '_len' bytes), optionally
 * appending '/' + 'leafname'.  If 'firmlink' is nonzero the firmlink-resolved
 * path is produced (vn_getpath), otherwise the no-firmlink variant.
 *
 * Returns the length of the string placed in 'path' INCLUDING the NUL, and
 * sets *truncated_path when the result does not name the full leaf.  This
 * routine never reports failure; on lookup errors it walks up v_parent (or
 * falls back to the mount point, or "/") so callers always get some path.
 *
 * NOTE(review): the truncation checks below compare against MAXPATHLEN
 * rather than '_len' -- assumes callers pass a MAXPATHLEN-sized buffer;
 * TODO confirm against call sites.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	/* First attempt: resolve dvp itself. */
	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* Overwrite dvp's NUL with '/', then append the leaf. */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = my_strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* dvp's path alone already (nearly) fills the buffer. */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Fallback: climb toward the root until some ancestor's path
		 * fits, then settle for that (or the mount point, or "/").
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point? only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
4756
/*
 * Firmlink-resolving wrapper around safe_getpath_new(); returns the
 * length (including NUL) of the path placed in 'path'.
 */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
4762
/*
 * Non-firmlink-resolving wrapper around safe_getpath_new(); returns the
 * length (including NUL) of the path placed in 'path'.
 */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
4768
4769 /*
4770 * Make a hard file link.
4771 *
4772 * Returns: 0 Success
4773 * EPERM
4774 * EEXIST
4775 * EXDEV
4776 * namei:???
4777 * vnode_authorize:???
4778 * VNOP_LINK:???
4779 */
/* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	int truncated = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		/* NOTE(review): the "XXX 54841485" printfs below look like
		 * temporary debugging for a radar; confirm before relying on
		 * them. */
		if (error == EPERM) {
			printf("XXX 54841485: nameiat() src EPERM\n");
		}
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			printf("XXX 54841485: VDIR EPERM\n");
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node -- the nameidata is reused for the second
	 * lookup, with the nameiop/flags/dirp fields rewritten below. */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: nameiat() dst EPERM\n");
		}
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: mac_vnode_check_link() EPERM\n");
		}
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: vnode_authorize() LINKTARGET EPERM\n");
		}
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: vnode_authorize() ADD_FILE EPERM\n");
		}
		goto out2;
	}

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		if (error == EPERM) {
			printf("XXX 54841485: VNOP_LINK() EPERM\n");
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

	/* From here on the link exists; failures below only affect
	 * notification (fsevents / kauth listeners / audit). */
#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;

		/* build the path to the new link file */
		GET_PATH(target_path);
		if (target_path == NULL) {
			error = ENOMEM;
			goto out2;
		}

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);
			if (link_to_path == NULL) {
				error = ENOMEM;
				goto out2;
			}

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len, target_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					/* best-effort notification: drop the
					 * STAT_CHANGED event, not the link */
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	vnode_put(vp);
	return error;
}
5010
/*
 * link(2): classic hard link -- both names resolved relative to the CWD,
 * following symlinks in the source path.
 */
int
link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
{
	return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
	    AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
}
5017
5018 int
5019 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5020 {
5021 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5022 return EINVAL;
5023 }
5024
5025 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5026 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5027 }
5028
5029 /*
5030 * Make a symbolic link.
5031 *
5032 * We could add support for ACLs here too...
5033 */
/* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/*
	 * Copy the link target string into a kernel buffer when it lives in
	 * user space; for kernel-space callers use it in place.  The 'out'
	 * path frees the buffer only if it differs from path_data.
	 */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* Look up the parent of the symlink name to be created. */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Symlinks are created mode 0777 filtered by the process umask. */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* The name must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}
	if (error == 0) {
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling for attrs VNOP_SYMLINK didn't set */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*
		 * Check if a new vnode was created; else re-lookup to get one
		 * (some filesystems do not return the new vnode from
		 * VNOP_SYMLINK).
		 */
		if (vp == NULL) {
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			nd.ni_cnd.cn_flags = 0;
			error = nameiat(&nd, fd);
			vp = nd.ni_vp;

			if (vp == NULL) {
				goto skipit;
			}
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	if (path && (path != (char *)path_data)) {
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
	}

	return error;
}
5190
/* symlink(2): create a symbolic link relative to the CWD. */
int
symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
{
	return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
	    uap->link, UIO_USERSPACE);
}
5197
/* symlinkat(2): create a symbolic link relative to a directory fd. */
int
symlinkat(__unused proc_t p, struct symlinkat_args *uap,
    __unused int32_t *retval)
{
	return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
	    uap->path2, UIO_USERSPACE);
}
5205
5206 /*
5207 * Delete a whiteout from the filesystem.
5208 * No longer supported.
5209 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	/* Whiteout deletion is no longer supported; always fail. */
	return ENOTSUP;
}
5215
5216 /*
5217 * Delete a name from the filesystem.
5218 */
/*
 * Common implementation for unlink(2)/unlinkat(2)/delete(2) and unlink1().
 * If 'start_dvp' is non-NULL the lookup starts there (trumping 'fd');
 * otherwise 'path_arg' is resolved relative to 'fd'.  Supports compound
 * (batched) remove VNOPs, redriving the lookup on EKEEPLOOKING, and retries
 * on racing-ENOENT up to MAX_AUTHORIZE_ENOENT_RETRIES.
 */
/* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
#if CONFIG_FSE
	fse_info finfo;
	struct vnode_attr va;
#endif
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

retry:
	/* Per-attempt state is re-initialized on every retry. */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	nd.ni_dvp = start_dvp;
	nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &nd.ni_cnd;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	dvp = nd.ni_dvp;
	vp = nd.ni_vp;


	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* Only the kernel may remove an in-use swap file. */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* ENOENT here may be a racing hardlink
				 * lookup; retry a bounded number of times. */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* No vnode: only legal when the fs does compound remove. */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, &finfo, ctx);
			}
		} else {
			error = vfs_get_notify_attributes(&va);
			if (error) {
				goto out;
			}

			vap = &va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		/* Build both path flavors before the remove, while the name
		 * still exists. */
		if (path == NULL) {
			GET_PATH(path);
			if (path == NULL) {
				error = ENOMEM;
				goto out;
			}
		}
		len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
			if (no_firmlink_path == NULL) {
				error = ENOMEM;
				goto out;
			}
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
		error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
		vp = nd.ni_vp;
		if (error == EKEEPLOOKING) {
			/* Filesystem asked us to continue the compound
			 * lookup where it left off. */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, &finfo, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, &finfo, vap);
			}
			if (truncated_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, &finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

	return error;
}
5490
/*
 * In-kernel unlink entry point: like unlink(2) but allows an explicit
 * starting directory vnode and caller-chosen VNODE_REMOVE_* flags.
 */
int
unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
    enum uio_seg segflg, int unlink_flags)
{
	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
	    unlink_flags);
}
5498
5499 /*
5500 * Delete a name from the filesystem using Carbon semantics.
5501 */
/* delete(2): Carbon-semantics unlink -- busy files cannot be deleted. */
int
delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
{
	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
	    uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
}
5508
5509 /*
5510 * Delete a name from the filesystem using POSIX semantics.
5511 */
/* unlink(2): POSIX-semantics unlink relative to the CWD. */
int
unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
{
	return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
	    uap->path, UIO_USERSPACE, 0);
}
5518
5519 int
5520 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5521 {
5522 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5523 return EINVAL;
5524 }
5525
5526 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5527 int unlink_flags = 0;
5528
5529 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5530 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5531 }
5532 return rmdirat_internal(vfs_context_current(), uap->fd,
5533 uap->path, UIO_USERSPACE, unlink_flags);
5534 } else {
5535 return unlinkat_internal(vfs_context_current(), uap->fd,
5536 NULLVP, uap->path, UIO_USERSPACE, 0);
5537 }
5538 }
5539
5540 /*
5541 * Reposition read/write file offset.
5542 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	/* fp_getfvp fails with ENOTSUP for non-vnode fds (e.g. pipes);
	 * map that to ESPIPE per lseek(2) semantics. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check the weaker
	 * "get" permission for that case. */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	/* Compute the absolute target offset according to 'whence'. */
	switch (uap->whence) {
	case L_INCR:
		offset += fp->f_fglob->fg_offset;
		break;
	case L_XTND:
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:
		break;
	case SEEK_HOLE:
		/* Filesystem resolves hole/data positions in place. */
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->f_fglob->fg_offset = offset;
				*retval = fp->f_fglob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read." Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
5634
5635
5636 /*
5637 * Check access permissions.
5638 *
5639 * Returns: 0 Success
5640 * vnode_authorize:???
5641 */
5642 static int
5643 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5644 {
5645 kauth_action_t action;
5646 int error;
5647
5648 /*
5649 * If just the regular access bits, convert them to something
5650 * that vnode_authorize will understand.
5651 */
5652 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5653 action = 0;
5654 if (uflags & R_OK) {
5655 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5656 }
5657 if (uflags & W_OK) {
5658 if (vnode_isdir(vp)) {
5659 action |= KAUTH_VNODE_ADD_FILE |
5660 KAUTH_VNODE_ADD_SUBDIRECTORY;
5661 /* might want delete rights here too */
5662 } else {
5663 action |= KAUTH_VNODE_WRITE_DATA;
5664 }
5665 }
5666 if (uflags & X_OK) {
5667 if (vnode_isdir(vp)) {
5668 action |= KAUTH_VNODE_SEARCH;
5669 } else {
5670 action |= KAUTH_VNODE_EXECUTE;
5671 }
5672 }
5673 } else {
5674 /* take advantage of definition of uflags */
5675 action = uflags >> 8;
5676 }
5677
5678 #if CONFIG_MACF
5679 error = mac_vnode_check_access(ctx, vp, uflags);
5680 if (error) {
5681 return error;
5682 }
5683 #endif /* MAC */
5684
5685 /* action == 0 means only check for existence */
5686 if (action != 0) {
5687 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5688 } else {
5689 error = 0;
5690 }
5691
5692 return error;
5693 }
5694
5695
5696
5697 /*
5698 * access_extended: Check access permissions in bulk.
5699 *
5700 * Description: uap->entries Pointer to an array of accessx
5701 * descriptor structs, plus one or
5702 * more NULL terminated strings (see
5703 * "Notes" section below).
5704 * uap->size Size of the area pointed to by
5705 * uap->entries.
5706 * uap->results Pointer to the results array.
5707 *
5708 * Returns: 0 Success
5709 * ENOMEM Insufficient memory
5710 * EINVAL Invalid arguments
5711 * namei:EFAULT Bad address
5712 * namei:ENAMETOOLONG Filename too long
5713 * namei:ENOENT No such file or directory
5714 * namei:ELOOP Too many levels of symbolic links
5715 * namei:EBADF Bad file descriptor
5716 * namei:ENOTDIR Not a directory
5717 * namei:???
5718 * access1:
5719 *
5720 * Implicit returns:
5721 * uap->results Array contents modified
5722 *
5723 * Notes: The uap->entries are structured as an arbitrary length array
5724 * of accessx descriptors, followed by one or more NULL terminated
5725 * strings
5726 *
5727 * struct accessx_descriptor[0]
5728 * ...
5729 * struct accessx_descriptor[n]
5730 * char name_data[0];
5731 *
5732 * We determine the entry count by walking the buffer containing
5733 * the uap->entries argument descriptor. For each descriptor we
5734 * see, the valid values for the offset ad_name_offset will be
5735 * in the byte range:
5736 *
5737 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5738 * to
5739 * [ uap->entries + uap->size - 2 ]
5740 *
5741 * since we must have at least one string, and the string must
5742 * be at least one character plus the NULL terminator in length.
5743 *
5744 * XXX: Need to support the check-as uid argument
5745 */
5746 int
5747 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5748 {
5749 struct accessx_descriptor *input = NULL;
5750 errno_t *result = NULL;
5751 errno_t error = 0;
5752 int wantdelete = 0;
5753 unsigned int desc_max, desc_actual, i, j;
5754 struct vfs_context context;
5755 struct nameidata nd;
5756 int niopts;
5757 vnode_t vp = NULL;
5758 vnode_t dvp = NULL;
5759 #define ACCESSX_MAX_DESCR_ON_STACK 10
5760 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5761
5762 context.vc_ucred = NULL;
5763
5764 /*
5765 * Validate parameters; if valid, copy the descriptor array and string
5766 * arguments into local memory. Before proceeding, the following
5767 * conditions must have been met:
5768 *
5769 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5770 * o There must be sufficient room in the request for at least one
 * descriptor and a one byte NUL terminated string.
5772 * o The allocation of local storage must not fail.
5773 */
5774 if (uap->size > ACCESSX_MAX_TABLESIZE) {
5775 return ENOMEM;
5776 }
5777 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
5778 return EINVAL;
5779 }
5780 if (uap->size <= sizeof(stack_input)) {
5781 input = stack_input;
5782 } else {
5783 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5784 if (input == NULL) {
5785 error = ENOMEM;
5786 goto out;
5787 }
5788 }
5789 error = copyin(uap->entries, input, uap->size);
5790 if (error) {
5791 goto out;
5792 }
5793
5794 AUDIT_ARG(opaque, input, uap->size);
5795
5796 /*
 * Force NUL termination of the copyin buffer to avoid namei() running
5798 * off the end. If the caller passes us bogus data, they may get a
5799 * bogus result.
5800 */
5801 ((char *)input)[uap->size - 1] = 0;
5802
5803 /*
5804 * Access is defined as checking against the process' real identity,
5805 * even if operations are checking the effective identity. This
5806 * requires that we use a local vfs context.
5807 */
5808 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5809 context.vc_thread = current_thread();
5810
5811 /*
5812 * Find out how many entries we have, so we can allocate the result
5813 * array by walking the list and adjusting the count downward by the
5814 * earliest string offset we see.
5815 */
5816 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5817 desc_actual = desc_max;
5818 for (i = 0; i < desc_actual; i++) {
5819 /*
5820 * Take the offset to the name string for this entry and
5821 * convert to an input array index, which would be one off
5822 * the end of the array if this entry was the lowest-addressed
5823 * name string.
5824 */
5825 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5826
5827 /*
5828 * An offset greater than the max allowable offset is an error.
5829 * It is also an error for any valid entry to point
5830 * to a location prior to the end of the current entry, if
5831 * it's not a reference to the string of the previous entry.
5832 */
5833 if (j > desc_max || (j != 0 && j <= i)) {
5834 error = EINVAL;
5835 goto out;
5836 }
5837
5838 /* Also do not let ad_name_offset point to something beyond the size of the input */
5839 if (input[i].ad_name_offset >= uap->size) {
5840 error = EINVAL;
5841 goto out;
5842 }
5843
5844 /*
5845 * An offset of 0 means use the previous descriptor's offset;
5846 * this is used to chain multiple requests for the same file
5847 * to avoid multiple lookups.
5848 */
5849 if (j == 0) {
5850 /* This is not valid for the first entry */
5851 if (i == 0) {
5852 error = EINVAL;
5853 goto out;
5854 }
5855 continue;
5856 }
5857
5858 /*
5859 * If the offset of the string for this descriptor is before
5860 * what we believe is the current actual last descriptor,
5861 * then we need to adjust our estimate downward; this permits
5862 * the string table following the last descriptor to be out
5863 * of order relative to the descriptor list.
5864 */
5865 if (j < desc_actual) {
5866 desc_actual = j;
5867 }
5868 }
5869
5870 /*
5871 * We limit the actual number of descriptors we are willing to process
5872 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5873 * requested does not exceed this limit,
5874 */
5875 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5876 error = ENOMEM;
5877 goto out;
5878 }
5879 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5880 if (result == NULL) {
5881 error = ENOMEM;
5882 goto out;
5883 }
5884
5885 /*
5886 * Do the work by iterating over the descriptor entries we know to
5887 * at least appear to contain valid data.
5888 */
5889 error = 0;
5890 for (i = 0; i < desc_actual; i++) {
5891 /*
5892 * If the ad_name_offset is 0, then we use the previous
5893 * results to make the check; otherwise, we are looking up
5894 * a new file name.
5895 */
5896 if (input[i].ad_name_offset != 0) {
5897 /* discard old vnodes */
5898 if (vp) {
5899 vnode_put(vp);
5900 vp = NULL;
5901 }
5902 if (dvp) {
5903 vnode_put(dvp);
5904 dvp = NULL;
5905 }
5906
5907 /*
5908 * Scan forward in the descriptor list to see if we
5909 * need the parent vnode. We will need it if we are
5910 * deleting, since we must have rights to remove
5911 * entries in the parent directory, as well as the
5912 * rights to delete the object itself.
5913 */
5914 wantdelete = input[i].ad_flags & _DELETE_OK;
5915 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
5916 if (input[j].ad_flags & _DELETE_OK) {
5917 wantdelete = 1;
5918 }
5919 }
5920
5921 niopts = FOLLOW | AUDITVNPATH1;
5922
5923 /* need parent for vnode_authorize for deletion test */
5924 if (wantdelete) {
5925 niopts |= WANTPARENT;
5926 }
5927
5928 /* do the lookup */
5929 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5930 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5931 &context);
5932 error = namei(&nd);
5933 if (!error) {
5934 vp = nd.ni_vp;
5935 if (wantdelete) {
5936 dvp = nd.ni_dvp;
5937 }
5938 }
5939 nameidone(&nd);
5940 }
5941
5942 /*
5943 * Handle lookup errors.
5944 */
5945 switch (error) {
5946 case ENOENT:
5947 case EACCES:
5948 case EPERM:
5949 case ENOTDIR:
5950 result[i] = error;
5951 break;
5952 case 0:
5953 /* run this access check */
5954 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5955 break;
5956 default:
5957 /* fatal lookup error */
5958
5959 goto out;
5960 }
5961 }
5962
5963 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5964
5965 /* copy out results */
5966 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5967
5968 out:
5969 if (input && input != stack_input) {
5970 FREE(input, M_TEMP);
5971 }
5972 if (result) {
5973 FREE(result, M_TEMP);
5974 }
5975 if (vp) {
5976 vnode_put(vp);
5977 }
5978 if (dvp) {
5979 vnode_put(dvp);
5980 }
5981 if (IS_VALID_CRED(context.vc_ucred)) {
5982 kauth_cred_unref(&context.vc_ucred);
5983 }
5984 return error;
5985 }
5986
5987
5988 /*
5989 * Returns: 0 Success
5990 * namei:EFAULT Bad address
5991 * namei:ENAMETOOLONG Filename too long
5992 * namei:ENOENT No such file or directory
5993 * namei:ELOOP Too many levels of symbolic links
5994 * namei:EBADF Bad file descriptor
5995 * namei:ENOTDIR Not a directory
5996 * namei:???
5997 * access1:
5998 */
/*
 * Common implementation of access(2) and faccessat(2): check whether the
 * file named by fd/path may be accessed with the rights in `amode'.
 *
 * Unless AT_EACCESS is set in `flag', the check is made against the
 * caller's real (not effective) identity, per POSIX access() semantics;
 * a local context holding the real credential is built for that case
 * and released on exit.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity. So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	/* ni_dvp is only held when WANTPARENT was set for the deletion test */
	vnode_put(nd.ni_vp);
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* only the real-identity path took its own credential reference */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6077
6078 int
6079 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6080 {
6081 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6082 uap->path, uap->flags, 0, UIO_USERSPACE);
6083 }
6084
6085 int
6086 faccessat(__unused proc_t p, struct faccessat_args *uap,
6087 __unused int32_t *retval)
6088 {
6089 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6090 return EINVAL;
6091 }
6092
6093 return faccessat_internal(vfs_context_current(), uap->fd,
6094 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6095 }
6096
6097 /*
6098 * Returns: 0 Success
6099 * EFAULT
6100 * copyout:EFAULT
6101 * namei:???
6102 * vn_stat:???
6103 */
6104 static int
6105 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6106 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6107 enum uio_seg segflg, int fd, int flag)
6108 {
6109 struct nameidata nd;
6110 int follow;
6111 union {
6112 struct stat sb;
6113 struct stat64 sb64;
6114 } source = {};
6115 union {
6116 struct user64_stat user64_sb;
6117 struct user32_stat user32_sb;
6118 struct user64_stat64 user64_sb64;
6119 struct user32_stat64 user32_sb64;
6120 } dest = {};
6121 caddr_t sbp;
6122 int error, my_size;
6123 kauth_filesec_t fsec;
6124 size_t xsecurity_bufsize;
6125 void * statptr;
6126 struct fileproc *fp = NULL;
6127 int needsrealdev = 0;
6128
6129 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6130 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6131 segflg, path, ctx);
6132
6133 #if NAMEDRSRCFORK
6134 int is_namedstream = 0;
6135 /* stat calls are allowed for resource forks. */
6136 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6137 #endif
6138
6139 if (flag & AT_FDONLY) {
6140 vnode_t fvp;
6141
6142 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6143 if (error) {
6144 return error;
6145 }
6146 if ((error = vnode_getwithref(fvp))) {
6147 file_drop(fd);
6148 return error;
6149 }
6150 nd.ni_vp = fvp;
6151 } else {
6152 error = nameiat(&nd, fd);
6153 if (error) {
6154 return error;
6155 }
6156 }
6157 fsec = KAUTH_FILESEC_NONE;
6158
6159 statptr = (void *)&source;
6160
6161 #if NAMEDRSRCFORK
6162 /* Grab reference on the shadow stream file vnode to
6163 * force an inactive on release which will mark it
6164 * for recycle.
6165 */
6166 if (vnode_isnamedstream(nd.ni_vp) &&
6167 (nd.ni_vp->v_parent != NULLVP) &&
6168 vnode_isshadow(nd.ni_vp)) {
6169 is_namedstream = 1;
6170 vnode_ref(nd.ni_vp);
6171 }
6172 #endif
6173
6174 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6175 if (fp && (xsecurity == USER_ADDR_NULL)) {
6176 /*
6177 * If the caller has the file open, and is not
6178 * requesting extended security information, we are
6179 * going to let them get the basic stat information.
6180 */
6181 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6182 fp->f_fglob->fg_cred);
6183 } else {
6184 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6185 isstat64, needsrealdev, ctx);
6186 }
6187
6188 #if NAMEDRSRCFORK
6189 if (is_namedstream) {
6190 vnode_rele(nd.ni_vp);
6191 }
6192 #endif
6193 vnode_put(nd.ni_vp);
6194 nameidone(&nd);
6195 if (fp) {
6196 file_drop(fd);
6197 fp = NULL;
6198 }
6199
6200 if (error) {
6201 return error;
6202 }
6203 /* Zap spare fields */
6204 if (isstat64 != 0) {
6205 source.sb64.st_lspare = 0;
6206 source.sb64.st_qspare[0] = 0LL;
6207 source.sb64.st_qspare[1] = 0LL;
6208 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6209 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6210 my_size = sizeof(dest.user64_sb64);
6211 sbp = (caddr_t)&dest.user64_sb64;
6212 } else {
6213 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6214 my_size = sizeof(dest.user32_sb64);
6215 sbp = (caddr_t)&dest.user32_sb64;
6216 }
6217 /*
6218 * Check if we raced (post lookup) against the last unlink of a file.
6219 */
6220 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6221 source.sb64.st_nlink = 1;
6222 }
6223 } else {
6224 source.sb.st_lspare = 0;
6225 source.sb.st_qspare[0] = 0LL;
6226 source.sb.st_qspare[1] = 0LL;
6227 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6228 munge_user64_stat(&source.sb, &dest.user64_sb);
6229 my_size = sizeof(dest.user64_sb);
6230 sbp = (caddr_t)&dest.user64_sb;
6231 } else {
6232 munge_user32_stat(&source.sb, &dest.user32_sb);
6233 my_size = sizeof(dest.user32_sb);
6234 sbp = (caddr_t)&dest.user32_sb;
6235 }
6236
6237 /*
6238 * Check if we raced (post lookup) against the last unlink of a file.
6239 */
6240 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6241 source.sb.st_nlink = 1;
6242 }
6243 }
6244 if ((error = copyout(sbp, ub, my_size)) != 0) {
6245 goto out;
6246 }
6247
6248 /* caller wants extended security information? */
6249 if (xsecurity != USER_ADDR_NULL) {
6250 /* did we get any? */
6251 if (fsec == KAUTH_FILESEC_NONE) {
6252 if (susize(xsecurity_size, 0) != 0) {
6253 error = EFAULT;
6254 goto out;
6255 }
6256 } else {
6257 /* find the user buffer size */
6258 xsecurity_bufsize = fusize(xsecurity_size);
6259
6260 /* copy out the actual data size */
6261 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6262 error = EFAULT;
6263 goto out;
6264 }
6265
6266 /* if the caller supplied enough room, copy out to it */
6267 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6268 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6269 }
6270 }
6271 }
6272 out:
6273 if (fsec != KAUTH_FILESEC_NONE) {
6274 kauth_filesec_free(fsec);
6275 }
6276 return error;
6277 }
6278
6279 /*
6280 * stat_extended: Get file status; with extended security (ACL).
6281 *
6282 * Parameters: p (ignored)
6283 * uap User argument descriptor (see below)
6284 * retval (ignored)
6285 *
6286 * Indirect: uap->path Path of file to get status from
6287 * uap->ub User buffer (holds file status info)
6288 * uap->xsecurity ACL to get (extended security)
6289 * uap->xsecurity_size Size of ACL
6290 *
6291 * Returns: 0 Success
6292 * !0 errno value
6293 *
6294 */
6295 int
6296 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6297 __unused int32_t *retval)
6298 {
6299 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6300 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6301 0);
6302 }
6303
6304 /*
6305 * Returns: 0 Success
6306 * fstatat_internal:??? [see fstatat_internal() in this file]
6307 */
6308 int
6309 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6310 {
6311 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6312 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6313 }
6314
6315 int
6316 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6317 {
6318 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6319 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6320 }
6321
6322 /*
6323 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6324 *
6325 * Parameters: p (ignored)
6326 * uap User argument descriptor (see below)
6327 * retval (ignored)
6328 *
6329 * Indirect: uap->path Path of file to get status from
6330 * uap->ub User buffer (holds file status info)
6331 * uap->xsecurity ACL to get (extended security)
6332 * uap->xsecurity_size Size of ACL
6333 *
6334 * Returns: 0 Success
6335 * !0 errno value
6336 *
6337 */
6338 int
6339 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6340 {
6341 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6342 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6343 0);
6344 }
6345
6346 /*
6347 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6348 *
6349 * Parameters: p (ignored)
6350 * uap User argument descriptor (see below)
6351 * retval (ignored)
6352 *
6353 * Indirect: uap->path Path of file to get status from
6354 * uap->ub User buffer (holds file status info)
6355 * uap->xsecurity ACL to get (extended security)
6356 * uap->xsecurity_size Size of ACL
6357 *
6358 * Returns: 0 Success
6359 * !0 errno value
6360 *
6361 */
6362 int
6363 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6364 {
6365 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6366 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6367 AT_SYMLINK_NOFOLLOW);
6368 }
6369
6370 /*
6371 * Get file status; this version does not follow links.
6372 */
6373 int
6374 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6375 {
6376 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6377 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6378 }
6379
6380 int
6381 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6382 {
6383 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6384 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6385 }
6386
6387 /*
6388 * lstat64_extended: Get file status; can handle large inode numbers; does not
6389 * follow links; with extended security (ACL).
6390 *
6391 * Parameters: p (ignored)
6392 * uap User argument descriptor (see below)
6393 * retval (ignored)
6394 *
6395 * Indirect: uap->path Path of file to get status from
6396 * uap->ub User buffer (holds file status info)
6397 * uap->xsecurity ACL to get (extended security)
6398 * uap->xsecurity_size Size of ACL
6399 *
6400 * Returns: 0 Success
6401 * !0 errno value
6402 *
6403 */
6404 int
6405 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6406 {
6407 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6408 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6409 AT_SYMLINK_NOFOLLOW);
6410 }
6411
6412 int
6413 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6414 {
6415 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6416 return EINVAL;
6417 }
6418
6419 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6420 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6421 }
6422
6423 int
6424 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6425 __unused int32_t *retval)
6426 {
6427 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6428 return EINVAL;
6429 }
6430
6431 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6432 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6433 }
6434
6435 /*
6436 * Get configurable pathname variables.
6437 *
6438 * Returns: 0 Success
6439 * namei:???
6440 * vn_pathconf:???
6441 *
6442 * Notes: Global implementation constants are intended to be
6443 * implemented in this function directly; all other constants
6444 * are per-FS implementation, and therefore must be handled in
6445 * each respective FS, instead.
6446 *
6447 * XXX We implement some things globally right now that should actually be
6448 * XXX per-FS; we will need to deal with this at some point.
6449 */
6450 /* ARGSUSED */
6451 int
6452 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6453 {
6454 int error;
6455 struct nameidata nd;
6456 vfs_context_t ctx = vfs_context_current();
6457
6458 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6459 UIO_USERSPACE, uap->path, ctx);
6460 error = namei(&nd);
6461 if (error) {
6462 return error;
6463 }
6464
6465 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6466
6467 vnode_put(nd.ni_vp);
6468 nameidone(&nd);
6469 return error;
6470 }
6471
6472 /*
6473 * Return target name of a symbolic link.
6474 */
6475 /* ARGSUSED */
/*
 * Common implementation of readlink(2)/readlinkat(2): read the target
 * of the symbolic link named by fd/path into the caller's buffer.
 *
 * On return *retval holds the number of bytes placed in the buffer
 * (the result is not NUL-terminated); EINVAL is returned if the
 * looked-up vnode is not a symlink.
 */
static int
readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	char uio_buf[UIO_SIZEOF(1)];

	/* never follow the final symlink — we want the link itself */
	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
	    seg, path, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		if (error == 0) {
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		}
		if (error == 0) {
			error = VNOP_READLINK(vp, auio, ctx);
		}
	}
	vnode_put(vp);

	/* bytes transferred = requested size minus what remains in the uio */
	*retval = bufsize - (int)uio_resid(auio);
	return error;
}
6520
6521 int
6522 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6523 {
6524 enum uio_seg procseg;
6525
6526 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6527 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6528 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6529 uap->count, procseg, retval);
6530 }
6531
6532 int
6533 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6534 {
6535 enum uio_seg procseg;
6536
6537 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6538 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6539 procseg, uap->buf, uap->bufsize, procseg, retval);
6540 }
6541
6542 /*
6543 * Change file flags, the deep inner layer.
6544 */
6545 static int
6546 chflags0(vnode_t vp, struct vnode_attr *va,
6547 int (*setattr)(vnode_t, void *, vfs_context_t),
6548 void *arg, vfs_context_t ctx)
6549 {
6550 kauth_action_t action = 0;
6551 int error;
6552
6553 #if CONFIG_MACF
6554 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
6555 if (error) {
6556 goto out;
6557 }
6558 #endif
6559
6560 /* request authorisation, disregard immutability */
6561 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
6562 goto out;
6563 }
6564 /*
6565 * Request that the auth layer disregard those file flags it's allowed to when
6566 * authorizing this operation; we need to do this in order to be able to
6567 * clear immutable flags.
6568 */
6569 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
6570 goto out;
6571 }
6572 error = (*setattr)(vp, arg, ctx);
6573
6574 #if CONFIG_MACF
6575 if (error == 0) {
6576 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
6577 }
6578 #endif
6579
6580 out:
6581 return error;
6582 }
6583
6584 /*
6585 * Change file flags.
6586 *
6587 * NOTE: this will vnode_put() `vp'
6588 */
6589 static int
6590 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6591 {
6592 struct vnode_attr va;
6593 int error;
6594
6595 VATTR_INIT(&va);
6596 VATTR_SET(&va, va_flags, flags);
6597
6598 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6599 vnode_put(vp);
6600
6601 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6602 error = ENOTSUP;
6603 }
6604
6605 return error;
6606 }
6607
6608 /*
6609 * Change flags of a file given a path name.
6610 */
6611 /* ARGSUSED */
6612 int
6613 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6614 {
6615 vnode_t vp;
6616 vfs_context_t ctx = vfs_context_current();
6617 int error;
6618 struct nameidata nd;
6619
6620 AUDIT_ARG(fflags, uap->flags);
6621 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6622 UIO_USERSPACE, uap->path, ctx);
6623 error = namei(&nd);
6624 if (error) {
6625 return error;
6626 }
6627 vp = nd.ni_vp;
6628 nameidone(&nd);
6629
6630 /* we don't vnode_put() here because chflags1 does internally */
6631 error = chflags1(vp, uap->flags, ctx);
6632
6633 return error;
6634 }
6635
6636 /*
6637 * Change flags of a file given a file descriptor.
6638 */
6639 /* ARGSUSED */
6640 int
6641 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6642 {
6643 vnode_t vp;
6644 int error;
6645
6646 AUDIT_ARG(fd, uap->fd);
6647 AUDIT_ARG(fflags, uap->flags);
6648 if ((error = file_vnode(uap->fd, &vp))) {
6649 return error;
6650 }
6651
6652 if ((error = vnode_getwithref(vp))) {
6653 file_drop(uap->fd);
6654 return error;
6655 }
6656
6657 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6658
6659 /* we don't vnode_put() here because chflags1 does internally */
6660 error = chflags1(vp, uap->flags, vfs_context_current());
6661
6662 file_drop(uap->fd);
6663 return error;
6664 }
6665
6666 /*
6667 * Change security information on a filesystem object.
6668 *
6669 * Returns: 0 Success
6670 * EPERM Operation not permitted
6671 * vnode_authattr:??? [anything vnode_authattr can return]
6672 * vnode_authorize:??? [anything vnode_authorize can return]
6673 * vnode_setattr:??? [anything vnode_setattr can return]
6674 *
6675 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6676 * translated to EPERM before being returned.
6677 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC policy checks: mode, ownership and ACL are vetted separately */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* per the function contract, auth failures surface as EPERM */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC modules of the changes that actually took place */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
6745
6746
6747 /*
6748 * Change mode of a file given a path name.
6749 *
6750 * Returns: 0 Success
6751 * namei:??? [anything namei can return]
6752 * chmod_vnode:??? [anything chmod_vnode can return]
6753 */
6754 static int
6755 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6756 int fd, int flag, enum uio_seg segflg)
6757 {
6758 struct nameidata nd;
6759 int follow, error;
6760
6761 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6762 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6763 segflg, path, ctx);
6764 if ((error = nameiat(&nd, fd))) {
6765 return error;
6766 }
6767 error = chmod_vnode(ctx, nd.ni_vp, vap);
6768 vnode_put(nd.ni_vp);
6769 nameidone(&nd);
6770 return error;
6771 }
6772
6773 /*
6774 * chmod_extended: Change the mode of a file given a path name; with extended
6775 * argument list (including extended security (ACL)).
6776 *
6777 * Parameters: p Process requesting the open
6778 * uap User argument descriptor (see below)
6779 * retval (ignored)
6780 *
6781 * Indirect: uap->path Path to object (same as 'chmod')
6782 * uap->uid UID to set
6783 * uap->gid GID to set
6784 * uap->mode File mode to set (same as 'chmod')
6785 * uap->xsecurity ACL to set (or delete)
6786 *
6787 * Returns: 0 Success
6788 * !0 errno value
6789 *
6790 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6791 *
6792 * XXX: We should enummerate the possible errno values here, and where
6793 * in the code they originated.
6794 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* only set the attributes the caller actually supplied */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		/* copy the caller's ACL in from user space */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	/* release the copied-in filesec, if any */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
6840
6841 /*
6842 * Returns: 0 Success
6843 * chmodat:??? [anything chmodat can return]
6844 */
6845 static int
6846 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6847 int flag, enum uio_seg segflg)
6848 {
6849 struct vnode_attr va;
6850
6851 VATTR_INIT(&va);
6852 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6853
6854 return chmodat(ctx, path, &va, fd, flag, segflg);
6855 }
6856
6857 int
6858 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6859 {
6860 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6861 AT_FDCWD, 0, UIO_USERSPACE);
6862 }
6863
6864 int
6865 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6866 {
6867 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6868 return EINVAL;
6869 }
6870
6871 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6872 uap->fd, uap->flag, UIO_USERSPACE);
6873 }
6874
6875 /*
6876 * Change mode of a file given a file descriptor.
6877 */
6878 static int
6879 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6880 {
6881 vnode_t vp;
6882 int error;
6883
6884 AUDIT_ARG(fd, fd);
6885
6886 if ((error = file_vnode(fd, &vp)) != 0) {
6887 return error;
6888 }
6889 if ((error = vnode_getwithref(vp)) != 0) {
6890 file_drop(fd);
6891 return error;
6892 }
6893 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6894
6895 error = chmod_vnode(vfs_context_current(), vp, vap);
6896 (void)vnode_put(vp);
6897 file_drop(fd);
6898
6899 return error;
6900 }
6901
6902 /*
6903 * fchmod_extended: Change mode of a file given a file descriptor; with
6904 * extended argument list (including extended security (ACL)).
6905 *
6906 * Parameters: p Process requesting to change file mode
6907 * uap User argument descriptor (see below)
6908 * retval (ignored)
6909 *
6910 * Indirect: uap->mode File mode to set (same as 'chmod')
6911 * uap->uid UID to set
6912 * uap->gid GID to set
6913 * uap->xsecurity ACL to set (or delete)
6914 * uap->fd File descriptor of file to change mode
6915 *
6916 * Returns: 0 Success
6917 * !0 errno value
6918 *
6919 */
int
fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* only set the attributes the caller actually supplied */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* NOTE: unlike chmod_extended(), a NULL pointer here also removes the ACL */
	case USER_ADDR_NULL:
		VATTR_SET(&va, va_acl, NULL);
		break;
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		/* copy the caller's ACL in from user space */
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	error = fchmod1(p, uap->fd, &va);

	/* free the filesec only in the cases that copied one in above */
	switch (uap->xsecurity) {
	case USER_ADDR_NULL:
	case CAST_USER_ADDR_T(-1):
		break;
	default:
		if (xsecdst != NULL) {
			kauth_filesec_free(xsecdst);
		}
	}
	return error;
}
6972
6973 int
6974 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6975 {
6976 struct vnode_attr va;
6977
6978 VATTR_INIT(&va);
6979 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6980
6981 return fchmod1(p, uap->fd, &va);
6982 }
6983
6984
6985 /*
6986 * Set ownership given a path name.
6987 */
6988 /* ARGSUSED */
/*
 * Common implementation of chown(2)/lchown(2)/fchownat(2): set the
 * ownership of the file named by fd/path.  A uid/gid of VNOVAL leaves
 * that id unchanged; AT_SYMLINK_NOFOLLOW in `flag' operates on the
 * symlink itself.  Permission failures surface as EPERM, not EACCES.
 */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* VNOVAL in either id means "leave that id unchanged" */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	vnode_put(vp);
	return error;
}
7055
7056 int
7057 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7058 {
7059 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7060 uap->uid, uap->gid, 0, UIO_USERSPACE);
7061 }
7062
7063 int
7064 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7065 {
7066 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7067 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7068 }
7069
7070 int
7071 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7072 {
7073 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7074 return EINVAL;
7075 }
7076
7077 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7078 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7079 }
7080
/*
 * Set ownership given a file descriptor.
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* file_vnode() takes a use count on the fd; dropped via file_drop() */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* VNOVAL means "leave this id unchanged" */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* authorization failure on chown is EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7153
/*
 * Fetch the times for the *utimes() family.
 *
 * Copies a two-element struct timeval array from user address `usrtvp`
 * into tsp[0] (access time) and tsp[1] (modification time), using the
 * 32- or 64-bit user layout as appropriate for the current process.
 * A NULL user pointer means "use the current time for both".
 */
static int
getutimes(user_addr_t usrtvp, struct timespec *tsp)
{
	int error;

	if (usrtvp == USER_ADDR_NULL) {
		struct timeval old_tv;
		/* XXX Y2038 bug because of microtime argument */
		microtime(&old_tv);
		TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
		tsp[1] = tsp[0];
	} else {
		if (IS_64BIT_PROCESS(current_proc())) {
			/* 64-bit user process: timevals are user64 layout */
			struct user64_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		} else {
			/* 32-bit user process: timevals are user32 layout */
			struct user32_timeval tv[2];
			error = copyin(usrtvp, (void *)tv, sizeof(tv));
			if (error) {
				return error;
			}
			TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
			TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
		}
	}
	return 0;
}
7186
/*
 * Common backend for utimes()/futimes(): apply ts[0] as the access time
 * and ts[1] as the modification time of vp.  `nullflag` is set when the
 * caller passed a NULL times pointer; it sets VA_UTIMES_NULL and
 * suppresses the EACCES->EPERM translation on authorization failure.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	/* explicit times report permission failures as EPERM */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7243
/*
 * Set the access and modification times of a file, by path.
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

	/* a NULL tptr selects VA_UTIMES_NULL handling in setutimes() */
	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* namei() gave us an iocount on ni_vp; release it on all paths */
	vnode_put(nd.ni_vp);
	return error;
}
7284
/*
 * Set the access and modification times of a file, by descriptor.
 */
/* ARGSUSED */
int
futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	vnode_t vp;
	user_addr_t usrtvp;
	int error;

	AUDIT_ARG(fd, uap->fd);
	/* NULL tptr means "use the current time" (handled by getutimes) */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		return error;
	}
	if ((error = file_vnode(uap->fd, &vp)) != 0) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
	vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7315
/*
 * Truncate a file given its path name.
 */
/* ARGSUSED */
int
truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;

	/* negative lengths are rejected up front */
	if (uap->length < 0) {
		return EINVAL;
	}
	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* truncation is expressed as a data-size attribute change */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize the size change */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	vnode_put(vp);
	return error;
}
7370
/*
 * Truncate a file given a file descriptor.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;

	AUDIT_ARG(fd, uap->fd);
	/* negative lengths are rejected up front */
	if (uap->length < 0) {
		return EINVAL;
	}

	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->f_fglob)) {
	case DTYPE_PSXSHM:
		/* POSIX shared memory objects have their own truncate path */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp->f_fglob->fg_data;

	/* the descriptor must have been opened for writing */
	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->f_fglob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	/* truncation is expressed as a data-size attribute change */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7442
7443
7444 /*
7445 * Sync an open file with synchronized I/O _file_ integrity completion
7446 */
7447 /* ARGSUSED */
7448 int
7449 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7450 {
7451 __pthread_testcancel(1);
7452 return fsync_common(p, uap, MNT_WAIT);
7453 }
7454
7455
7456 /*
7457 * Sync an open file with synchronized I/O _file_ integrity completion
7458 *
7459 * Notes: This is a legacy support function that does not test for
7460 * thread cancellation points.
7461 */
7462 /* ARGSUSED */
7463 int
7464 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7465 {
7466 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7467 }
7468
7469
7470 /*
7471 * Sync an open file with synchronized I/O _data_ integrity completion
7472 */
7473 /* ARGSUSED */
7474 int
7475 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7476 {
7477 __pthread_testcancel(1);
7478 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7479 }
7480
7481
/*
 * fsync_common
 *
 * Common fsync code to support both synchronized I/O file integrity completion
 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
 *
 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
 * includes additional metadata unnecessary for retrieving the file data
 * contents, such as atime, mtime, ctime, etc., also be committed to stable
 * storage.
 *
 * Parameters:	p			The process
 *		uap->fd			The descriptor to synchronize
 *		flags			The data integrity flags
 *
 * Returns:	int			Success
 *	fp_getfvp:EBADF			Bad file descriptor
 *	fp_getfvp:ENOTSUP		fd does not refer to a vnode
 *	VNOP_FSYNC:???			unspecified
 *
 * Notes:	We use struct fsync_args because it is a short name, and all
 *		caller argument structures are otherwise identical.
 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->f_flags & FP_WRITTEN)) {
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7544
/*
 * Duplicate files. Source must be a file, target must be a file or
 * must not exist.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 * perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();
#if CONFIG_MACF
	struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
	struct vnode_attr va;
#endif

	/* Check that the flags are valid. */

	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	/* an existing target may only be replaced with CPF_OVERWRITE */
	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* This calls existing MAC hooks for open */
	if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
	    NULL))) {
		goto out;
	}

	if (tvp) {
		/*
		 * See unlinkat_internal for an explanation of the potential
		 * ENOENT from the MAC hook but the gist is that the MAC hook
		 * can fail because vn_getpath isn't able to return the full
		 * path. We choose to ignore this failure.
		 */
		error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
		if (error && error != ENOENT) {
			goto out;
		}
		error = 0;
	}

#if CONFIG_MACF
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, fvp->v_type);
	/* Mask off all but regular access permissions */
	VATTR_SET(&va, va_mode,
	    ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
	error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif /* CONFIG_MACF */

	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* copying a file on top of its own parent directory is nonsense */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		error = -1;
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* -1 is the internal "same file, nothing to do" marker: report success */
	if (error == -1) {
		return 0;
	}
	return error;
}
7672
7673 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7674
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 *
 * `data_read_authorised` is TRUE when the caller has already verified
 * read access to the source data (e.g. via an open readable fd), in
 * which case the KAUTH_VNODE_READ_DATA check is skipped here.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* only regular files, symlinks and plain directories may be cloned */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* volume roots and mount points may not be cloned */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* the destination must not already exist */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* cloning only works within a single filesystem */
	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* skip the READ_DATA check when the caller already authorised it */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	VATTR_WANTED(&va, va_acl);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	/* the returned ACL (if any) must be freed on the way out */
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 */
		if (!VATTR_ALL_SUPPORTED(&va)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
7899
/*
 * clone files or directories, target must not exist.
 */
/* ARGSUSED */
int
clonefileat(__unused proc_t p, struct clonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct nameidata fromnd;
	int follow;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_dirfd);

	/* CLONE_NOFOLLOW: clone a trailing symlink itself, not its target */
	follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
	    UIO_USERSPACE, uap->src, ctx);
	if ((error = nameiat(&fromnd, uap->src_dirfd))) {
		return error;
	}

	fvp = fromnd.ni_vp;
	nameidone(&fromnd);

	/* FALSE: read access to the source has not been pre-authorised */
	error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
	return error;
}
7937
/*
 * Clone the file referenced by an open readable descriptor to a new
 * path relative to dst_dirfd.
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	/* the source descriptor must have been opened for reading */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* TRUE: data reads were authorised via the FREAD check above */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
7978
/*
 * Mount-iteration callback: for every mount whose recorded mount-on path
 * lies strictly beneath that of the mount passed in `arg`, refresh its
 * f_mntonname from its covered vnode.  NOTE(review): appears to be used
 * from the mount-point rename path so submount paths track the new
 * location -- confirm against the caller in renameat_internal.
 */
static int
rename_submounts_callback(mount_t mp, void *arg)
{
	int error = 0;
	mount_t pmp = (mount_t)arg;
	int prefix_len = strlen(pmp->mnt_vfsstat.f_mntonname);

	/* only consider mounts whose path starts with pmp's mount-on path */
	if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
		return 0;
	}

	/* require a '/' after the prefix, i.e. a true submount (and not pmp itself) */
	if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
		return 0;
	}

	/* don't block; a busy submount aborts the iteration with -1 */
	if ((error = vfs_busy(mp, LK_NOWAIT))) {
		printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
		return -1;
	}

	/* rewrite f_mntonname in place from the covered vnode's current path */
	int pathlen = MAXPATHLEN;
	if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
		printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
	}

	vfs_unbusy(mp);

	return error;
}
8008
8009 /*
8010 * Rename files. Source and destination must either both be directories,
8011 * or both not be directories. If target is a directory, it must be empty.
8012 */
8013 /* ARGSUSED */
8014 static int
8015 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8016 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
8017 {
8018 if (flags & ~VFS_RENAME_FLAGS_MASK) {
8019 return EINVAL;
8020 }
8021
8022 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
8023 return EINVAL;
8024 }
8025
8026 vnode_t tvp, tdvp;
8027 vnode_t fvp, fdvp;
8028 struct nameidata *fromnd, *tond;
8029 int error;
8030 int do_retry;
8031 int retry_count;
8032 int mntrename;
8033 int need_event;
8034 int need_kpath2;
8035 int has_listeners;
8036 const char *oname = NULL;
8037 char *from_name = NULL, *to_name = NULL;
8038 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8039 int from_len = 0, to_len = 0;
8040 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8041 int holding_mntlock;
8042 mount_t locked_mp = NULL;
8043 vnode_t oparent = NULLVP;
8044 #if CONFIG_FSE
8045 fse_info from_finfo, to_finfo;
8046 #endif
8047 int from_truncated = 0, to_truncated = 0;
8048 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8049 int batched = 0;
8050 struct vnode_attr *fvap, *tvap;
8051 int continuing = 0;
8052 /* carving out a chunk for structs that are too big to be on stack. */
8053 struct {
8054 struct nameidata from_node, to_node;
8055 struct vnode_attr fv_attr, tv_attr;
8056 } * __rename_data;
8057 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
8058 fromnd = &__rename_data->from_node;
8059 tond = &__rename_data->to_node;
8060
8061 holding_mntlock = 0;
8062 do_retry = 0;
8063 retry_count = 0;
8064 retry:
8065 fvp = tvp = NULL;
8066 fdvp = tdvp = NULL;
8067 fvap = tvap = NULL;
8068 mntrename = FALSE;
8069
8070 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8071 segflg, from, ctx);
8072 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8073
8074 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8075 segflg, to, ctx);
8076 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8077
8078 continue_lookup:
8079 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8080 if ((error = nameiat(fromnd, fromfd))) {
8081 goto out1;
8082 }
8083 fdvp = fromnd->ni_dvp;
8084 fvp = fromnd->ni_vp;
8085
8086 if (fvp && fvp->v_type == VDIR) {
8087 tond->ni_cnd.cn_flags |= WILLBEDIR;
8088 }
8089 }
8090
8091 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8092 if ((error = nameiat(tond, tofd))) {
8093 /*
8094 * Translate error code for rename("dir1", "dir2/.").
8095 */
8096 if (error == EISDIR && fvp->v_type == VDIR) {
8097 error = EINVAL;
8098 }
8099 goto out1;
8100 }
8101 tdvp = tond->ni_dvp;
8102 tvp = tond->ni_vp;
8103 }
8104
8105 #if DEVELOPMENT || DEBUG
8106 /*
8107 * XXX VSWAP: Check for entitlements or special flag here
8108 * so we can restrict access appropriately.
8109 */
8110 #else /* DEVELOPMENT || DEBUG */
8111
8112 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8113 error = EPERM;
8114 goto out1;
8115 }
8116
8117 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8118 error = EPERM;
8119 goto out1;
8120 }
8121 #endif /* DEVELOPMENT || DEBUG */
8122
8123 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8124 error = ENOENT;
8125 goto out1;
8126 }
8127
8128 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8129 error = EEXIST;
8130 goto out1;
8131 }
8132
8133 batched = vnode_compound_rename_available(fdvp);
8134
8135 #if CONFIG_FSE
8136 need_event = need_fsevent(FSE_RENAME, fdvp);
8137 if (need_event) {
8138 if (fvp) {
8139 get_fse_info(fvp, &from_finfo, ctx);
8140 } else {
8141 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8142 if (error) {
8143 goto out1;
8144 }
8145
8146 fvap = &__rename_data->fv_attr;
8147 }
8148
8149 if (tvp) {
8150 get_fse_info(tvp, &to_finfo, ctx);
8151 } else if (batched) {
8152 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8153 if (error) {
8154 goto out1;
8155 }
8156
8157 tvap = &__rename_data->tv_attr;
8158 }
8159 }
8160 #else
8161 need_event = 0;
8162 #endif /* CONFIG_FSE */
8163
8164 has_listeners = kauth_authorize_fileop_has_listeners();
8165
8166 need_kpath2 = 0;
8167 #if CONFIG_AUDIT
8168 if (AUDIT_RECORD_EXISTS()) {
8169 need_kpath2 = 1;
8170 }
8171 #endif
8172
8173 if (need_event || has_listeners) {
8174 if (from_name == NULL) {
8175 GET_PATH(from_name);
8176 if (from_name == NULL) {
8177 error = ENOMEM;
8178 goto out1;
8179 }
8180 }
8181
8182 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8183
8184 if (from_name_no_firmlink == NULL) {
8185 GET_PATH(from_name_no_firmlink);
8186 if (from_name_no_firmlink == NULL) {
8187 error = ENOMEM;
8188 goto out1;
8189 }
8190 }
8191
8192 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8193 }
8194
8195 if (need_event || need_kpath2 || has_listeners) {
8196 if (to_name == NULL) {
8197 GET_PATH(to_name);
8198 if (to_name == NULL) {
8199 error = ENOMEM;
8200 goto out1;
8201 }
8202 }
8203
8204 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8205
8206 if (to_name_no_firmlink == NULL) {
8207 GET_PATH(to_name_no_firmlink);
8208 if (to_name_no_firmlink == NULL) {
8209 error = ENOMEM;
8210 goto out1;
8211 }
8212 }
8213
8214 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8215 if (to_name && need_kpath2) {
8216 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8217 }
8218 }
8219 if (!fvp) {
8220 /*
8221 * Claim: this check will never reject a valid rename.
8222 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8223 * Suppose fdvp and tdvp are not on the same mount.
8224 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8225 * then you can't move it to within another dir on the same mountpoint.
8226 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8227 *
8228 * If this check passes, then we are safe to pass these vnodes to the same FS.
8229 */
8230 if (fdvp->v_mount != tdvp->v_mount) {
8231 error = EXDEV;
8232 goto out1;
8233 }
8234 goto skipped_lookup;
8235 }
8236
8237 if (!batched) {
8238 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
8239 if (error) {
8240 if (error == ENOENT) {
8241 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8242 /*
8243 * We encountered a race where after doing the namei, tvp stops
8244 * being valid. If so, simply re-drive the rename call from the
8245 * top.
8246 */
8247 do_retry = 1;
8248 retry_count += 1;
8249 }
8250 }
8251 goto out1;
8252 }
8253 }
8254
8255 /*
8256 * If the source and destination are the same (i.e. they're
8257 * links to the same vnode) and the target file system is
8258 * case sensitive, then there is nothing to do.
8259 *
8260 * XXX Come back to this.
8261 */
8262 if (fvp == tvp) {
8263 int pathconf_val;
8264
8265 /*
8266 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8267 * then assume that this file system is case sensitive.
8268 */
8269 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8270 pathconf_val != 0) {
8271 goto out1;
8272 }
8273 }
8274
8275 /*
8276 * Allow the renaming of mount points.
8277 * - target must not exist
8278 * - target must reside in the same directory as source
8279 * - union mounts cannot be renamed
8280 * - "/" cannot be renamed
8281 *
8282 * XXX Handle this in VFS after a continued lookup (if we missed
8283 * in the cache to start off)
8284 *
8285 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8286 * we'll skip past here. The file system is responsible for
8287 * checking that @tvp is not a descendent of @fvp and vice versa
8288 * so it should always return EINVAL if either @tvp or @fvp is the
8289 * root of a volume.
8290 */
8291 if ((fvp->v_flag & VROOT) &&
8292 (fvp->v_type == VDIR) &&
8293 (tvp == NULL) &&
8294 (fvp->v_mountedhere == NULL) &&
8295 (fdvp == tdvp) &&
8296 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8297 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8298 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8299 vnode_t coveredvp;
8300
8301 /* switch fvp to the covered vnode */
8302 coveredvp = fvp->v_mount->mnt_vnodecovered;
8303 if ((vnode_getwithref(coveredvp))) {
8304 error = ENOENT;
8305 goto out1;
8306 }
8307 vnode_put(fvp);
8308
8309 fvp = coveredvp;
8310 mntrename = TRUE;
8311 }
8312 /*
8313 * Check for cross-device rename.
8314 */
8315 if ((fvp->v_mount != tdvp->v_mount) ||
8316 (tvp && (fvp->v_mount != tvp->v_mount))) {
8317 error = EXDEV;
8318 goto out1;
8319 }
8320
8321 /*
8322 * If source is the same as the destination (that is the
8323 * same inode number) then there is nothing to do...
8324 * EXCEPT if the underlying file system supports case
8325 * insensitivity and is case preserving. In this case
8326 * the file system needs to handle the special case of
8327 * getting the same vnode as target (fvp) and source (tvp).
8328 *
8329 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8330 * and _PC_CASE_PRESERVING can have this exception, and they need to
8331 * handle the special case of getting the same vnode as target and
8332 * source. NOTE: Then the target is unlocked going into vnop_rename,
8333 * so not to cause locking problems. There is a single reference on tvp.
8334 *
8335 * NOTE - that fvp == tvp also occurs if they are hard linked and
8336 * that correct behaviour then is just to return success without doing
8337 * anything.
8338 *
8339 * XXX filesystem should take care of this itself, perhaps...
8340 */
8341 if (fvp == tvp && fdvp == tdvp) {
8342 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8343 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8344 fromnd->ni_cnd.cn_namelen)) {
8345 goto out1;
8346 }
8347 }
8348
8349 if (holding_mntlock && fvp->v_mount != locked_mp) {
8350 /*
8351 * we're holding a reference and lock
8352 * on locked_mp, but it no longer matches
8353 * what we want to do... so drop our hold
8354 */
8355 mount_unlock_renames(locked_mp);
8356 mount_drop(locked_mp, 0);
8357 holding_mntlock = 0;
8358 }
8359 if (tdvp != fdvp && fvp->v_type == VDIR) {
8360 /*
8361 * serialize renames that re-shape
8362 * the tree... if holding_mntlock is
8363 * set, then we're ready to go...
8364 * otherwise we
8365 * first need to drop the iocounts
8366 * we picked up, second take the
8367 * lock to serialize the access,
8368 * then finally start the lookup
8369 * process over with the lock held
8370 */
8371 if (!holding_mntlock) {
8372 /*
8373 * need to grab a reference on
8374 * the mount point before we
8375 * drop all the iocounts... once
8376 * the iocounts are gone, the mount
8377 * could follow
8378 */
8379 locked_mp = fvp->v_mount;
8380 mount_ref(locked_mp, 0);
8381
8382 /*
8383 * nameidone has to happen before we vnode_put(tvp)
8384 * since it may need to release the fs_nodelock on the tvp
8385 */
8386 nameidone(tond);
8387
8388 if (tvp) {
8389 vnode_put(tvp);
8390 }
8391 vnode_put(tdvp);
8392
8393 /*
8394 * nameidone has to happen before we vnode_put(fdvp)
8395 * since it may need to release the fs_nodelock on the fvp
8396 */
8397 nameidone(fromnd);
8398
8399 vnode_put(fvp);
8400 vnode_put(fdvp);
8401
8402 mount_lock_renames(locked_mp);
8403 holding_mntlock = 1;
8404
8405 goto retry;
8406 }
8407 } else {
8408 /*
8409 * when we dropped the iocounts to take
8410 * the lock, we allowed the identity of
8411 * the various vnodes to change... if they did,
8412 * we may no longer be dealing with a rename
8413 * that reshapes the tree... once we're holding
8414 * the iocounts, the vnodes can't change type
8415 * so we're free to drop the lock at this point
8416 * and continue on
8417 */
8418 if (holding_mntlock) {
8419 mount_unlock_renames(locked_mp);
8420 mount_drop(locked_mp, 0);
8421 holding_mntlock = 0;
8422 }
8423 }
8424
8425 // save these off so we can later verify that fvp is the same
8426 oname = fvp->v_name;
8427 oparent = fvp->v_parent;
8428
8429 skipped_lookup:
8430 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8431 tdvp, &tvp, &tond->ni_cnd, tvap,
8432 flags, ctx);
8433
8434 if (holding_mntlock) {
8435 /*
8436 * we can drop our serialization
8437 * lock now
8438 */
8439 mount_unlock_renames(locked_mp);
8440 mount_drop(locked_mp, 0);
8441 holding_mntlock = 0;
8442 }
8443 if (error) {
8444 if (error == EDATALESS) {
8445 /*
8446 * If we've been here before, something has gone
8447 * horribly wrong and we should just get out lest
8448 * we spiral around the drain forever.
8449 */
8450 if (flags & VFS_RENAME_DATALESS) {
8451 error = EIO;
8452 goto out1;
8453 }
8454
8455 /*
8456 * The object we're renaming is dataless (or has a
8457 * dataless descendent) and requires materialization
8458 * before the rename occurs. But we're holding the
8459 * mount point's rename lock, so it's not safe to
8460 * make the upcall.
8461 *
8462 * In this case, we release the lock, perform the
8463 * materialization, and start the whole thing over.
8464 */
8465 error = vnode_materialize_dataless_file(fvp,
8466 NAMESPACE_HANDLER_RENAME_OP);
8467
8468 if (error == 0) {
8469 /*
8470 * The next time around we need to tell the
8471 * file system that the materializtaion has
8472 * been performed.
8473 */
8474 flags |= VFS_RENAME_DATALESS;
8475 do_retry = 1;
8476 }
8477 goto out1;
8478 }
8479 if (error == EKEEPLOOKING) {
8480 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8481 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8482 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8483 }
8484 }
8485
8486 fromnd->ni_vp = fvp;
8487 tond->ni_vp = tvp;
8488
8489 goto continue_lookup;
8490 }
8491
8492 /*
8493 * We may encounter a race in the VNOP where the destination didn't
8494 * exist when we did the namei, but it does by the time we go and
8495 * try to create the entry. In this case, we should re-drive this rename
8496 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8497 * but other filesystems susceptible to this race could return it, too.
8498 */
8499 if (error == ERECYCLE) {
8500 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
8501 do_retry = 1;
8502 retry_count += 1;
8503 } else {
8504 printf("rename retry limit due to ERECYCLE reached\n");
8505 error = ENOENT;
8506 }
8507 }
8508
8509 /*
8510 * For compound VNOPs, the authorization callback may return
8511 * ENOENT in case of racing hardlink lookups hitting the name
8512 * cache, redrive the lookup.
8513 */
8514 if (batched && error == ENOENT) {
8515 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8516 do_retry = 1;
8517 retry_count += 1;
8518 }
8519 }
8520
8521 goto out1;
8522 }
8523
8524 /* call out to allow 3rd party notification of rename.
8525 * Ignore result of kauth_authorize_fileop call.
8526 */
8527 kauth_authorize_fileop(vfs_context_ucred(ctx),
8528 KAUTH_FILEOP_RENAME,
8529 (uintptr_t)from_name, (uintptr_t)to_name);
8530 if (flags & VFS_RENAME_SWAP) {
8531 kauth_authorize_fileop(vfs_context_ucred(ctx),
8532 KAUTH_FILEOP_RENAME,
8533 (uintptr_t)to_name, (uintptr_t)from_name);
8534 }
8535
8536 #if CONFIG_FSE
8537 if (from_name != NULL && to_name != NULL) {
8538 if (from_truncated || to_truncated) {
8539 // set it here since only the from_finfo gets reported up to user space
8540 from_finfo.mode |= FSE_TRUNCATED_PATH;
8541 }
8542
8543 if (tvap && tvp) {
8544 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8545 }
8546 if (fvap) {
8547 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8548 }
8549
8550 if (tvp) {
8551 add_fsevent(FSE_RENAME, ctx,
8552 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8553 FSE_ARG_FINFO, &from_finfo,
8554 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8555 FSE_ARG_FINFO, &to_finfo,
8556 FSE_ARG_DONE);
8557 if (flags & VFS_RENAME_SWAP) {
8558 /*
8559 * Strictly speaking, swap is the equivalent of
8560 * *three* renames. FSEvents clients should only take
8561 * the events as a hint, so we only bother reporting
8562 * two.
8563 */
8564 add_fsevent(FSE_RENAME, ctx,
8565 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8566 FSE_ARG_FINFO, &to_finfo,
8567 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8568 FSE_ARG_FINFO, &from_finfo,
8569 FSE_ARG_DONE);
8570 }
8571 } else {
8572 add_fsevent(FSE_RENAME, ctx,
8573 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8574 FSE_ARG_FINFO, &from_finfo,
8575 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8576 FSE_ARG_DONE);
8577 }
8578 }
8579 #endif /* CONFIG_FSE */
8580
8581 /*
8582 * update filesystem's mount point data
8583 */
8584 if (mntrename) {
8585 char *cp, *pathend, *mpname;
8586 char * tobuf;
8587 struct mount *mp;
8588 int maxlen;
8589 size_t len = 0;
8590
8591 mp = fvp->v_mountedhere;
8592
8593 if (vfs_busy(mp, LK_NOWAIT)) {
8594 error = EBUSY;
8595 goto out1;
8596 }
8597 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
8598
8599 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8600 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8601 } else {
8602 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8603 }
8604 if (!error) {
8605 /* find current mount point prefix */
8606 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8607 for (cp = pathend; *cp != '\0'; ++cp) {
8608 if (*cp == '/') {
8609 pathend = cp + 1;
8610 }
8611 }
8612 /* find last component of target name */
8613 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8614 if (*cp == '/') {
8615 mpname = cp + 1;
8616 }
8617 }
8618
8619 /* Update f_mntonname of sub mounts */
8620 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8621
8622 /* append name to prefix */
8623 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
8624 bzero(pathend, maxlen);
8625
8626 strlcpy(pathend, mpname, maxlen);
8627 }
8628 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
8629
8630 vfs_unbusy(mp);
8631
8632 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8633 }
8634 /*
8635 * fix up name & parent pointers. note that we first
8636 * check that fvp has the same name/parent pointers it
8637 * had before the rename call... this is a 'weak' check
8638 * at best...
8639 *
8640 * XXX oparent and oname may not be set in the compound vnop case
8641 */
8642 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8643 int update_flags;
8644
8645 update_flags = VNODE_UPDATE_NAME;
8646
8647 if (fdvp != tdvp) {
8648 update_flags |= VNODE_UPDATE_PARENT;
8649 }
8650
8651 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8652 }
8653 out1:
8654 if (to_name != NULL) {
8655 RELEASE_PATH(to_name);
8656 to_name = NULL;
8657 }
8658 if (to_name_no_firmlink != NULL) {
8659 RELEASE_PATH(to_name_no_firmlink);
8660 to_name_no_firmlink = NULL;
8661 }
8662 if (from_name != NULL) {
8663 RELEASE_PATH(from_name);
8664 from_name = NULL;
8665 }
8666 if (from_name_no_firmlink != NULL) {
8667 RELEASE_PATH(from_name_no_firmlink);
8668 from_name_no_firmlink = NULL;
8669 }
8670 if (holding_mntlock) {
8671 mount_unlock_renames(locked_mp);
8672 mount_drop(locked_mp, 0);
8673 holding_mntlock = 0;
8674 }
8675 if (tdvp) {
8676 /*
8677 * nameidone has to happen before we vnode_put(tdvp)
8678 * since it may need to release the fs_nodelock on the tdvp
8679 */
8680 nameidone(tond);
8681
8682 if (tvp) {
8683 vnode_put(tvp);
8684 }
8685 vnode_put(tdvp);
8686 }
8687 if (fdvp) {
8688 /*
8689 * nameidone has to happen before we vnode_put(fdvp)
8690 * since it may need to release the fs_nodelock on the fdvp
8691 */
8692 nameidone(fromnd);
8693
8694 if (fvp) {
8695 vnode_put(fvp);
8696 }
8697 vnode_put(fdvp);
8698 }
8699
8700 /*
8701 * If things changed after we did the namei, then we will re-drive
8702 * this rename call from the top.
8703 */
8704 if (do_retry) {
8705 do_retry = 0;
8706 goto retry;
8707 }
8708
8709 FREE(__rename_data, M_TEMP);
8710 return error;
8711 }
8712
8713 int
8714 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
8715 {
8716 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
8717 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
8718 }
8719
8720 int
8721 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
8722 {
8723 return renameat_internal(
8724 vfs_context_current(),
8725 uap->fromfd, uap->from,
8726 uap->tofd, uap->to,
8727 UIO_USERSPACE, uap->flags);
8728 }
8729
8730 int
8731 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
8732 {
8733 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
8734 uap->tofd, uap->to, UIO_USERSPACE, 0);
8735 }
8736
8737 /*
8738 * Make a directory file.
8739 *
8740 * Returns: 0 Success
8741 * EEXIST
8742 * namei:???
8743 * vnode_authorize:???
8744 * vn_create:???
8745 */
8746 /* ARGSUSED */
/*
 * Create a directory at `path`, resolved relative to `fd` (or the CWD when
 * fd == AT_FDCWD), with the attributes supplied in `vap`.  Common backend
 * for mkdir(2), mkdirat(2) and mkdir_extended(2).
 *
 * Prefers the file system's compound-mkdir VNOP when available (lookup and
 * create in a single call, signalled via NAMEI_COMPOUNDMKDIR); otherwise
 * authorizes in VFS and creates via vn_create().
 *
 * Returns: 0 on success, EEXIST if the target already exists, or an errno
 * from nameiat()/vn_authorize_mkdir()/vn_create().
 */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	/* Hint to the lookup that the last component will be a directory. */
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	/* On success we hold iocounts on dvp and (if it exists) vp. */
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* "batched": the FS will do lookup + mkdir in one compound VNOP. */
	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				goto out;
			} else {
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs the lookup continued; re-drive namei. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
8858
8859 /*
8860 * mkdir_extended: Create a directory; with extended security (ACL).
8861 *
8862 * Parameters: p Process requesting to create the directory
8863 * uap User argument descriptor (see below)
8864 * retval (ignored)
8865 *
8866 * Indirect: uap->path Path of directory to create
8867 * uap->mode Access permissions to set
8868 * uap->xsecurity ACL to set
8869 *
8870 * Returns: 0 Success
8871 * !0 Not success
8872 *
8873 */
8874 int
8875 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8876 {
8877 int ciferror;
8878 kauth_filesec_t xsecdst;
8879 struct vnode_attr va;
8880
8881 AUDIT_ARG(owner, uap->uid, uap->gid);
8882
8883 xsecdst = NULL;
8884 if ((uap->xsecurity != USER_ADDR_NULL) &&
8885 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
8886 return ciferror;
8887 }
8888
8889 VATTR_INIT(&va);
8890 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8891 if (xsecdst != NULL) {
8892 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8893 }
8894
8895 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8896 UIO_USERSPACE);
8897 if (xsecdst != NULL) {
8898 kauth_filesec_free(xsecdst);
8899 }
8900 return ciferror;
8901 }
8902
8903 int
8904 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8905 {
8906 struct vnode_attr va;
8907
8908 VATTR_INIT(&va);
8909 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8910
8911 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8912 UIO_USERSPACE);
8913 }
8914
8915 int
8916 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8917 {
8918 struct vnode_attr va;
8919
8920 VATTR_INIT(&va);
8921 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8922
8923 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8924 UIO_USERSPACE);
8925 }
8926
/*
 * Remove the directory named by `dirpath`, resolved relative to `fd`
 * (AT_FDCWD for the CWD).  Common backend for rmdir(2) and for
 * unlinkat(2) with AT_REMOVEDIR.
 *
 * Handles compound-rmdir VNOPs, fsevent/kauth listener notification, the
 * VNODE_REMOVE_DATALESS_DIR fallback, and (CONFIG_APPLEDOUBLE) removal of
 * orphaned AppleDouble files followed by a retry.
 *
 * Returns: 0 on success or an errno (EBUSY for a mount root, EPERM for a
 * swap vnode outside the kernel context, ENOTEMPTY, etc.).
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;
	char *path = NULL;
	char *no_firmlink_path = NULL;
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
#if CONFIG_FSE
	struct vnode_attr va;
#endif /* CONFIG_FSE */
	struct vnode_attr *vap = NULL;
	int restart_count = 0;
	int batched;

	int restart_flag;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		nd.ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(&nd, fd);
		if (error) {
			return error;
		}

		/* On success we hold iocounts on dvp and (if found) vp. */
		dvp = nd.ni_dvp;
		vp = nd.ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Userspace may not remove a directory backing swap. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Racing lookup: redrive from the top (bounded). */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the FS must resolve it inside the compound VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo;

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Batched: ask the FS for notify attrs via vap later. */
				error = vfs_get_notify_attributes(&va);
				if (error) {
					goto out;
				}

				vap = &va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		/* Build the path strings only if someone will consume them. */
		if (need_event || has_listeners) {
			if (path == NULL) {
				GET_PATH(path);
				if (path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
				if (no_firmlink_path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		/* vn_rmdir may have updated vp (compound case); keep nd in sync. */
		nd.ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR(). So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, &nd,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(&nd);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/* Wake any other thread waiting to retry this directory. */
			wakeup_one((caddr_t)vp);
			return error;
		}
		/* Briefly yield before redriving the lookup (AppleDouble race). */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

	return error;
}
9199
9200 /*
9201 * Remove a directory file.
9202 */
9203 /* ARGSUSED */
9204 int
9205 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9206 {
9207 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9208 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9209 }
9210
/*
 * Get direntry length padded to 8 byte alignment.  struct direntry declares
 * d_name as MAXPATHLEN bytes, so the actual record size for a name of
 * `namlen` characters subtracts the unused tail.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/* Get dirent length padded to 4 byte alignment (legacy struct dirent). */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent (address of its last byte, per d_reclen). */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
9222
9223 errno_t
9224 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9225 int *numdirent, vfs_context_t ctxp)
9226 {
9227 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9228 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9229 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9230 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9231 } else {
9232 size_t bufsize;
9233 void * bufptr;
9234 uio_t auio;
9235 struct direntry *entry64;
9236 struct dirent *dep;
9237 int bytesread;
9238 int error;
9239
9240 /*
9241 * We're here because the underlying file system does not
9242 * support direnties or we mounted denying support so we must
9243 * fall back to dirents and convert them to direntries.
9244 *
9245 * Our kernel buffer needs to be smaller since re-packing will
9246 * expand each dirent. The worse case (when the name length
9247 * is 3 or less) corresponds to a struct direntry size of 32
9248 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9249 * (4-byte aligned). So having a buffer that is 3/8 the size
9250 * will prevent us from reading more than we can pack.
9251 *
9252 * Since this buffer is wired memory, we will limit the
9253 * buffer size to a maximum of 32K. We would really like to
9254 * use 32K in the MIN(), but we use magic number 87371 to
9255 * prevent uio_resid() * 3 / 8 from overflowing.
9256 */
9257 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9258 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
9259 if (bufptr == NULL) {
9260 return ENOMEM;
9261 }
9262
9263 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9264 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9265 auio->uio_offset = uio->uio_offset;
9266
9267 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9268
9269 dep = (struct dirent *)bufptr;
9270 bytesread = bufsize - uio_resid(auio);
9271
9272 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
9273 M_TEMP, M_WAITOK);
9274 /*
9275 * Convert all the entries and copy them out to user's buffer.
9276 */
9277 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9278 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9279
9280 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9281 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9282 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9283 vp->v_mount->mnt_vfsstat.f_mntonname,
9284 vp->v_name ? vp->v_name : "<unknown>");
9285 error = EIO;
9286 break;
9287 }
9288
9289 bzero(entry64, enbufsize);
9290 /* Convert a dirent to a dirent64. */
9291 entry64->d_ino = dep->d_ino;
9292 entry64->d_seekoff = 0;
9293 entry64->d_reclen = enbufsize;
9294 entry64->d_namlen = dep->d_namlen;
9295 entry64->d_type = dep->d_type;
9296 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9297
9298 /* Move to next entry. */
9299 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9300
9301 /* Copy entry64 to user's buffer. */
9302 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9303 }
9304
9305 /* Update the real offset using the offset we got from VNOP_READDIR. */
9306 if (error == 0) {
9307 uio->uio_offset = auio->uio_offset;
9308 }
9309 uio_free(auio);
9310 FREE(bufptr, M_TEMP);
9311 FREE(entry64, M_TEMP);
9312 return error;
9313 }
9314 }
9315
/* Upper bound on the user buffer size accepted by getdirentries_common(). */
#define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9317
9318 /*
9319 * Read a block of directory entries in a file system independent format.
9320 */
/*
 * Read a block of directory entries in a file system independent format.
 *
 * Shared backend for getdirentries(2) and getdirentries64(2).  Reads from
 * the directory open on `fd` into the user buffer `bufp`/`bufsize`,
 * advancing the file's fg_offset.  When `flags` includes
 * VNODE_READDIR_EXTENDED, entries are delivered as struct direntry via
 * vnode_readdir64(); otherwise as legacy struct dirent via VNOP_READDIR().
 *
 * On success, *bytesread is the number of bytes produced, *offset (if
 * non-NULL) is the file offset before the read, and *eofflag reflects
 * end-of-directory.  Handles union mounts by falling through to the
 * covered directory when the upper layer yields no entries.
 *
 * Returns: 0 on success, EBADF if the fd is not open for reading, EINVAL
 * if the vnode is not a directory, or an errno from the VNOPs/MAC checks.
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	char uio_buf[UIO_SIZEOF(1)];

	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

	/* Nothing was produced: possibly fall through a union mount layer. */
	if ((user_ssize_t)bufsize == uio_resid(auio)) {
		if (union_dircheckp) {
			error = union_dircheckp(&vp, fp, &context);
			if (error == -1) {
				/* union_dircheckp swapped vp; re-read from the lower layer. */
				goto unionread;
			}
			if (error) {
				(void)vnode_put(vp);
				goto out;
			}
		}

		if ((vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, &context) == 0) {
				/* Retarget the open file at the covered vnode. */
				vnode_ref(vp);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0;
				vnode_rele(tvp);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	file_drop(fd);
	return error;
}
9426
9427
9428 int
9429 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9430 {
9431 off_t offset;
9432 ssize_t bytesread;
9433 int error, eofflag;
9434
9435 AUDIT_ARG(fd, uap->fd);
9436 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9437 &bytesread, &offset, &eofflag, 0);
9438
9439 if (error == 0) {
9440 if (proc_is64bit(p)) {
9441 user64_long_t base = (user64_long_t)offset;
9442 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9443 } else {
9444 user32_long_t base = (user32_long_t)offset;
9445 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9446 }
9447 *retval = bytesread;
9448 }
9449 return error;
9450 }
9451
9452 int
9453 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9454 {
9455 off_t offset;
9456 ssize_t bytesread;
9457 int error, eofflag;
9458 user_size_t bufsize;
9459
9460 AUDIT_ARG(fd, uap->fd);
9461
9462 /*
9463 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9464 * then the kernel carves out the last 4 bytes to return extended
9465 * information to userspace (namely whether we reached EOF with this call).
9466 */
9467 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9468 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9469 } else {
9470 bufsize = uap->bufsize;
9471 }
9472
9473 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9474 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9475
9476 if (error == 0) {
9477 *retval = bytesread;
9478 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9479
9480 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9481 getdirentries64_flags_t flags = 0;
9482 if (eofflag) {
9483 flags |= GETDIRENTRIES64_EOF;
9484 }
9485 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9486 sizeof(flags));
9487 }
9488 }
9489 return error;
9490 }
9491
9492
9493 /*
9494 * Set the mode mask for creation of filesystem nodes.
9495 * XXX implement xsecurity
9496 */
9497 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9498 static int
9499 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9500 {
9501 struct filedesc *fdp;
9502
9503 AUDIT_ARG(mask, newmask);
9504 proc_fdlock(p);
9505 fdp = p->p_fd;
9506 *retval = fdp->fd_cmask;
9507 fdp->fd_cmask = newmask & ALLPERMS;
9508 proc_fdunlock(p);
9509 return 0;
9510 }
9511
9512 /*
9513 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9514 *
9515 * Parameters: p Process requesting to set the umask
9516 * uap User argument descriptor (see below)
9517 * retval umask of the process (parameter p)
9518 *
9519 * Indirect: uap->newmask umask to set
9520 * uap->xsecurity ACL to set
9521 *
9522 * Returns: 0 Success
9523 * !0 Not success
9524 *
9525 */
9526 int
9527 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9528 {
9529 int ciferror;
9530 kauth_filesec_t xsecdst;
9531
9532 xsecdst = KAUTH_FILESEC_NONE;
9533 if (uap->xsecurity != USER_ADDR_NULL) {
9534 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
9535 return ciferror;
9536 }
9537 } else {
9538 xsecdst = KAUTH_FILESEC_NONE;
9539 }
9540
9541 ciferror = umask1(p, uap->newmask, xsecdst, retval);
9542
9543 if (xsecdst != KAUTH_FILESEC_NONE) {
9544 kauth_filesec_free(xsecdst);
9545 }
9546 return ciferror;
9547 }
9548
9549 int
9550 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9551 {
9552 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9553 }
9554
9555 /*
9556 * Void all references to file by ripping underlying filesystem
9557 * away from vnode.
9558 */
/* ARGSUSED */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	/* Resolve the path (following symlinks) to the target vnode. */
	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character and block devices. */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* Refuse to revoke a block device that currently backs a mount. */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* Only the owner of the node or the superuser may revoke access. */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* Only revoke when someone actually holds the device open/aliased. */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
9612
9613
9614 /*
 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
9616 * The following system calls are designed to support features
9617 * which are specific to the HFS & HFS Plus volume formats
9618 */
9619
9620
9621 /*
9622 * Obtain attribute information on objects in a directory while enumerating
9623 * the directory.
9624 */
/* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	uint32_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	char uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* Remember the caller's count so it can be restored per union layer. */
	savecount = count;
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}
	/* The descriptor must have been opened for reading. */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->f_fglob);
	if (error) {
		goto out;
	}
#endif


	/* Take an iocount on the vnode for the duration of the call. */
	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	/* Only directories can be enumerated. */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY. If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				/* Re-point the open file at the lower directory. */
				vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	(void)vnode_put(vp);

	/* NOTE(review): error is always 0 at this point (checked above after
	 * VNOP_READDIRATTR); this test looks like defensive dead code. */
	if (error) {
		goto out;
	}
	fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	/* Copy the updated count, directory state, and start offset out. */
	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return error; /* return error earlier, and retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
9770
9771 /*
9772 * Exchange data between two files
9773 */
9774
/* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs both read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Paths are needed only if someone is listening: either an fsevents
	 * watcher for FSE_EXCHANGE or a kauth fileop listener.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);
		if (fpath == NULL || spath == NULL) {
			error = ENOMEM;
			goto out;
		}

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The data swapped, so swap the cached identities too:
		 * exchange names and (if different) parents under the
		 * name-cache lock.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
9930
9931 /*
9932 * Return (in MB) the amount of freespace on the given vnode's volume.
9933 */
9934 uint32_t freespace_mb(vnode_t vp);
9935
9936 uint32_t
9937 freespace_mb(vnode_t vp)
9938 {
9939 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
9940 return ((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9941 vp->v_mount->mnt_vfsstat.f_bsize) >> 20;
9942 }
9943
9944 #if CONFIG_SEARCHFS
9945
9946 /* ARGSUSED */
9947
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	int mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	char uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to do into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/* */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
	/* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
	/* assumes the size is still 556 bytes it will continue to work */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		/* NOTE(review): only searchparams1 is inspected here although the
		 * comment above mentions both buffers -- confirm whether
		 * searchparams2 also needs this validation. */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		/* Step down to the covered (lower) vnode, swapping iocounts. */
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * Alright, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (u_long)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (u_long)uap->scriptcode,
	    (u_long)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++;        // search next layer down
		fserror = EAGAIN;
	}

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state. Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	/* A filesystem error (including EAGAIN) is only reported if copyout worked. */
	error = fserror;

freeandexit:

	FREE(searchparams1, M_TEMP);

	return error;
} /* end of searchfs system call */
10226
10227 #else /* CONFIG_SEARCHFS */
10228
/* searchfs(2) stub for kernels built without CONFIG_SEARCHFS. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10234
10235 #endif /* CONFIG_SEARCHFS */
10236
10237
10238 #if CONFIG_DATALESS_FILES
10239
10240 /*
10241 * === Namespace Resolver Up-call Mechanism ===
10242 *
10243 * When I/O is performed to a dataless file or directory (read, write,
10244 * lookup-in, etc.), the file system performs an upcall to the namespace
10245 * resolver (filecoordinationd) to materialize the object.
10246 *
10247 * We need multiple up-calls to be in flight at once, and we need these
10248 * up-calls to be interruptible, thus the following implementation:
10249 *
10250 * => The nspace_resolver_request represents the in-kernel request state.
10251 * It contains a request ID, storage space for the errno code returned
10252 * by filecoordinationd, and flags.
10253 *
10254 * => The request ID is simply a global monotonically incrementing 32-bit
10255 * number. Outstanding requests are stored in a hash table, and the
10256 * hash function is extremely simple.
10257 *
10258 * => When an upcall is to be made to filecoordinationd, a request structure
10259 * is allocated on the stack (it is small, and needs to live only during
10260 * the duration of the call to resolve_nspace_item_ext()). It is
10261 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
10263 * can be inserted into the table (and thus limiting the number of
10264 * outstanding requests issued to filecoordinationd); waiting for an
10265 * available slot is interruptible.
10266 *
10267 * => Once the request has been inserted into the table, the up-call is made
10268 * to filecoordinationd via a MiG-generated stub. The up-call returns
10269 * immediately and filecoordinationd processes the request asynchronously.
10270 *
 * => The caller now waits for the request to complete.  This is achieved by
10272 * sleeping on the address of the request structure and waiting for
10273 * filecoordinationd to mark the request structure as complete. This
10274 * is an interruptible sleep call; if interrupted, the request structure
10275 * is removed from the table and EINTR is returned to the caller. If
10276 * this occurs, an advisory up-call is made to filecoordinationd with
10277 * the request ID to indicate that the request can be aborted or
10278 * de-prioritized at the discretion of filecoordinationd.
10279 *
10280 * => When filecoordinationd has completed the request, it signals completion
10281 * by writing to the vfs.nspace.complete sysctl node. Only a process
10282 * decorated as a namespace resolver can write to this sysctl node. The
10283 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10284 * The request ID is looked up in the table, and if the request is found,
10285 * the error code is stored in the request structure and a wakeup()
10286 * issued on the address of the request structure. If the request is not
10287 * found, we simply drop the completion notification, assuming that the
10288 * caller was interrupted.
10289 *
10290 * => When the waiting thread wakes up, it extracts the error code from the
10291 * request structure, removes the request from the table, and returns the
10292 * error code to the calling function. Fini!
10293 */
10294
/*
 * In-kernel state for one outstanding up-call to the namespace resolver.
 * See the block comment above for the full request life cycle.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* request-table hash chain */
	uint32_t r_req_id;      /* unique ID (see next_nspace_req_id) */
	int r_resolver_error;   /* errno reported by filecoordinationd */
	int r_flags;            /* RRF_* flags below */
};

#define RRF_COMPLETE 0x0001 /* resolver has completed this request */
10303
/*
 * Return the next namespace-resolver request ID: a global,
 * atomically incremented 32-bit counter (wraps at UINT32_MAX).
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
10311
10312 #define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
10313 #define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */
10314
10315 static LIST_HEAD(nspace_resolver_requesthead,
10316 nspace_resolver_request) * nspace_resolver_request_hashtbl;
10317 static u_long nspace_resolver_request_hashmask;
10318 static u_int nspace_resolver_request_count;
10319 static bool nspace_resolver_request_wait_slot;
10320 static lck_grp_t *nspace_resolver_request_lck_grp;
10321 static lck_mtx_t nspace_resolver_request_hash_mutex;
10322
10323 #define NSPACE_REQ_LOCK() \
10324 lck_mtx_lock(&nspace_resolver_request_hash_mutex)
10325 #define NSPACE_REQ_UNLOCK() \
10326 lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
10327
10328 #define NSPACE_RESOLVER_HASH(req_id) \
10329 (&nspace_resolver_request_hashtbl[(req_id) & \
10330 nspace_resolver_request_hashmask])
10331
10332 static struct nspace_resolver_request *
10333 nspace_resolver_req_lookup(uint32_t req_id)
10334 {
10335 struct nspace_resolver_requesthead *bucket;
10336 struct nspace_resolver_request *req;
10337
10338 bucket = NSPACE_RESOLVER_HASH(req_id);
10339 LIST_FOREACH(req, bucket, r_hashlink) {
10340 if (req->r_req_id == req_id) {
10341 return req;
10342 }
10343 }
10344
10345 return NULL;
10346 }
10347
/*
 * Insert req into the outstanding-request table, waiting (interruptibly)
 * for a free slot if the table already holds
 * NSPACE_RESOLVER_MAX_OUTSTANDING entries.  The table lock is held on
 * entry and is dropped/reacquired by msleep() while waiting.
 *
 * Returns 0 on success or the msleep() error if interrupted.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	/*
	 * Backpressure on filecoordinationd: cap the number of in-flight
	 * requests.  PCATCH makes the wait interruptible by signals.
	 */
	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
10374
/*
 * Remove req from the outstanding-request table and, if a thread is
 * waiting for a free slot in nspace_resolver_req_add(), wake it.
 * Called with the request-table lock held (see nspace_resolver_req_wait).
 */
static void
nspace_resolver_req_remove(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
#endif /* DIAGNOSTIC */
	LIST_REMOVE(req, r_hashlink);
	nspace_resolver_request_count--;

	/* Hand the freed slot to a waiter in nspace_resolver_req_add(). */
	if (nspace_resolver_request_wait_slot) {
		nspace_resolver_request_wait_slot = false;
		wakeup(&nspace_resolver_request_count);
	}
}
10392
10393 static void
10394 nspace_resolver_req_cancel(uint32_t req_id)
10395 {
10396 kern_return_t kr;
10397 mach_port_t mp;
10398
10399 // Failures here aren't fatal -- the cancellation message
10400 // sent to the resolver is merely advisory.
10401
10402 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10403 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10404 return;
10405 }
10406
10407 kr = send_nspace_resolve_cancel(mp, req_id);
10408 if (kr != KERN_SUCCESS) {
10409 os_log_error(OS_LOG_DEFAULT,
10410 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10411 }
10412
10413 ipc_port_release_send(mp);
10414 }
10415
/*
 * Wait (interruptibly) for req to be marked complete by the resolver.
 *
 * If the sleep is interrupted, the request's error is forced to
 * EINTR (or ETIMEDOUT for other non-restart errors) and an advisory
 * cancel message is sent to filecoordinationd.  The request is always
 * removed from the table before returning.
 *
 * Returns the resolver-reported errno (0 on success).
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record why and remember to cancel. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Advisory only; sent outside the lock. */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10445
10446 static void
10447 nspace_resolver_req_mark_complete(
10448 struct nspace_resolver_request *req,
10449 int resolver_error)
10450 {
10451 req->r_resolver_error = resolver_error;
10452 req->r_flags |= RRF_COMPLETE;
10453 wakeup(req);
10454 }
10455
10456 static void
10457 nspace_resolver_req_completed(uint32_t req_id, int resolver_error)
10458 {
10459 struct nspace_resolver_request *req;
10460
10461 NSPACE_REQ_LOCK();
10462
10463 // If we don't find the request corresponding to our req_id,
10464 // just drop the completion signal on the floor; it's likely
10465 // that the requester interrupted with a signal.
10466
10467 req = nspace_resolver_req_lookup(req_id);
10468 if (req) {
10469 nspace_resolver_req_mark_complete(req, resolver_error);
10470 }
10471
10472 NSPACE_REQ_UNLOCK();
10473 }
10474
10475 static struct proc *nspace_resolver_proc;
10476
10477 static int
10478 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10479 {
10480 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10481 p == nspace_resolver_proc) ? 1 : 0;
10482 return 0;
10483 }
10484
10485 static int
10486 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
10487 {
10488 vfs_context_t ctx = vfs_context_current();
10489 int error = 0;
10490
10491 //
10492 // The system filecoordinationd runs as uid == 0. This also
10493 // has the nice side-effect of filtering out filecoordinationd
10494 // running in the simulator.
10495 //
10496 if (!vfs_context_issuser(ctx)) {
10497 return EPERM;
10498 }
10499
10500 error = priv_check_cred(vfs_context_ucred(ctx),
10501 PRIV_VFS_DATALESS_RESOLVER, 0);
10502 if (error) {
10503 return error;
10504 }
10505
10506 if (is_resolver) {
10507 NSPACE_REQ_LOCK();
10508
10509 if (nspace_resolver_proc == NULL) {
10510 proc_lock(p);
10511 p->p_lflag |= P_LNSPACE_RESOLVER;
10512 proc_unlock(p);
10513 nspace_resolver_proc = p;
10514 } else {
10515 error = EBUSY;
10516 }
10517
10518 NSPACE_REQ_UNLOCK();
10519 } else {
10520 // This is basically just like the exit case.
10521 // nspace_resolver_exited() will verify that the
10522 // process is the resolver, and will clear the
10523 // global.
10524 nspace_resolver_exited(p);
10525 }
10526
10527 return error;
10528 }
10529
10530 static int
10531 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10532 {
10533 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10534 (p->p_vfs_iopolicy &
10535 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10536 *is_prevented = 1;
10537 } else {
10538 *is_prevented = 0;
10539 }
10540 return 0;
10541 }
10542
/*
 * Set the per-process dataless-materialization policy bit.  The
 * resolver process itself must always remain prevented: asking to
 * prevent it is a no-op success, asking to allow it returns EBUSY.
 * Always returns 0 for ordinary processes.
 */
static int
nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
{
	if (p->p_lflag & P_LNSPACE_RESOLVER) {
		return is_prevented ? 0 : EBUSY;
	}

	/* Atomically flip the iopolicy bit; no proc lock needed. */
	if (is_prevented) {
		OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
	} else {
		OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
	}
	return 0;
}
10557
10558 static int
10559 nspace_materialization_get_thread_state(int *is_prevented)
10560 {
10561 uthread_t ut = get_bsdthread_info(current_thread());
10562
10563 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10564 return 0;
10565 }
10566
10567 static int
10568 nspace_materialization_set_thread_state(int is_prevented)
10569 {
10570 uthread_t ut = get_bsdthread_info(current_thread());
10571
10572 if (is_prevented) {
10573 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10574 } else {
10575 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10576 }
10577 return 0;
10578 }
10579
/*
 * Decide whether the current context may materialize a dataless file.
 * Returns 0 to allow materialization, EDEADLK to deny it, or
 * EJUSTRETURN when the caller holds the dataless-manipulation
 * entitlement and should proceed as if the object were not dataless.
 * The order of the checks below is significant.
 */
static int
nspace_materialization_is_prevented(void)
{
	proc_t p = current_proc();
	uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
	vfs_context_t ctx = vfs_context_current();

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
10633
10634 /* the vfs.nspace branch */
10635 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10636
10637 static int
10638 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10639 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10640 {
10641 struct proc *p = req->p;
10642 int new_value, old_value, changed = 0;
10643 int error;
10644
10645 error = nspace_resolver_get_proc_state(p, &old_value);
10646 if (error) {
10647 return error;
10648 }
10649
10650 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10651 &changed);
10652 if (error == 0 && changed) {
10653 error = nspace_resolver_set_proc_state(p, new_value);
10654 }
10655 return error;
10656 }
10657
10658 /* decorate this process as the dataless file resolver */
10659 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10660 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10661 0, 0, sysctl_nspace_resolver, "I", "");
10662
10663 static int
10664 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10665 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10666 {
10667 struct proc *p = req->p;
10668 int new_value, old_value, changed = 0;
10669 int error;
10670
10671 error = nspace_materialization_get_proc_state(p, &old_value);
10672 if (error) {
10673 return error;
10674 }
10675
10676 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10677 &changed);
10678 if (error == 0 && changed) {
10679 error = nspace_materialization_set_proc_state(p, new_value);
10680 }
10681 return error;
10682 }
10683
10684 /* decorate this process as not wanting to materialize dataless files */
10685 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10686 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10687 0, 0, sysctl_nspace_prevent_materialization, "I", "");
10688
10689 static int
10690 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10691 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10692 {
10693 int new_value, old_value, changed = 0;
10694 int error;
10695
10696 error = nspace_materialization_get_thread_state(&old_value);
10697 if (error) {
10698 return error;
10699 }
10700
10701 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10702 &changed);
10703 if (error == 0 && changed) {
10704 error = nspace_materialization_set_thread_state(new_value);
10705 }
10706 return error;
10707 }
10708
10709 /* decorate this thread as not wanting to materialize dataless files */
10710 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10711 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10712 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
10713
10714 static int
10715 sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
10716 __unused int arg2, struct sysctl_req *req)
10717 {
10718 struct proc *p = req->p;
10719 uint32_t req_status[2] = { 0, 0 };
10720 int error, is_resolver, changed = 0;
10721
10722 error = nspace_resolver_get_proc_state(p, &is_resolver);
10723 if (error) {
10724 return error;
10725 }
10726
10727 if (!is_resolver) {
10728 return EPERM;
10729 }
10730
10731 error = sysctl_io_opaque(req, req_status, sizeof(req_status),
10732 &changed);
10733 if (error) {
10734 return error;
10735 }
10736
10737 /*
10738 * req_status[0] is the req_id
10739 *
10740 * req_status[1] is the errno
10741 */
10742 if (error == 0 && changed) {
10743 nspace_resolver_req_completed(req_status[0],
10744 (int)req_status[1]);
10745 }
10746 return error;
10747 }
10748
10749 /* Resolver reports completed reqs here. */
10750 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
10751 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10752 0, 0, sysctl_nspace_complete, "-", "");
10753
10754 #endif /* CONFIG_DATALESS_FILES */
10755
10756 #if CONFIG_DATALESS_FILES
10757 #define __no_dataless_unused /* nothing */
10758 #else
10759 #define __no_dataless_unused __unused
10760 #endif
10761
/*
 * One-time initialization of the dataless-file resolver machinery:
 * sets up the lock group, the request-hash mutex, and the pending-
 * request hash table.  Compiled to a no-op when CONFIG_DATALESS_FILES
 * is disabled.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_lck_grp =
	    lck_grp_alloc_init("file namespace resolver", NULL);

	lck_mtx_init(&nspace_resolver_request_hash_mutex,
	    nspace_resolver_request_lck_grp, NULL);

	/* M_VNODE is borrowed as the malloc tag here (see XXX). */
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
10777
10778 void
10779 nspace_resolver_exited(struct proc *p __no_dataless_unused)
10780 {
10781 #if CONFIG_DATALESS_FILES
10782 struct nspace_resolver_requesthead *bucket;
10783 struct nspace_resolver_request *req;
10784 u_long idx;
10785
10786 NSPACE_REQ_LOCK();
10787
10788 if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10789 p == nspace_resolver_proc) {
10790 for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
10791 bucket = &nspace_resolver_request_hashtbl[idx];
10792 LIST_FOREACH(req, bucket, r_hashlink) {
10793 nspace_resolver_req_mark_complete(req,
10794 ETIMEDOUT);
10795 }
10796 }
10797 nspace_resolver_proc = NULL;
10798 }
10799
10800 NSPACE_REQ_UNLOCK();
10801 #endif /* CONFIG_DATALESS_FILES */
10802 }
10803
/*
 * Resolve a namespace (dataless file) event on vp; convenience wrapper
 * around resolve_nspace_item_ext() with no extra argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
10809
10810 #define DATALESS_RESOLVER_ENTITLEMENT \
10811 "com.apple.private.vfs.dataless-resolver"
10812 #define DATALESS_MANIPULATION_ENTITLEMENT \
10813 "com.apple.private.vfs.dataless-manipulation"
10814
10815 /*
10816 * Return TRUE if the vfs context is associated with a process entitled
10817 * for dataless manipulation.
10818 *
10819 * XXX Arguably belongs in vfs_subr.c, but is here because of the
10820 * complication around CONFIG_DATALESS_FILES.
10821 */
10822 boolean_t
10823 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
10824 {
10825 #if CONFIG_DATALESS_FILES
10826 assert(ctx->vc_thread == current_thread());
10827 task_t const task = current_task();
10828 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
10829 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
10830 #else
10831 return false;
10832 #endif /* CONFIG_DATALESS_FILES */
10833 }
10834
/*
 * Resolve (materialize) a dataless namespace item: look up the vnode's
 * path, submit a resolve request to filecoordinationd over its Mach
 * port, and wait interruptibly for the resolver to report completion.
 * Returns 0 on success, EFTYPE for unsupported vnode types, ENOTSUP
 * for snapshot events (and when CONFIG_DATALESS_FILES is off), EDEADLK/
 * EJUSTRETURN per materialization policy, ETIMEDOUT when the resolver
 * cannot be reached, or the resolver's own error code.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	// Policy gate: 0 allows, EDEADLK/EJUSTRETURN deny (see
	// nspace_materialization_is_prevented()).
	error = nspace_materialization_is_prevented();
	if (error) {
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process/thread is decorated as no-materialization");
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
	if (path == NULL) {
		error = ENOMEM;
		goto out_release_port;
	}
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		// Dummy out-parameter required to work around a MIG bug.
		int xxx_rdar44371223; /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		// Register the request so the completion sysctl can find it.
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);
	}

out_release_port:
	if (path != NULL) {
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
10946
/*
 * Snapshot-event hook; currently a no-op that always reports success.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
10953
#if 0
/*
 * Build a volfs-style "/.vol/<fsid>/<fileid>" path for vp into 'path'
 * (capacity and resulting length in *len).  Returns 0 on success, -1
 * if the vnode attributes could not be fetched (in which case a
 * sentinel path is written).  NOTE: compiled out (#if 0); retained for
 * reference only.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
10976
10977 static unsigned long
10978 fsctl_bogus_command_compat(unsigned long cmd)
10979 {
10980 switch (cmd) {
10981 case IOCBASECMD(FSIOC_SYNC_VOLUME):
10982 return FSIOC_SYNC_VOLUME;
10983 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10984 return FSIOC_ROUTEFS_SETROUTEID;
10985 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10986 return FSIOC_SET_PACKAGE_EXTS;
10987 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10988 return FSIOC_SET_FSTYPENAME_OVERRIDE;
10989 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10990 return DISK_CONDITIONER_IOC_GET;
10991 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10992 return DISK_CONDITIONER_IOC_SET;
10993 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10994 return FSIOC_FIOSEEKHOLE;
10995 case IOCBASECMD(FSIOC_FIOSEEKDATA):
10996 return FSIOC_FIOSEEKDATA;
10997 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10998 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
10999 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
11000 return SPOTLIGHT_IOC_GET_LAST_MTIME;
11001 }
11002
11003 return cmd;
11004 }
11005
/*
 * Setattr callback passed to chflags0() for FSIOC_CAS_BSDFLAGS:
 * forwards the compare-and-swap request (arg points at the
 * fsioc_cas_bsdflags structure) to the filesystem via VNOP_IOCTL.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11011
11012 /*
11013 * Make a filesystem-specific control call:
11014 */
11015 /* ARGSUSED */
/*
 * Common implementation of the fsctl()/ffsctl() system calls: marshal
 * the user's ioctl-style argument into a kernel buffer, dispatch a set
 * of generic commands inline, and pass anything else down to the
 * filesystem via VNOP_IOCTL.  On FSIOC_SYNC_VOLUME the vnode's iocount
 * may be dropped, in which case *arg_vp is set to NULL so the caller
 * knows not to vnode_put() it again.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* fsctl is not for devices. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Large arguments are heap-allocated; small ones use the stack. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc(size)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	/* Marshal the argument per the ioctl direction bits. */
	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree(memp, size);
				}
				return error;
			}
		} else {
			/* Zero-size IOC_IN: the "argument" is the pointer value itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME: {
		struct vfs_attr vfa;
		mount_t mp = vp->v_mount;
		unsigned arg;


		/* record vid of vp so we can drop it below. */
		uint32_t vvid = vp->v_id;

		/*
		 * Then grab mount_iterref so that we can release the vnode.
		 * Without this, a thread may call vnode_iterate_prepare then
		 * get into a deadlock because we've never released the root vp
		 */
		error = mount_iterref(mp, 0);
		if (error) {
			break;
		}
		vnode_put(vp);

		arg = MNT_NOWAIT;
		if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
			arg = MNT_WAIT;
		}

		/*
		 * If the filesystem supports multiple filesystems in a
		 * partition (e.g. APFS volumes in a container), it knows
		 * that the waitfor argument to VFS_SYNC are flags.
		 */
		VFSATTR_INIT(&vfa);
		VFSATTR_WANTED(&vfa, f_capabilities);
		if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
		    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
		    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
		    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
			arg |= MNT_VOLUME;
		}

		/* issue the sync for this volume */
		(void)sync_callback(mp, &arg);

		/*
		 * Then release the mount_iterref once we're done syncing; it's not
		 * needed for the VNOP_IOCTL below
		 */
		mount_iterdrop(mp);

		/*
		 * NOTE(review): this tests 'arg', which at this point holds
		 * MNT_* sync flags, against the user-facing FSCTL_SYNC_FULLSYNC
		 * bit rather than re-reading the caller's request word from
		 * 'data' -- confirm the bit overlap is intentional.
		 */
		if (arg & FSCTL_SYNC_FULLSYNC) {
			/* re-obtain vnode iocount on the root vp, if possible */
			error = vnode_getwithvid(vp, vvid);
			if (error == 0) {
				error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
				vnode_put(vp);
			}
		}
		/* mark the argument VP as having been released */
		*arg_vp = NULL;
	}
	break;

	case FSIOC_ROUTEFS_SETROUTEID: {
#if ROUTEFS
		char routepath[MAXPATHLEN];
		size_t len = 0;

		/* Root only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		bzero(routepath, MAXPATHLEN);
		error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
		if (error) {
			break;
		}
		error = routefs_kernel_mount(routepath);
		if (error) {
			break;
		}
#endif
	}
	break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			ext_strings = ((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Root only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				/* Non-empty name: install the override. */
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				/* "mtmfs" read-only mounts additionally get extended-security handling. */
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty name: remove any existing override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS: {
		struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
		struct vnode_attr va;

		VATTR_INIT(&va);
		VATTR_SET(&va, va_flags, cas->new_flags);

		/* chflags0() performs permission checks, then calls our CAS ioctl helper. */
		error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
	}
	break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* Succeeds only when this fd holds the sole usecount on the vnode. */
		if (vnode_usecount(vp) > 1) {
			error = EBUSY;
		} else {
			error = 0;
		}
	}
	break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree(memp, size);
	}

	return error;
}
11301
11302 /* ARGSUSED */
11303 int
11304 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
11305 {
11306 int error;
11307 struct nameidata nd;
11308 u_long nameiflags;
11309 vnode_t vp = NULL;
11310 vfs_context_t ctx = vfs_context_current();
11311
11312 AUDIT_ARG(cmd, uap->cmd);
11313 AUDIT_ARG(value32, uap->options);
11314 /* Get the vnode for the file we are getting info on: */
11315 nameiflags = 0;
11316 //
11317 // if we come through fsctl() then the file is by definition not open.
11318 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
11319 // lest the caller mistakenly thinks the only open is their own (but in
11320 // reality it's someone elses).
11321 //
11322 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
11323 return EINVAL;
11324 }
11325 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11326 nameiflags |= FOLLOW;
11327 }
11328 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
11329 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
11330 }
11331 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
11332 UIO_USERSPACE, uap->path, ctx);
11333 if ((error = namei(&nd))) {
11334 goto done;
11335 }
11336 vp = nd.ni_vp;
11337 nameidone(&nd);
11338
11339 #if CONFIG_MACF
11340 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
11341 if (error) {
11342 goto done;
11343 }
11344 #endif
11345
11346 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11347
11348 done:
11349 if (vp) {
11350 vnode_put(vp);
11351 }
11352 return error;
11353 }
11354 /* ARGSUSED */
11355 int
11356 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
11357 {
11358 int error;
11359 vnode_t vp = NULL;
11360 vfs_context_t ctx = vfs_context_current();
11361 int fd = -1;
11362
11363 AUDIT_ARG(fd, uap->fd);
11364 AUDIT_ARG(cmd, uap->cmd);
11365 AUDIT_ARG(value32, uap->options);
11366
11367 /* Get the vnode for the file we are getting info on: */
11368 if ((error = file_vnode(uap->fd, &vp))) {
11369 return error;
11370 }
11371 fd = uap->fd;
11372 if ((error = vnode_getwithref(vp))) {
11373 file_drop(fd);
11374 return error;
11375 }
11376
11377 #if CONFIG_MACF
11378 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
11379 file_drop(fd);
11380 vnode_put(vp);
11381 return error;
11382 }
11383 #endif
11384
11385 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11386
11387 file_drop(fd);
11388
11389 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
11390 if (vp) {
11391 vnode_put(vp);
11392 }
11393
11394 return error;
11395 }
11396 /* end of fsctl system call */
11397
11398 /*
11399 * Retrieve the data of an extended attribute.
11400 */
/*
 * getxattr(2): retrieve the data of an extended attribute by path.
 * With a NULL value buffer (or the size == -1 compatibility hack
 * below), only the attribute's size is returned via *retval.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These options are kernel-internal; reject them from userspace. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected attributes: only root may read com.apple.system.Security. */
	if (xattr_protected(attrname)) {
		if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
			error = EPERM;
			goto out;
		}
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp to XATTR_MAXSIZE to bound the kernel-wired allocation. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a uio, return bytes transferred; otherwise the attribute size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
11485
/*
 * Retrieve the data of an extended attribute.
 *
 * fd-based companion to getxattr(): operates on an already-open file
 * instead of a path.  On success *retval is the number of bytes copied
 * into uap->value, or (when uap->value is NULL / uap->size is 0) the
 * attribute's total size as reported by vn_getxattr() — the size-probe
 * mode callers use to dimension a buffer.
 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	char uio_buf[UIO_SIZEOF(1)];    /* stack storage for the single-iovec uio */

	/*
	 * XATTR_NOFOLLOW is meaningless once the file is already open, and
	 * the NOSECURITY/NODEFAULT namespace options are kernel-internal.
	 */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	/* Take an iocount on the vnode; released via vnode_put() at "out". */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname)) {
		error = EPERM;
		goto out;
	}
	/*
	 * Only build a uio when the caller supplied a buffer; with a NULL
	 * auio, vn_getxattr() just reports the attribute size via attrsize.
	 */
	if (uap->value && uap->size > 0) {
		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	if (auio) {
		/* Bytes actually transferred into the caller's buffer. */
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
11538
/*
 * Set the data of an extended attribute.
 *
 * Resolves uap->path (following symlinks unless XATTR_NOFOLLOW is set),
 * copies in the attribute name, and writes uap->size bytes from
 * uap->value.  On success a FSE_XATTR_MODIFIED fsevent is posted when
 * file-system events are configured.
 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];    /* stack storage for the single-iovec uio */

	/* The security/default namespace options are kernel-internal only. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	/* A non-zero size with no source buffer is nonsensical. */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
11600
11601 /*
11602 * Set the data of an extended attribute.
11603 */
11604 int
11605 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
11606 {
11607 vnode_t vp;
11608 char attrname[XATTR_MAXNAMELEN + 1];
11609 uio_t auio = NULL;
11610 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11611 size_t namelen;
11612 int error;
11613 char uio_buf[UIO_SIZEOF(1)];
11614 #if CONFIG_FSE
11615 vfs_context_t ctx = vfs_context_current();
11616 #endif
11617
11618 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11619 return EINVAL;
11620 }
11621
11622 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11623 if (error != 0) {
11624 if (error == EPERM) {
11625 /* if the string won't fit in attrname, copyinstr emits EPERM */
11626 return ENAMETOOLONG;
11627 }
11628 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11629 return error;
11630 }
11631 if (xattr_protected(attrname)) {
11632 return EPERM;
11633 }
11634 if (uap->size != 0 && uap->value == 0) {
11635 return EINVAL;
11636 }
11637 if ((error = file_vnode(uap->fd, &vp))) {
11638 return error;
11639 }
11640 if ((error = vnode_getwithref(vp))) {
11641 file_drop(uap->fd);
11642 return error;
11643 }
11644 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11645 &uio_buf[0], sizeof(uio_buf));
11646 uio_addiov(auio, uap->value, uap->size);
11647
11648 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
11649 #if CONFIG_FSE
11650 if (error == 0) {
11651 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11652 FSE_ARG_VNODE, vp,
11653 FSE_ARG_DONE);
11654 }
11655 #endif
11656 vnode_put(vp);
11657 file_drop(uap->fd);
11658 *retval = 0;
11659 return error;
11660 }
11661
/*
 * Remove an extended attribute.
 * XXX Code duplication here.
 *
 * Resolves uap->path (following symlinks unless XATTR_NOFOLLOW), copies
 * in the attribute name, and deletes the attribute via vn_removexattr().
 * On success a FSE_XATTR_REMOVED fsevent is posted when file-system
 * events are configured.
 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* The security/default namespace options are kernel-internal only. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		return error;
	}
	/* System-protected attributes may not be removed from user space. */
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
11709
11710 /*
11711 * Remove an extended attribute.
11712 * XXX Code duplication here.
11713 */
11714 int
11715 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
11716 {
11717 vnode_t vp;
11718 char attrname[XATTR_MAXNAMELEN + 1];
11719 size_t namelen;
11720 int error;
11721 #if CONFIG_FSE
11722 vfs_context_t ctx = vfs_context_current();
11723 #endif
11724
11725 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11726 return EINVAL;
11727 }
11728
11729 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11730 if (error != 0) {
11731 return error;
11732 }
11733 if (xattr_protected(attrname)) {
11734 return EPERM;
11735 }
11736 if ((error = file_vnode(uap->fd, &vp))) {
11737 return error;
11738 }
11739 if ((error = vnode_getwithref(vp))) {
11740 file_drop(uap->fd);
11741 return error;
11742 }
11743
11744 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
11745 #if CONFIG_FSE
11746 if (error == 0) {
11747 add_fsevent(FSE_XATTR_REMOVED, ctx,
11748 FSE_ARG_VNODE, vp,
11749 FSE_ARG_DONE);
11750 }
11751 #endif
11752 vnode_put(vp);
11753 file_drop(uap->fd);
11754 *retval = 0;
11755 return error;
11756 }
11757
/*
 * Retrieve the list of extended attribute names.
 * XXX Code duplication here.
 *
 * Resolves uap->path and fills uap->namebuf with the NUL-separated list
 * of attribute names.  When no buffer is supplied, *retval is instead
 * the total size needed (the size-probe mode).
 */
int
listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];    /* stack storage for the single-iovec uio */

	/* The security/default namespace options are kernel-internal only. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);
	/*
	 * Only build a uio when the caller supplied a buffer; with a NULL
	 * auio, vn_listxattr() just reports the required size via attrsize.
	 */
	if (uap->namebuf != 0 && uap->bufsize > 0) {
		auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->namebuf, uap->bufsize);
	}

	error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);

	vnode_put(vp);
	if (auio) {
		/* Bytes actually transferred into the caller's buffer. */
		*retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
11802
11803 /*
11804 * Retrieve the list of extended attribute names.
11805 * XXX Code duplication here.
11806 */
11807 int
11808 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11809 {
11810 vnode_t vp;
11811 uio_t auio = NULL;
11812 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11813 size_t attrsize = 0;
11814 int error;
11815 char uio_buf[UIO_SIZEOF(1)];
11816
11817 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11818 return EINVAL;
11819 }
11820
11821 if ((error = file_vnode(uap->fd, &vp))) {
11822 return error;
11823 }
11824 if ((error = vnode_getwithref(vp))) {
11825 file_drop(uap->fd);
11826 return error;
11827 }
11828 if (uap->namebuf != 0 && uap->bufsize > 0) {
11829 auio = uio_createwithbuffer(1, 0, spacetype,
11830 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11831 uio_addiov(auio, uap->namebuf, uap->bufsize);
11832 }
11833
11834 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11835
11836 vnode_put(vp);
11837 file_drop(uap->fd);
11838 if (auio) {
11839 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11840 } else {
11841 *retval = (user_ssize_t)attrsize;
11842 }
11843 return error;
11844 }
11845
11846 static int
11847 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
11848 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
11849 {
11850 int error;
11851 struct mount *mp = NULL;
11852 vnode_t vp;
11853 int length;
11854 int bpflags;
11855 /* maximum number of times to retry build_path */
11856 unsigned int retries = 0x10;
11857
11858 if (bufsize > PAGE_SIZE) {
11859 return EINVAL;
11860 }
11861
11862 if (buf == NULL) {
11863 return ENOMEM;
11864 }
11865
11866 retry:
11867 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11868 error = ENOTSUP; /* unexpected failure */
11869 return ENOTSUP;
11870 }
11871
11872 unionget:
11873 if (objid == 2) {
11874 struct vfs_attr vfsattr;
11875 int use_vfs_root = TRUE;
11876
11877 VFSATTR_INIT(&vfsattr);
11878 VFSATTR_WANTED(&vfsattr, f_capabilities);
11879 if (!(options & FSOPT_ISREALFSID) &&
11880 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
11881 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
11882 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
11883 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
11884 use_vfs_root = FALSE;
11885 }
11886 }
11887
11888 if (use_vfs_root) {
11889 error = VFS_ROOT(mp, &vp, ctx);
11890 } else {
11891 error = VFS_VGET(mp, objid, &vp, ctx);
11892 }
11893 } else {
11894 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11895 }
11896
11897 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11898 /*
11899 * If the fileid isn't found and we're in a union
11900 * mount volume, then see if the fileid is in the
11901 * mounted-on volume.
11902 */
11903 struct mount *tmp = mp;
11904 mp = vnode_mount(tmp->mnt_vnodecovered);
11905 vfs_unbusy(tmp);
11906 if (vfs_busy(mp, LK_NOWAIT) == 0) {
11907 goto unionget;
11908 }
11909 } else {
11910 vfs_unbusy(mp);
11911 }
11912
11913 if (error) {
11914 return error;
11915 }
11916
11917 #if CONFIG_MACF
11918 error = mac_vnode_check_fsgetpath(ctx, vp);
11919 if (error) {
11920 vnode_put(vp);
11921 return error;
11922 }
11923 #endif
11924
11925 /* Obtain the absolute path to this vnode. */
11926 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11927 if (options & FSOPT_NOFIRMLINKPATH) {
11928 bpflags |= BUILDPATH_NO_FIRMLINK;
11929 }
11930 bpflags |= BUILDPATH_CHECK_MOVED;
11931 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11932 vnode_put(vp);
11933
11934 if (error) {
11935 /* there was a race building the path, try a few more times */
11936 if (error == EAGAIN) {
11937 --retries;
11938 if (retries > 0) {
11939 goto retry;
11940 }
11941
11942 error = ENOENT;
11943 }
11944 goto out;
11945 }
11946
11947 AUDIT_ARG(text, buf);
11948
11949 if (kdebug_enable) {
11950 long dbg_parms[NUMPARMS];
11951 int dbg_namelen;
11952
11953 dbg_namelen = (int)sizeof(dbg_parms);
11954
11955 if (length < dbg_namelen) {
11956 memcpy((char *)dbg_parms, buf, length);
11957 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11958
11959 dbg_namelen = length;
11960 } else {
11961 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11962 }
11963
11964 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11965 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11966 }
11967
11968 *pathlen = (user_ssize_t)length; /* may be superseded by error */
11969
11970 out:
11971 return error;
11972 }
11973
/*
 * Obtain the full pathname of a file system object by id.
 *
 * Common implementation behind fsgetpath() and fsgetpath_ext().  Copies
 * the fsid in from user space, resolves the path into a temporary
 * kernel buffer via fsgetpath_internal(), and copies the result back to
 * the caller.  On success *retval is the path length.
 */
static int
fsgetpath_extended(user_addr_t buf, int bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only the firmlink and real-fsid options are recognized. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > PAGE_SIZE || bufsize <= 0) {
		return EINVAL;
	}
	/* M_ZERO so a short path never exposes stale kernel heap to copyout. */
	MALLOC(realpath, char *, bufsize, M_TEMP, M_WAITOK | M_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	if (realpath) {
		FREE(realpath, M_TEMP);
	}
	return error;
}
12022
12023 int
12024 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
12025 {
12026 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
12027 0, retval);
12028 }
12029
12030 int
12031 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
12032 {
12033 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
12034 uap->options, retval);
12035 }
12036
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Fills a user32_statfs or user64_statfs image from the in-kernel
 * vfsstatfs and copies it out to bufp.  With partial_copy set, the
 * trailing reserved fields are omitted from the copyout (legacy
 * callers' buffers end before them).  *sizep, when non-NULL, receives
 * the full (non-partial) structure size.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* Some mounts present a synthetic fs type name (e.g. for compatibility). */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
12165
/*
 * copy stat structure into user_stat structure.
 *
 * Field-by-field copy of the kernel struct stat into the 64-bit
 * user-ABI layout.  Field widths/narrowing follow the respective
 * struct definitions (not visible here).
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero first so padding and unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ with the POSIX namespace setting. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12205
/*
 * Field-by-field copy of the kernel struct stat into the 32-bit
 * user-ABI layout (see munge_user64_stat for the 64-bit variant).
 * Field widths/narrowing follow the respective struct definitions.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero first so padding and unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ with the POSIX namespace setting. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12242
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Like munge_user64_stat() but for the stat64 ABI, which additionally
 * carries the birth (creation) timestamp.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero first so padding and unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ with the POSIX namespace setting. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12286
/*
 * Like munge_user32_stat() but for the stat64 ABI, which additionally
 * carries the birth (creation) timestamp.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero first so padding and unset fields never leak kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* The timestamp field names differ with the POSIX namespace setting. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12327
12328 /*
12329 * Purge buffer cache for simulating cold starts
12330 */
12331 static int
12332 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
12333 {
12334 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
12335
12336 return VNODE_RETURNED;
12337 }
12338
12339 static int
12340 vfs_purge_callback(mount_t mp, __unused void * arg)
12341 {
12342 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
12343
12344 return VFS_RETURNED;
12345 }
12346
12347 int
12348 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
12349 {
12350 if (!kauth_cred_issuser(kauth_cred_get())) {
12351 return EPERM;
12352 }
12353
12354 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
12355
12356 return 0;
12357 }
12358
12359 /*
12360 * gets the vnode associated with the (unnamed) snapshot directory
12361 * for a Filesystem. The snapshot directory vnode is returned with
12362 * an iocount on it.
12363 */
12364 int
12365 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
12366 {
12367 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
12368 }
12369
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 *
 * dirfd must reference a volume root; the filesystem must advertise
 * VOL_CAP_INT_SNAPSHOT.  op is the namei operation (CREATE/DELETE/...)
 * and also selects which MAC snapshot check is applied.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-NULL the outputs so the error path can unwind uniformly. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* Snapshots are only addressed relative to a volume root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/': i stops early iff one is present. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	FREE(name_buf, M_TEMP);
out:
	/* On any error, drop both iocounts and NULL the outputs per the contract above. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
12483
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 * a) Passed in name for snapshot cannot have slashes.
 * b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata namend;

	/* Resolve the volume root, snapshot dir, and (possibly existing) snapshot. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
	    OP_LINK, ctx);
	if (error) {
		return error;
	}

	if (namend.ni_vp) {
		/* Lookup found an existing snapshot: O_EXCL semantics. */
		vnode_put(namend.ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr va;
		vnode_t vp = NULLVP;

		/* Create the snapshot as a plain file with mode 0. */
		VATTR_INIT(&va);
		VATTR_SET(&va, va_type, VREG);
		VATTR_SET(&va, va_mode, 0);

		/* Superuser-only path: skip authorization and inheritance. */
		error = vn_create(snapdvp, &vp, &namend, &va,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}
	}

	nameidone(&namend);
	vnode_put(snapdvp);
	vnode_put(rvp);
	return error;
}
12536
12537 /*
12538 * Delete a Filesystem snapshot
12539 *
12540 * get the vnode for the unnamed snapshot directory and the snapshot and
12541 * delete the snapshot.
12542 */
12543 static int
12544 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
12545 vfs_context_t ctx)
12546 {
12547 vnode_t rvp, snapdvp;
12548 int error;
12549 struct nameidata namend;
12550
12551 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
12552 OP_UNLINK, ctx);
12553 if (error) {
12554 goto out;
12555 }
12556
12557 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
12558 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
12559
12560 vnode_put(namend.ni_vp);
12561 nameidone(&namend);
12562 vnode_put(snapdvp);
12563 vnode_put(rvp);
12564 out:
12565 return error;
12566 }
12567
/*
 * Revert a filesystem to a snapshot
 *
 * Marks the filesystem to revert to the given snapshot on next mount.
 */
static int
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* take an iocount on the filesystem root referenced by dirfd */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* copy the snapshot name in from user space */
	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		FREE(name_buf, M_TEMP);
		return error;
	}

	/* hand the snapshot name to the filesystem as a componentname */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	/* copyinstr's returned length includes the terminating NUL */
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	FREE(name_buf, M_TEMP);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* re-resolve root, snapshot dir and snapshot for the vnode path */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		/*
		 * Fallback issued directly against the snapshot vnode;
		 * presumably for filesystems (apfs, given the ioctl name)
		 * that implement the VNOP but not the VFS ioctl — TODO confirm.
		 */
		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
12656
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* resolve the source snapshot (DELETE/OP_UNLINK: it leaves its old name) */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	/* copy the destination name in from user space */
	MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* scan for a '/'; the loop runs to completion iff none is present */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/*
	 * Look up the destination name relative to the snapshot directory
	 * (USEDVP). The name now lives in kernel space, hence UIO_SYSSPACE.
	 */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* same-directory rename: source and target dvp are both snapdvp */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

	/* unwind in reverse acquisition order */
out2:
	nameidone(tond);
out1:
	FREE(newname_buf, M_TEMP);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	FREE(__rename_data, M_TEMP);
	return error;
}
12759
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 */
static int
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
	    M_TEMP, M_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* resolve the fs root, the snapshot directory and the named snapshot */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* refuse if the source filesystem has no mount or is being torn down */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* a snapshot may not be mounted over the root of the root filesystem */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Mount via the generic mount path, passing the source mount and the
	 * snapshot name through as filesystem-specific data
	 * (KERNEL_MOUNT_SNAPSHOT). Only MNT_DONTBROWSE is honored from the
	 * caller's flags.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);

	/* unwind in reverse acquisition order */
out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	FREE(__snapshot_mount_data, M_TEMP);
	return error;
}
12842
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	/* take an iocount on the filesystem root referenced by dirfd */
	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* copy the snapshot name in from user space */
	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		FREE(name_buf, M_TEMP);
		return error;
	}

	/* hand the snapshot name to the filesystem as a componentname */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	/* copyinstr's returned length includes the terminating NUL */
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	FREE(name_buf, M_TEMP);

	return error;
}
12903
/*
 * FS snapshot operations dispatcher
 *
 * Entry point for the fs_snapshot(2) syscall: checks the caller's
 * privilege, performs per-operation authorization, then dispatches to
 * the snapshot_* helper for uap->op.
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* all snapshot operations require the PRIV_VFS_SNAPSHOT privilege */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations
	 */
	if ((uap->op != SNAPSHOT_OP_MOUNT) &&
	    (uap->op != SNAPSHOT_OP_ROOT)) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Non-superuser callers must additionally hold write access
		 * to the filesystem's backing device.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		/* name1 is the existing snapshot name, name2 the new one */
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		/* name2 is the directory to mount on */
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}