1 /*
2 * Copyright (c) 1995-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <pexpert/pexpert.h>
127 #include <IOKit/IOBSD.h>
128
129 // deps for MIG call
130 #include <kern/host.h>
131 #include <kern/ipc_misc.h>
132 #include <mach/host_priv.h>
133 #include <mach/vfs_nspace.h>
134 #include <os/log.h>
135
136 #if ROUTEFS
137 #include <miscfs/routefs/routefs.h>
138 #endif /* ROUTEFS */
139
140 #if CONFIG_MACF
141 #include <security/mac.h>
142 #include <security/mac_framework.h>
143 #endif
144
145 #if CONFIG_FSE
146 #define GET_PATH(x) \
147 (x) = get_pathbuff();
148 #define RELEASE_PATH(x) \
149 release_pathbuff(x);
150 #else
151 #define GET_PATH(x) \
152 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
153 #define RELEASE_PATH(x) \
154 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
155 #endif /* CONFIG_FSE */
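/*
 * Added usage sketch (not original source): callers in this file pair
 * these macros around a MAXPATHLEN scratch buffer, roughly:
 *
 *	char *path = NULL;
 *	GET_PATH(path);
 *	// ... build or copy a path of at most MAXPATHLEN bytes ...
 *	RELEASE_PATH(path);
 *
 * Note that the macro bodies above already end in semicolons.
 */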
156
157 #ifndef HFS_GET_BOOT_INFO
158 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
159 #endif
160
161 #ifndef HFS_SET_BOOT_INFO
162 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
163 #endif
164
165 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
166 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
167 #endif
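/*
 * Added note: _IOW('J', 1, u_int64_t) encodes an ioctl whose argument
 * (a u_int64_t) is copied from the caller into the kernel, in command
 * group 'J' with command number 1. The fallback definition above is
 * presumably only needed when the APFS header that normally provides
 * it is unavailable at build time.
 */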
168
169 extern void disk_conditioner_unmount(mount_t mp);
170
171 /* struct for checkdirs iteration */
172 struct cdirargs {
173 vnode_t olddp;
174 vnode_t newdp;
175 };
176 /* callback for checkdirs iteration */
177 static int checkdirs_callback(proc_t p, void * arg);
178
179 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
180 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
181 void enablequotas(struct mount *mp, vfs_context_t ctx);
182 static int getfsstat_callback(mount_t mp, void * arg);
183 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
184 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
185 static int sync_callback(mount_t, void *);
186 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
187 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
188 boolean_t partial_copy);
189 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
190 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
191 struct componentname *cnp, user_addr_t fsmountargs,
192 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
193 vfs_context_t ctx);
194 void vfs_notify_mount(vnode_t pdvp);
195
196 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
197
198 struct fd_vn_data * fg_vn_data_alloc(void);
199
200 /*
201 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
202 * Concurrent lookups (or lookups by ids) on hard links can cause the
203 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
204 * does) to return ENOENT as the path cannot be returned from the name cache
205 * alone. We have no option but to retry and hope that one namei->reverse-path
206 * generation completes without an intervening lookup (or lookup by id) on the
207 * hard-linked item. This is only an issue for MAC hooks that cannot re-enter
208 * the filesystem, which currently are the MAC hooks for rename, unlink and rmdir.
209 */
210 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
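/*
 * Added illustration (hedged, not upstream code): callers of the
 * vn_authorize_{rmdir, unlink, rename} hooks typically wrap them in a
 * bounded retry loop along these lines:
 *
 *	int retry_count = 0;
 *	retry:
 *	error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
 *	if (error == ENOENT && retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
 *		retry_count++;
 *		goto retry;
 *	}
 *
 * The bound keeps a pathological stream of concurrent lookups from
 * stalling unlink/rmdir/rename forever.
 */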
211
212 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
213 int unlink_flags);
214
215 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
216
217 #ifdef CONFIG_IMGSRC_ACCESS
218 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
219 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
220 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
221 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
222 static void mount_end_update(mount_t mp);
223 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
224 #endif /* CONFIG_IMGSRC_ACCESS */
225
226 #if CONFIG_LOCKERBOOT
227 int mount_locker_protoboot(const char *fsname, const char *mntpoint,
228 const char *pbdevpath);
229 #endif
230
231 //snapshot functions
232 #if CONFIG_MNT_ROOTSNAP
233 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
234 #else
235 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
236 #endif
237
238 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
239
240 __private_extern__
241 int sync_internal(void);
242
243 __private_extern__
244 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
245
246 extern lck_grp_t *fd_vn_lck_grp;
247 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
248 extern lck_attr_t *fd_vn_lck_attr;
249
250 /*
251 * Incremented each time a mount or unmount operation occurs;
252 * used to invalidate the cached value of the rootvp in the
253 * mount structure utilized by cache_lookup_path.
254 */
255 uint32_t mount_generation = 0;
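/*
 * Added sketch (hypothetical field names, for illustration only): a
 * consumer that caches a root vnode can pair it with the generation it
 * observed and distrust the cache once the counter moves on:
 *
 *	if (cached_generation != mount_generation) {
 *		// cached rootvp may be stale; re-derive it
 *	}
 */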
256
257 /* counts number of mount and unmount operations */
258 unsigned int vfs_nummntops = 0;
259
260 extern const struct fileops vnops;
261 #if CONFIG_APPLEDOUBLE
262 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
263 #endif /* CONFIG_APPLEDOUBLE */
264
265 /*
266 * Virtual File System System Calls
267 */
268
269 #if NFSCLIENT || DEVFS || ROUTEFS
270 /*
271 * Private in-kernel mounting SPI (kernel-initiated mounts only, e.g. NFS/devfs/routefs; not exported)
272 */
273 __private_extern__
274 boolean_t
275 vfs_iskernelmount(mount_t mp)
276 {
277 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
278 }
279
280 __private_extern__
281 int
282 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
283 void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
284 {
285 struct nameidata nd;
286 boolean_t did_namei;
287 int error;
288
289 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
290 UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
291
292 /*
293 * Get the vnode to be covered if it's not supplied
294 */
295 if (vp == NULLVP) {
296 error = namei(&nd);
297 if (error) {
298 if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_DATAVOL)) {
299 printf("failed to locate mount-on path: %s ", path);
300 }
301 return error;
302 }
303 vp = nd.ni_vp;
304 pvp = nd.ni_dvp;
305 did_namei = TRUE;
306 } else {
307 char *pnbuf = CAST_DOWN(char *, path);
308
309 nd.ni_cnd.cn_pnbuf = pnbuf;
310 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
311 did_namei = FALSE;
312 }
313
314 error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
315 syscall_flags, kern_flags, NULL, TRUE, ctx);
316
317 if (did_namei) {
318 vnode_put(vp);
319 vnode_put(pvp);
320 nameidone(&nd);
321 }
322
323 return error;
324 }
325 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
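/*
 * Added usage sketch (hedged): an in-kernel client such as devfs might
 * drive this SPI roughly as follows; the filesystem name, path and
 * argument structure are illustrative, not copied from a real caller:
 *
 *	error = kernel_mount("devfs", NULLVP, NULLVP, "/dev",
 *	    &args, sizeof(args), MNT_DONTBROWSE,
 *	    KERNEL_MOUNT_NOAUTH, vfs_context_kernel());
 *
 * Passing NULLVP for vp makes kernel_mount() run the namei() lookup of
 * the mount-on path itself, as in the body above.
 */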
326
327 /*
328 * Mount a file system.
329 */
330 /* ARGSUSED */
331 int
332 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
333 {
334 struct __mac_mount_args muap;
335
336 muap.type = uap->type;
337 muap.path = uap->path;
338 muap.flags = uap->flags;
339 muap.data = uap->data;
340 muap.mac_p = USER_ADDR_NULL;
341 return __mac_mount(p, &muap, retval);
342 }
343
344 int
345 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
346 {
347 struct componentname cn;
348 vfs_context_t ctx = vfs_context_current();
349 size_t dummy = 0;
350 int error;
351 int flags = uap->flags;
352 char fstypename[MFSNAMELEN];
353 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
354 vnode_t pvp;
355 vnode_t vp;
356
357 AUDIT_ARG(fd, uap->fd);
358 AUDIT_ARG(fflags, flags);
359 /* fstypename will get audited by mount_common */
360
361 /* Sanity check the flags */
362 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
363 return ENOTSUP;
364 }
365
366 if (flags & MNT_UNION) {
367 return EPERM;
368 }
369
370 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
371 if (error) {
372 return error;
373 }
374
375 if ((error = file_vnode(uap->fd, &vp)) != 0) {
376 return error;
377 }
378
379 if ((error = vnode_getwithref(vp)) != 0) {
380 file_drop(uap->fd);
381 return error;
382 }
383
384 pvp = vnode_getparent(vp);
385 if (pvp == NULL) {
386 vnode_put(vp);
387 file_drop(uap->fd);
388 return EINVAL;
389 }
390
391 memset(&cn, 0, sizeof(struct componentname));
392 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
393 cn.cn_pnlen = MAXPATHLEN;
394
395 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
396 FREE(cn.cn_pnbuf, M_TEMP);
397 vnode_put(pvp);
398 vnode_put(vp);
399 file_drop(uap->fd);
400 return error;
401 }
402
403 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
404
405 FREE(cn.cn_pnbuf, M_TEMP);
406 vnode_put(pvp);
407 vnode_put(vp);
408 file_drop(uap->fd);
409
410 return error;
411 }
412
413 void
414 vfs_notify_mount(vnode_t pdvp)
415 {
416 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
417 lock_vnode_and_post(pdvp, NOTE_WRITE);
418 }
419
420 /*
421 * __mac_mount:
422 * Mount a file system taking into account MAC label behavior.
423 * See mount(2) man page for more information
424 *
425 * Parameters: p Process requesting the mount
426 * uap User argument descriptor (see below)
427 * retval (ignored)
428 *
429 * Indirect: uap->type Filesystem type
430 * uap->path Path to mount
431 * uap->data Mount arguments
432 * uap->mac_p MAC info
433 * uap->flags Mount flags
434 *
435 *
436 * Returns: 0 Success
437 * !0 Not success
438 */
439 boolean_t root_fs_upgrade_try = FALSE;
440
441 int
442 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
443 {
444 vnode_t pvp = NULL;
445 vnode_t vp = NULL;
446 int need_nameidone = 0;
447 vfs_context_t ctx = vfs_context_current();
448 char fstypename[MFSNAMELEN];
449 struct nameidata nd;
450 size_t dummy = 0;
451 char *labelstr = NULL;
452 int flags = uap->flags;
453 int error;
454 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
455 boolean_t is_64bit = IS_64BIT_PROCESS(p);
456 #else
457 #pragma unused(p)
458 #endif
459 /*
460 * Get the fs type name from user space
461 */
462 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
463 if (error) {
464 return error;
465 }
466
467 /*
468 * Get the vnode to be covered
469 */
470 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
471 UIO_USERSPACE, uap->path, ctx);
472 error = namei(&nd);
473 if (error) {
474 goto out;
475 }
476 need_nameidone = 1;
477 vp = nd.ni_vp;
478 pvp = nd.ni_dvp;
479
480 #ifdef CONFIG_IMGSRC_ACCESS
481 /* Mounting image source cannot be batched with other operations */
482 if (flags == MNT_IMGSRC_BY_INDEX) {
483 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
484 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
485 goto out;
486 }
487 #endif /* CONFIG_IMGSRC_ACCESS */
488
489 #if CONFIG_MACF
490 /*
491 * Get the label string (if any) from user space
492 */
493 if (uap->mac_p != USER_ADDR_NULL) {
494 struct user_mac mac;
495 size_t ulen = 0;
496
497 if (is_64bit) {
498 struct user64_mac mac64;
499 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
500 mac.m_buflen = mac64.m_buflen;
501 mac.m_string = mac64.m_string;
502 } else {
503 struct user32_mac mac32;
504 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
505 mac.m_buflen = mac32.m_buflen;
506 mac.m_string = mac32.m_string;
507 }
508 if (error) {
509 goto out;
510 }
511 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
512 (mac.m_buflen < 2)) {
513 error = EINVAL;
514 goto out;
515 }
516 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
517 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
518 if (error) {
519 goto out;
520 }
521 AUDIT_ARG(mac_string, labelstr);
522 }
523 #endif /* CONFIG_MACF */
524
525 AUDIT_ARG(fflags, flags);
526
527 #if SECURE_KERNEL
528 if (flags & MNT_UNION) {
529 /* No union mounts on release kernels */
530 error = EPERM;
531 goto out;
532 }
533 #endif
534
535 if ((vp->v_flag & VROOT) &&
536 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
537 if (!(flags & MNT_UNION)) {
538 flags |= MNT_UPDATE;
539 } else {
540 /*
541 * For a union mount on '/', treat it as a fresh
542 * mount instead of an update.
543 * Otherwise, union mounting on '/' used to panic the
544 * system, since mnt_vnodecovered (which unionlookup
545 * requires after it gets ENOENT on a union mount)
546 * was found to be NULL for '/'.
547 */
548 flags = (flags & ~(MNT_UPDATE));
549 }
550
551 #if SECURE_KERNEL
552 if ((flags & MNT_RDONLY) == 0) {
553 /* Release kernels are not allowed to mount "/" as rw */
554 error = EPERM;
555 goto out;
556 }
557 #endif
558 /*
559 * See 7392553 for more details on why this check exists.
560 * Suffice to say: If this check is ON and something tries
561 * to mount the rootFS RW, we'll turn off the codesign
562 * bitmap optimization.
563 */
564 #if CHECK_CS_VALIDATION_BITMAP
565 if ((flags & MNT_RDONLY) == 0) {
566 root_fs_upgrade_try = TRUE;
567 }
568 #endif
569 }
570
571 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
572 labelstr, FALSE, ctx);
573
574 out:
575
576 #if CONFIG_MACF
577 if (labelstr) {
578 FREE(labelstr, M_MACTEMP);
579 }
580 #endif /* CONFIG_MACF */
581
582 if (vp) {
583 vnode_put(vp);
584 }
585 if (pvp) {
586 vnode_put(pvp);
587 }
588 if (need_nameidone) {
589 nameidone(&nd);
590 }
591
592 return error;
593 }
594
595 /*
596 * common mount implementation (final stage of mounting)
597 *
598 * Arguments:
599 * fstypename file system type (i.e., its VFS name)
600 * pvp parent of covered vnode
601 * vp covered vnode
602 * cnp component name (i.e., path) of covered vnode
603 * flags generic mount flags
604 * fsmountargs file system specific data
605 * labelstr optional MAC label
606 * kernelmount TRUE for mounts initiated from inside the kernel
607 * ctx caller's context
608 */
609 static int
610 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
611 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
612 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
613 {
614 #if !CONFIG_MACF
615 #pragma unused(labelstr)
616 #endif
617 struct vnode *devvp = NULLVP;
618 struct vnode *device_vnode = NULLVP;
619 #if CONFIG_MACF
620 struct vnode *rvp;
621 #endif
622 struct mount *mp;
623 struct vfstable *vfsp = (struct vfstable *)0;
624 struct proc *p = vfs_context_proc(ctx);
625 int error, flag = 0;
626 user_addr_t devpath = USER_ADDR_NULL;
627 int ronly = 0;
628 int mntalloc = 0;
629 boolean_t vfsp_ref = FALSE;
630 boolean_t is_rwlock_locked = FALSE;
631 boolean_t did_rele = FALSE;
632 boolean_t have_usecount = FALSE;
633
634 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM
635 /* Check for mutually-exclusive flag bits */
636 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL));
637 int bitcount = 0;
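	/*
	 * Added note: the loop below is the classic clear-lowest-set-bit
	 * trick; each "checkflags &= (checkflags - 1)" pass removes one
	 * set bit, so bitcount ends up holding the number of
	 * mount-by-role flags requested.
	 */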
638 while (checkflags != 0) {
639 checkflags &= (checkflags - 1);
640 bitcount++;
641 }
642
643 if (bitcount > 1) {
644 //not allowed to request multiple mount-by-role flags
645 error = EINVAL;
646 goto out1;
647 }
648 #endif
649
650 /*
651 * Process an update for an existing mount
652 */
653 if (flags & MNT_UPDATE) {
654 if ((vp->v_flag & VROOT) == 0) {
655 error = EINVAL;
656 goto out1;
657 }
658 mp = vp->v_mount;
659
660 /* unmount in progress return error */
661 mount_lock_spin(mp);
662 if (mp->mnt_lflag & MNT_LUNMOUNT) {
663 mount_unlock(mp);
664 error = EBUSY;
665 goto out1;
666 }
667 mount_unlock(mp);
668 lck_rw_lock_exclusive(&mp->mnt_rwlock);
669 is_rwlock_locked = TRUE;
670 /*
671 * We only allow the filesystem to be reloaded if it
672 * is currently mounted read-only.
673 */
674 if ((flags & MNT_RELOAD) &&
675 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
676 error = ENOTSUP;
677 goto out1;
678 }
679
680 /*
681 * If content protection is enabled, update mounts are not
682 * allowed to turn it off.
683 */
684 if ((mp->mnt_flag & MNT_CPROTECT) &&
685 ((flags & MNT_CPROTECT) == 0)) {
686 error = EINVAL;
687 goto out1;
688 }
689
690 /*
691 * MNT_REMOVABLE can't be turned off either, but returning an error
692 * for that would be an unexpected failure, so we just silently
693 * add it back if it is not passed in.
694 */
695 if ((mp->mnt_flag & MNT_REMOVABLE) &&
696 ((flags & MNT_REMOVABLE) == 0)) {
697 flags |= MNT_REMOVABLE;
698 }
699
700 #ifdef CONFIG_IMGSRC_ACCESS
701 /* Can't downgrade the backer of the root FS */
702 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
703 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
704 error = ENOTSUP;
705 goto out1;
706 }
707 #endif /* CONFIG_IMGSRC_ACCESS */
708
709 /*
710 * Only root, or the user that did the original mount is
711 * permitted to update it.
712 */
713 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
714 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
715 goto out1;
716 }
717 #if CONFIG_MACF
718 error = mac_mount_check_remount(ctx, mp);
719 if (error != 0) {
720 goto out1;
721 }
722 #endif
723 /*
724 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
725 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
726 */
727 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
728 flags |= MNT_NOSUID | MNT_NODEV;
729 if (mp->mnt_flag & MNT_NOEXEC) {
730 flags |= MNT_NOEXEC;
731 }
732 }
733 flag = mp->mnt_flag;
734
735
736
737 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
738
739 vfsp = mp->mnt_vtable;
740 goto update;
741 } // MNT_UPDATE
742
743 /*
744 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
745 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
746 */
747 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
748 flags |= MNT_NOSUID | MNT_NODEV;
749 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
750 flags |= MNT_NOEXEC;
751 }
752 }
753
754 /* XXXAUDIT: Should we capture the type on the error path as well? */
755 AUDIT_ARG(text, fstypename);
756 mount_list_lock();
757 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
758 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
759 vfsp->vfc_refcount++;
760 vfsp_ref = TRUE;
761 break;
762 }
763 }
764 mount_list_unlock();
765 if (vfsp == NULL) {
766 error = ENODEV;
767 goto out1;
768 }
769
770 /*
771 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
772 * except in ROSV configs.
773 */
774 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
775 ((internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)) == 0)) {
776 error = EINVAL; /* unsupported request */
777 goto out1;
778 }
779
780 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
781 if (error != 0) {
782 goto out1;
783 }
784
785 /*
786 * Allocate and initialize the filesystem (mount_t)
787 */
788 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
789 M_MOUNT, M_WAITOK);
790 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
791 mntalloc = 1;
792
793 /* Initialize the default IO constraints */
794 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
795 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
796 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
797 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
798 mp->mnt_devblocksize = DEV_BSIZE;
799 mp->mnt_alignmentmask = PAGE_MASK;
800 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
801 mp->mnt_ioscale = 1;
802 mp->mnt_ioflags = 0;
803 mp->mnt_realrootvp = NULLVP;
804 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
805
806 TAILQ_INIT(&mp->mnt_vnodelist);
807 TAILQ_INIT(&mp->mnt_workerqueue);
808 TAILQ_INIT(&mp->mnt_newvnodes);
809 mount_lock_init(mp);
810 lck_rw_lock_exclusive(&mp->mnt_rwlock);
811 is_rwlock_locked = TRUE;
812 mp->mnt_op = vfsp->vfc_vfsops;
813 mp->mnt_vtable = vfsp;
814 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
815 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
816 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
817 do {
818 int pathlen = MAXPATHLEN;
819
820 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
821 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
822 }
823 } while (0);
824 mp->mnt_vnodecovered = vp;
825 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
826 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
827 mp->mnt_devbsdunit = 0;
828
829 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
830 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
831
832 #if NFSCLIENT || DEVFS || ROUTEFS
833 if (kernelmount) {
834 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
835 }
836 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
837 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
838 }
839 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
840
841 update:
842
843 /*
844 * Set the mount level flags.
845 */
846 if (flags & MNT_RDONLY) {
847 mp->mnt_flag |= MNT_RDONLY;
848 } else if (mp->mnt_flag & MNT_RDONLY) {
849 // disallow read/write upgrades of file systems that
850 // had the TYPENAME_OVERRIDE feature set.
851 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
852 error = EPERM;
853 goto out1;
854 }
855 mp->mnt_kern_flag |= MNTK_WANTRDWR;
856 }
857 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
858 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
859 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
860 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
861 MNT_QUARANTINE | MNT_CPROTECT);
862
863 #if SECURE_KERNEL
864 #if !CONFIG_MNT_SUID
865 /*
866 * On release builds of iOS based platforms, always enforce NOSUID on
867 * all mounts. We do this here because we can catch update mounts as well as
868 * non-update mounts in this case.
869 */
870 mp->mnt_flag |= (MNT_NOSUID);
871 #endif
872 #endif
873
874 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
875 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
876 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
877 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
878 MNT_QUARANTINE | MNT_CPROTECT);
879
880 #if CONFIG_MACF
881 if (flags & MNT_MULTILABEL) {
882 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
883 error = EINVAL;
884 goto out1;
885 }
886 mp->mnt_flag |= MNT_MULTILABEL;
887 }
888 #endif
889 /*
890 * Process device path for local file systems if requested
891 */
892 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
893 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL))) {
894 //snapshot, vm, datavolume mounts are special
895 if (vfs_context_is64bit(ctx)) {
896 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
897 goto out1;
898 }
899 fsmountargs += sizeof(devpath);
900 } else {
901 user32_addr_t tmp;
902 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
903 goto out1;
904 }
905 /* munge into LP64 addr */
906 devpath = CAST_USER_ADDR_T(tmp);
907 fsmountargs += sizeof(tmp);
908 }
909
910 /* Lookup device and authorize access to it */
911 if ((devpath)) {
912 struct nameidata nd;
913
914 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
915 if ((error = namei(&nd))) {
916 goto out1;
917 }
918
919 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
920 devvp = nd.ni_vp;
921
922 nameidone(&nd);
923
924 if (devvp->v_type != VBLK) {
925 error = ENOTBLK;
926 goto out2;
927 }
928 if (major(devvp->v_rdev) >= nblkdev) {
929 error = ENXIO;
930 goto out2;
931 }
932 /*
933 * If mount by non-root, then verify that user has necessary
934 * permissions on the device.
935 */
936 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
937 mode_t accessmode = KAUTH_VNODE_READ_DATA;
938
939 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
940 accessmode |= KAUTH_VNODE_WRITE_DATA;
941 }
942 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
943 goto out2;
944 }
945 }
946 }
947 /* On first mount, preflight and open device */
948 if (devpath && ((flags & MNT_UPDATE) == 0)) {
949 if ((error = vnode_ref(devvp))) {
950 goto out2;
951 }
952 /*
953 * Disallow multiple mounts of the same device.
954 * Disallow mounting of a device that is currently in use
955 * (except for root, which might share swap device for miniroot).
956 * Flush out any old buffers remaining from a previous use.
957 */
958 if ((error = vfs_mountedon(devvp))) {
959 goto out3;
960 }
961
962 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
963 error = EBUSY;
964 goto out3;
965 }
966 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
967 error = ENOTBLK;
968 goto out3;
969 }
970 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
971 goto out3;
972 }
973
974 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
975 #if CONFIG_MACF
976 error = mac_vnode_check_open(ctx,
977 devvp,
978 ronly ? FREAD : FREAD | FWRITE);
979 if (error) {
980 goto out3;
981 }
982 #endif /* MAC */
983 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
984 goto out3;
985 }
986
987 mp->mnt_devvp = devvp;
988 device_vnode = devvp;
989 } else if ((mp->mnt_flag & MNT_RDONLY) &&
990 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
991 (device_vnode = mp->mnt_devvp)) {
992 dev_t dev;
993 int maj;
994 /*
995 * If upgrade to read-write by non-root, then verify
996 * that user has necessary permissions on the device.
997 */
998 vnode_getalways(device_vnode);
999
1000 if (suser(vfs_context_ucred(ctx), NULL) &&
1001 (error = vnode_authorize(device_vnode, NULL,
1002 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1003 ctx)) != 0) {
1004 vnode_put(device_vnode);
1005 goto out2;
1006 }
1007
1008 /* Tell the device that we're upgrading */
1009 dev = (dev_t)device_vnode->v_rdev;
1010 maj = major(dev);
1011
1012 if ((u_int)maj >= (u_int)nblkdev) {
1013 panic("Volume mounted on a device with invalid major number.");
1014 }
1015
1016 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1017 vnode_put(device_vnode);
1018 device_vnode = NULLVP;
1019 if (error != 0) {
1020 goto out2;
1021 }
1022 }
1023 } // localargs && !(snapshot | data | vm)
1024
1025 #if CONFIG_MACF
1026 if ((flags & MNT_UPDATE) == 0) {
1027 mac_mount_label_init(mp);
1028 mac_mount_label_associate(ctx, mp);
1029 }
1030 if (labelstr) {
1031 if ((flags & MNT_UPDATE) != 0) {
1032 error = mac_mount_check_label_update(ctx, mp);
1033 if (error != 0) {
1034 goto out3;
1035 }
1036 }
1037 }
1038 #endif
1039 /*
1040 * Mount the filesystem. We already asserted that internal_flags
1041 * cannot have more than one mount-by-role bit set.
1042 */
1043 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1044 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1045 (caddr_t)fsmountargs, 0, ctx);
1046 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1047 #if CONFIG_ROSV_STARTUP
1048 struct mount *origin_mp = (struct mount*)fsmountargs;
1049 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1050 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1051 if (error) {
1052 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1053 } else {
1054 /* Mark volume associated with system volume */
1055 mp->mnt_kern_flag |= MNTK_SYSTEM;
1056
1057 /* Attempt to acquire the mnt_devvp and set it up */
1058 struct vnode *mp_devvp = NULL;
1059 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1060 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1061 0, &mp_devvp, vfs_context_kernel());
1062 if (!lerr) {
1063 mp->mnt_devvp = mp_devvp;
1064 //vnode_lookup took an iocount, need to drop it.
1065 vnode_put(mp_devvp);
1066 // now set `device_vnode` to the devvp that was acquired.
1067 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1068 // note that though the iocount above was dropped, the mount acquires
1069 // an implicit reference against the device.
1070 device_vnode = mp_devvp;
1071 }
1072 }
1073 }
1074 #else
1075 error = EINVAL;
1076 #endif
1077 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1078 #if CONFIG_MOUNT_VM
1079 struct mount *origin_mp = (struct mount*)fsmountargs;
1080 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1081 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1082 if (error) {
1083 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1084 } else {
1085 /* Mark volume associated with system volume and a swap mount */
1086 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1087 /* Attempt to acquire the mnt_devvp and set it up */
1088 struct vnode *mp_devvp = NULL;
1089 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1090 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1091 0, &mp_devvp, vfs_context_kernel());
1092 if (!lerr) {
1093 mp->mnt_devvp = mp_devvp;
1094 //vnode_lookup took an iocount, need to drop it.
1095 vnode_put(mp_devvp);
1096
1097 // now set `device_vnode` to the devvp that was acquired.
1098 // note that though the iocount above was dropped, the mount acquires
1099 // an implicit reference against the device.
1100 device_vnode = mp_devvp;
1101 }
1102 }
1103 }
1104 #else
1105 error = EINVAL;
1106 #endif
1107 } else {
1108 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1109 }
1110
1111 if (flags & MNT_UPDATE) {
1112 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1113 mp->mnt_flag &= ~MNT_RDONLY;
1114 }
1115 mp->mnt_flag &= ~
1116 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1117 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1118 if (error) {
1119 mp->mnt_flag = flag; /* restore flag value */
1120 }
1121 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1122 lck_rw_done(&mp->mnt_rwlock);
1123 is_rwlock_locked = FALSE;
1124 if (!error) {
1125 enablequotas(mp, ctx);
1126 }
1127 goto exit;
1128 }
1129
1130 /*
1131 * Put the new filesystem on the mount list after root.
1132 */
1133 if (error == 0) {
1134 struct vfs_attr vfsattr;
1135 #if CONFIG_MACF
1136 error = mac_mount_check_mount_late(ctx, mp);
1137 if (error != 0) {
1138 goto out3;
1139 }
1140
1141 if (vfs_flags(mp) & MNT_MULTILABEL) {
1142 error = VFS_ROOT(mp, &rvp, ctx);
1143 if (error) {
1144 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1145 goto out3;
1146 }
1147 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1148 /*
1149 * drop reference provided by VFS_ROOT
1150 */
1151 vnode_put(rvp);
1152
1153 if (error) {
1154 goto out3;
1155 }
1156 }
1157 #endif /* MAC */
1158
1159 vnode_lock_spin(vp);
1160 CLR(vp->v_flag, VMOUNT);
1161 vp->v_mountedhere = mp;
1162 vnode_unlock(vp);
1163
1164 /*
1165 * taking the name_cache_lock exclusively will
1166 * ensure that everyone who might be trying to use
1167 * a now-stale copy of vp->v_mountedhere->mnt_realrootvp
1168 * is out of the fast path;
1169 * bumping mount_generation causes the cached values
1170 * to be invalidated
1171 */
1172 name_cache_lock();
1173 mount_generation++;
1174 name_cache_unlock();
1175
1176 error = vnode_ref(vp);
1177 if (error != 0) {
1178 goto out4;
1179 }
1180
1181 have_usecount = TRUE;
1182
1183 error = checkdirs(vp, ctx);
1184 if (error != 0) {
1185 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1186 goto out4;
1187 }
1188 /*
1189 * there is no cleanup code here, so the return value is cast to void;
1190 * we need to revisit this
1191 */
1192 (void)VFS_START(mp, 0, ctx);
1193
1194 if (mount_list_add(mp) != 0) {
1195 /*
1196 * The system is shutting down trying to umount
1197 * everything, so fail with a plausible errno.
1198 */
1199 error = EBUSY;
1200 goto out4;
1201 }
1202 lck_rw_done(&mp->mnt_rwlock);
1203 is_rwlock_locked = FALSE;
1204
1205 /* Check if this mounted file system supports EAs or named streams. */
1206 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1207 VFSATTR_INIT(&vfsattr);
1208 VFSATTR_WANTED(&vfsattr, f_capabilities);
1209 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1210 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1211 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1212 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1213 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1214 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1215 }
1216 #if NAMEDSTREAMS
1217 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1218 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1219 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1220 }
1221 #endif
1222 /* Check if this file system supports path from id lookups. */
1223 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1224 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1225 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1226 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1227 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1228 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1229 }
1230
1231 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1232 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1233 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1234 }
1235 }
1236 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1237 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1238 }
1239 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1240 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1241 }
1242 /* increment the operations count */
1243 OSAddAtomic(1, &vfs_nummntops);
1244 enablequotas(mp, ctx);
1245
1246 if (device_vnode) {
1247 device_vnode->v_specflags |= SI_MOUNTEDON;
1248
1249 /*
1250 * cache the IO attributes for the underlying physical media...
1251 * an error return indicates the underlying driver doesn't
1252 * support all the queries necessary... however, reasonable
1253 * defaults will have been set, so no reason to bail or care
1254 */
1255 vfs_init_io_attributes(device_vnode, mp);
1256 }
1257
1258 /* Now that mount is setup, notify the listeners */
1259 vfs_notify_mount(pvp);
1260 IOBSDMountChange(mp, kIOMountChangeMount);
1261 } else {
1262 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1263 if (mp->mnt_vnodelist.tqh_first != NULL) {
1264 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1265 mp->mnt_vtable->vfc_name, error);
1266 }
1267
1268 vnode_lock_spin(vp);
1269 CLR(vp->v_flag, VMOUNT);
1270 vnode_unlock(vp);
1271 mount_list_lock();
1272 mp->mnt_vtable->vfc_refcount--;
1273 mount_list_unlock();
1274
1275 if (device_vnode) {
1276 vnode_rele(device_vnode);
1277 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1278 }
1279 lck_rw_done(&mp->mnt_rwlock);
1280 is_rwlock_locked = FALSE;
1281
1282 /*
1283 * if we get here, we have a mount structure that needs to be freed,
1284 * but since the coveredvp hasn't yet been updated to point at it,
1285 * no need to worry about other threads holding a crossref on this mp
1286 * so it's ok to just free it
1287 */
1288 mount_lock_destroy(mp);
1289 #if CONFIG_MACF
1290 mac_mount_label_destroy(mp);
1291 #endif
1292 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1293 }
1294 exit:
1295 /*
1296 * drop I/O count on the device vp if there was one
1297 */
1298 if (devpath && devvp) {
1299 vnode_put(devvp);
1300 }
1301
1302 return error;
1303
1304 /* Error condition exits */
1305 out4:
1306 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1307
1308 /*
1309 * If the mount has been placed on the covered vp,
1310 * it may have been discovered by now, so we have
1311 * to treat this just like an unmount
1312 */
1313 mount_lock_spin(mp);
1314 mp->mnt_lflag |= MNT_LDEAD;
1315 mount_unlock(mp);
1316
1317 if (device_vnode != NULLVP) {
1318 vnode_rele(device_vnode);
1319 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1320 ctx);
1321 did_rele = TRUE;
1322 }
1323
1324 vnode_lock_spin(vp);
1325
1326 mp->mnt_crossref++;
1327 vp->v_mountedhere = (mount_t) 0;
1328
1329 vnode_unlock(vp);
1330
1331 if (have_usecount) {
1332 vnode_rele(vp);
1333 }
1334 out3:
1335 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1336 vnode_rele(devvp);
1337 }
1338 out2:
1339 if (devpath && devvp) {
1340 vnode_put(devvp);
1341 }
1342 out1:
1343 /* Release mnt_rwlock only when it was taken */
1344 if (is_rwlock_locked == TRUE) {
1345 lck_rw_done(&mp->mnt_rwlock);
1346 }
1347
1348 if (mntalloc) {
1349 if (mp->mnt_crossref) {
1350 mount_dropcrossref(mp, vp, 0);
1351 } else {
1352 mount_lock_destroy(mp);
1353 #if CONFIG_MACF
1354 mac_mount_label_destroy(mp);
1355 #endif
1356 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1357 }
1358 }
1359 if (vfsp_ref) {
1360 mount_list_lock();
1361 vfsp->vfc_refcount--;
1362 mount_list_unlock();
1363 }
1364
1365 return error;
1366 }
1367
1368 /*
1369 * Flush in-core data, check for competing mount attempts,
1370 * and set VMOUNT
1371 */
1372 int
1373 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1374 {
1375 #if !CONFIG_MACF
1376 #pragma unused(cnp,fsname)
1377 #endif
1378 struct vnode_attr va;
1379 int error;
1380
1381 if (!skip_auth) {
1382 /*
1383 * If the user is not root, ensure that they own the directory
1384 * onto which we are attempting to mount.
1385 */
1386 VATTR_INIT(&va);
1387 VATTR_WANTED(&va, va_uid);
1388 if ((error = vnode_getattr(vp, &va, ctx)) ||
1389 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1390 (!vfs_context_issuser(ctx)))) {
1391 error = EPERM;
1392 goto out;
1393 }
1394 }
1395
1396 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1397 goto out;
1398 }
1399
1400 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1401 goto out;
1402 }
1403
1404 if (vp->v_type != VDIR) {
1405 error = ENOTDIR;
1406 goto out;
1407 }
1408
1409 if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1410 error = EBUSY;
1411 goto out;
1412 }
1413
1414 #if CONFIG_MACF
1415 error = mac_mount_check_mount(ctx, vp,
1416 cnp, fsname);
1417 if (error != 0) {
1418 goto out;
1419 }
1420 #endif
1421
1422 vnode_lock_spin(vp);
1423 SET(vp->v_flag, VMOUNT);
1424 vnode_unlock(vp);
1425
1426 out:
1427 return error;
1428 }
1429
1430 #if CONFIG_IMGSRC_ACCESS
1431
1432 #define DEBUG_IMGSRC 0
1433
1434 #if DEBUG_IMGSRC
1435 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1436 #else
1437 #define IMGSRC_DEBUG(args...) do { } while(0)
1438 #endif
1439
1440 static int
1441 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1442 {
1443 struct nameidata nd;
1444 vnode_t vp, realdevvp;
1445 mode_t accessmode;
1446 int error;
1447 enum uio_seg uio = UIO_USERSPACE;
1448
1449 if (ctx == vfs_context_kernel()) {
1450 uio = UIO_SYSSPACE;
1451 }
1452
1453 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1454 if ((error = namei(&nd))) {
1455 IMGSRC_DEBUG("namei() failed with %d\n", error);
1456 return error;
1457 }
1458
1459 vp = nd.ni_vp;
1460
1461 if (!vnode_isblk(vp)) {
1462 IMGSRC_DEBUG("Not block device.\n");
1463 error = ENOTBLK;
1464 goto out;
1465 }
1466
1467 realdevvp = mp->mnt_devvp;
1468 if (realdevvp == NULLVP) {
1469 IMGSRC_DEBUG("No device backs the mount.\n");
1470 error = ENXIO;
1471 goto out;
1472 }
1473
1474 error = vnode_getwithref(realdevvp);
1475 if (error != 0) {
1476 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1477 goto out;
1478 }
1479
1480 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1481 IMGSRC_DEBUG("Wrong dev_t.\n");
1482 error = ENXIO;
1483 goto out1;
1484 }
1485
1486 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1487
1488 /*
1489 * If mount by non-root, then verify that user has necessary
1490 * permissions on the device.
1491 */
1492 if (!vfs_context_issuser(ctx)) {
1493 accessmode = KAUTH_VNODE_READ_DATA;
1494 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1495 accessmode |= KAUTH_VNODE_WRITE_DATA;
1496 }
1497 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1498 IMGSRC_DEBUG("Access denied.\n");
1499 goto out1;
1500 }
1501 }
1502
1503 *devvpp = vp;
1504
1505 out1:
1506 vnode_put(realdevvp);
1507
1508 out:
1509 nameidone(&nd);
1510
1511 if (error) {
1512 vnode_put(vp);
1513 }
1514
1515 return error;
1516 }
1517
1518 /*
1519 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1520 * and call checkdirs()
1521 */
1522 static int
1523 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1524 {
1525 int error;
1526
1527 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1528
1529 IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
1530 mp->mnt_vtable->vfc_name, vnode_getname(vp));
1531
1532 vnode_lock_spin(vp);
1533 CLR(vp->v_flag, VMOUNT);
1534 vp->v_mountedhere = mp;
1535 vnode_unlock(vp);
1536
1537 /*
1538 * taking the name_cache_lock exclusively will
1539 * ensure that everyone who might be trying to use
1540 * a now-stale copy of vp->v_mountedhere->mnt_realrootvp
1541 * is out of the fast path;
1542 * bumping mount_generation causes the cached values
1543 * to be invalidated
1544 */
1545 name_cache_lock();
1546 mount_generation++;
1547 name_cache_unlock();
1548
1549 error = vnode_ref(vp);
1550 if (error != 0) {
1551 goto out;
1552 }
1553
1554 error = checkdirs(vp, ctx);
1555 if (error != 0) {
1556 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1557 vnode_rele(vp);
1558 goto out;
1559 }
1560
1561 out:
1562 if (error != 0) {
1563 mp->mnt_vnodecovered = NULLVP;
1564 }
1565 return error;
1566 }
1567
1568 static void
1569 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1570 {
1571 vnode_rele(vp);
1572 vnode_lock_spin(vp);
1573 vp->v_mountedhere = (mount_t)NULL;
1574 vnode_unlock(vp);
1575
1576 mp->mnt_vnodecovered = NULLVP;
1577 }
1578
1579 static int
1580 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1581 {
1582 int error;
1583
1584 /* unmount in progress return error */
1585 mount_lock_spin(mp);
1586 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1587 mount_unlock(mp);
1588 return EBUSY;
1589 }
1590 mount_unlock(mp);
1591 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1592
1593 /*
1594 * We only allow the filesystem to be reloaded if it
1595 * is currently mounted read-only.
1596 */
1597 if ((flags & MNT_RELOAD) &&
1598 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1599 error = ENOTSUP;
1600 goto out;
1601 }
1602
1603 /*
1604 * Only root, or the user that did the original mount is
1605 * permitted to update it.
1606 */
1607 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1608 (!vfs_context_issuser(ctx))) {
1609 error = EPERM;
1610 goto out;
1611 }
1612 #if CONFIG_MACF
1613 error = mac_mount_check_remount(ctx, mp);
1614 if (error != 0) {
1615 goto out;
1616 }
1617 #endif
1618
1619 out:
1620 if (error) {
1621 lck_rw_done(&mp->mnt_rwlock);
1622 }
1623
1624 return error;
1625 }
1626
1627 static void
1628 mount_end_update(mount_t mp)
1629 {
1630 lck_rw_done(&mp->mnt_rwlock);
1631 }
1632
1633 static int
1634 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1635 {
1636 vnode_t vp;
1637
1638 if (height >= MAX_IMAGEBOOT_NESTING) {
1639 return EINVAL;
1640 }
1641
1642 vp = imgsrc_rootvnodes[height];
1643 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1644 *rvpp = vp;
1645 return 0;
1646 } else {
1647 return ENOENT;
1648 }
1649 }
1650
1651 static int
1652 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1653 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1654 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1655 {
1656 int error;
1657 mount_t mp;
1658 boolean_t placed = FALSE;
1659 struct vfstable *vfsp;
1660 user_addr_t devpath;
1661 char *old_mntonname;
1662 vnode_t rvp;
1663 vnode_t devvp;
1664 uint32_t height;
1665 uint32_t flags;
1666
1667 /* If we didn't imageboot, nothing to move */
1668 if (imgsrc_rootvnodes[0] == NULLVP) {
1669 return EINVAL;
1670 }
1671
1672 /* Only root can do this */
1673 if (!vfs_context_issuser(ctx)) {
1674 return EPERM;
1675 }
1676
1677 IMGSRC_DEBUG("looking for root vnode.\n");
1678
1679 /*
1680 * Get root vnode of filesystem we're moving.
1681 */
1682 if (by_index) {
1683 if (is64bit) {
1684 struct user64_mnt_imgsrc_args mia64;
1685 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1686 if (error != 0) {
1687 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1688 return error;
1689 }
1690
1691 height = mia64.mi_height;
1692 flags = mia64.mi_flags;
1693 devpath = mia64.mi_devpath;
1694 } else {
1695 struct user32_mnt_imgsrc_args mia32;
1696 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1697 if (error != 0) {
1698 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1699 return error;
1700 }
1701
1702 height = mia32.mi_height;
1703 flags = mia32.mi_flags;
1704 devpath = mia32.mi_devpath;
1705 }
1706 } else {
1707 /*
1708 * For binary compatibility--assumes one level of nesting.
1709 */
1710 if (is64bit) {
1711 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1712 return error;
1713 }
1714 } else {
1715 user32_addr_t tmp;
1716 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1717 return error;
1718 }
1719
1720 /* munge into LP64 addr */
1721 devpath = CAST_USER_ADDR_T(tmp);
1722 }
1723
1724 height = 0;
1725 flags = 0;
1726 }
1727
1728 if (flags != 0) {
1729 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1730 return EINVAL;
1731 }
1732
1733 error = get_imgsrc_rootvnode(height, &rvp);
1734 if (error != 0) {
1735 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1736 return error;
1737 }
1738
1739 IMGSRC_DEBUG("got old root vnode\n");
1740
1741 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1742
1743 /* Can only move once */
1744 mp = vnode_mount(rvp);
1745 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1746 IMGSRC_DEBUG("Already moved.\n");
1747 error = EBUSY;
1748 goto out0;
1749 }
1750
1751 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1752 IMGSRC_DEBUG("Starting updated.\n");
1753
1754 /* Get exclusive rwlock on mount, authorize update on mp */
1755 error = mount_begin_update(mp, ctx, 0);
1756 if (error != 0) {
1757 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1758 goto out0;
1759 }
1760
1761 /*
1762 * It can only be moved once. Flag is set under the rwlock,
1763 * so we're now safe to proceed.
1764 */
1765 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1766 IMGSRC_DEBUG("Already moved [2]\n");
1767 goto out1;
1768 }
1769
1770 IMGSRC_DEBUG("Preparing coveredvp.\n");
1771
1772 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1773 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1774 if (error != 0) {
1775 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1776 goto out1;
1777 }
1778
1779 IMGSRC_DEBUG("Covered vp OK.\n");
1780
1781 /* Sanity check the name caller has provided */
1782 vfsp = mp->mnt_vtable;
1783 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1784 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1785 vfsp->vfc_name, fsname);
1786 error = EINVAL;
1787 goto out2;
1788 }
1789
1790 /* Check the device vnode and update mount-from name, for local filesystems */
1791 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1792 IMGSRC_DEBUG("Local, doing device validation.\n");
1793
1794 if (devpath != USER_ADDR_NULL) {
1795 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1796 if (error) {
1797 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1798 goto out2;
1799 }
1800
1801 vnode_put(devvp);
1802 }
1803 }
1804
1805 /*
1806 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1807 * and increment the name cache's mount generation
1808 */
1809
1810 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1811 error = place_mount_and_checkdirs(mp, vp, ctx);
1812 if (error != 0) {
1813 goto out2;
1814 }
1815
1816 placed = TRUE;
1817
1818 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1819 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1820
1821 /* Forbid future moves */
1822 mount_lock(mp);
1823 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1824 mount_unlock(mp);
1825
1826 /* Finally, add to mount list, completely ready to go */
1827 if (mount_list_add(mp) != 0) {
1828 /*
1829 * The system is shutting down trying to umount
1830 * everything, so fail with a plausible errno.
1831 */
1832 error = EBUSY;
1833 goto out3;
1834 }
1835
1836 mount_end_update(mp);
1837 vnode_put(rvp);
1838 FREE(old_mntonname, M_TEMP);
1839
1840 vfs_notify_mount(pvp);
1841
1842 return 0;
1843 out3:
1844 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1845
1846 mount_lock(mp);
1847 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1848 mount_unlock(mp);
1849
1850 out2:
1851 /*
1852 * Placing the mp on the vnode clears VMOUNT,
1853 * so cleanup is different after that point
1854 */
1855 if (placed) {
1856 /* Rele the vp, clear VMOUNT and v_mountedhere */
1857 undo_place_on_covered_vp(mp, vp);
1858 } else {
1859 vnode_lock_spin(vp);
1860 CLR(vp->v_flag, VMOUNT);
1861 vnode_unlock(vp);
1862 }
1863 out1:
1864 mount_end_update(mp);
1865
1866 out0:
1867 vnode_put(rvp);
1868 FREE(old_mntonname, M_TEMP);
1869 return error;
1870 }
1871
1872 #if CONFIG_LOCKERBOOT
1873 __private_extern__
1874 int
1875 mount_locker_protoboot(const char *fsname, const char *mntpoint,
1876 const char *pbdevpath)
1877 {
1878 int error = -1;
1879 struct nameidata nd;
1880 boolean_t cleanup_nd = FALSE;
1881 vfs_context_t ctx = vfs_context_kernel();
1882 boolean_t is64 = TRUE;
1883 boolean_t by_index = TRUE;
1884 struct user64_mnt_imgsrc_args mia64 = {
1885 .mi_height = 0,
1886 .mi_flags = 0,
1887 .mi_devpath = CAST_USER_ADDR_T(pbdevpath),
1888 };
1889 user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);
1890
1891 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
1892 UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
1893 error = namei(&nd);
1894 if (error) {
1895 IMGSRC_DEBUG("namei: %d\n", error);
1896 goto out;
1897 }
1898
1899 cleanup_nd = TRUE;
1900 error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
1901 &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);
1902
1903 out:
1904 if (cleanup_nd) {
1905 int stashed = error;
1906
1907 error = vnode_put(nd.ni_vp);
1908 if (error) {
1909 panic("vnode_put() returned non-zero: %d", error);
1910 }
1911
1912 if (nd.ni_dvp) {
1913 error = vnode_put(nd.ni_dvp);
1914 if (error) {
1915 panic("vnode_put() returned non-zero: %d", error);
1916 }
1917 }
1918 nameidone(&nd);
1919
1920 error = stashed;
1921 }
1922 return error;
1923 }
1924 #endif /* CONFIG_LOCKERBOOT */
1925 #endif /* CONFIG_IMGSRC_ACCESS */
1926
1927 void
1928 enablequotas(struct mount *mp, vfs_context_t ctx)
1929 {
1930 struct nameidata qnd;
1931 int type;
1932 char qfpath[MAXPATHLEN];
1933 const char *qfname = QUOTAFILENAME;
1934 const char *qfopsname = QUOTAOPSNAME;
1935 const char *qfextension[] = INITQFNAMES;
1936
1937 /* XXX Should be an MNTK_ flag, instead of strncmp()'s */
1938 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
1939 return;
1940 }
1941 /*
1942 * Enable filesystem disk quotas if necessary.
1943 * We ignore errors as this should not interfere with final mount
1944 */
1945 for (type = 0; type < MAXQUOTAS; type++) {
1946 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1947 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1948 CAST_USER_ADDR_T(qfpath), ctx);
1949 if (namei(&qnd) != 0) {
1950 continue; /* option file to trigger quotas is not present */
1951 }
1952 vnode_put(qnd.ni_vp);
1953 nameidone(&qnd);
1954 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1955
1956 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1957 }
1958 return;
1959 }
1960
1961
1962 static int
1963 checkdirs_callback(proc_t p, void * arg)
1964 {
1965 struct cdirargs * cdrp = (struct cdirargs *)arg;
1966 vnode_t olddp = cdrp->olddp;
1967 vnode_t newdp = cdrp->newdp;
1968 struct filedesc *fdp;
1969 vnode_t new_cvp = newdp;
1970 vnode_t new_rvp = newdp;
1971 vnode_t old_cvp = NULL;
1972 vnode_t old_rvp = NULL;
1973
1974 /*
1975 * XXX Also needs to iterate each thread in the process to see if it
1976 * XXX is using a per-thread current working directory, and, if so,
1977 * XXX update that as well.
1978 */
1979
1980 /*
1981 * First, with the proc_fdlock held, check to see if we will need
1982 * to do any work. If not, we will get out fast.
1983 */
1984 proc_fdlock(p);
1985 fdp = p->p_fd;
1986 if (fdp == NULL ||
1987 (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp)) {
1988 proc_fdunlock(p);
1989 return PROC_RETURNED;
1990 }
1991 proc_fdunlock(p);
1992
1993 /*
1994 * Ok, we will have to do some work. Always take two refs
1995 * because we might need that many. We'll dispose of whatever
1996 * we ended up not using.
1997 */
1998 if (vnode_ref(newdp) != 0) {
1999 return PROC_RETURNED;
2000 }
2001 if (vnode_ref(newdp) != 0) {
2002 vnode_rele(newdp);
2003 return PROC_RETURNED;
2004 }
2005
2006 /*
2007 * Now do the work. Note: we dropped the proc_fdlock, so we
2008 * have to do all of the checks again.
2009 */
2010 proc_fdlock(p);
2011 fdp = p->p_fd;
2012 if (fdp != NULL) {
2013 if (fdp->fd_cdir == olddp) {
2014 old_cvp = olddp;
2015 fdp->fd_cdir = newdp;
2016 new_cvp = NULL;
2017 }
2018 if (fdp->fd_rdir == olddp) {
2019 old_rvp = olddp;
2020 fdp->fd_rdir = newdp;
2021 new_rvp = NULL;
2022 }
2023 }
2024 proc_fdunlock(p);
2025
2026 /*
2027 * Dispose of any references that are no longer needed.
2028 */
2029 if (old_cvp != NULL) {
2030 vnode_rele(old_cvp);
2031 }
2032 if (old_rvp != NULL) {
2033 vnode_rele(old_rvp);
2034 }
2035 if (new_cvp != NULL) {
2036 vnode_rele(new_cvp);
2037 }
2038 if (new_rvp != NULL) {
2039 vnode_rele(new_rvp);
2040 }
2041
2042 return PROC_RETURNED;
2043 }
2044
2045
2046
2047 /*
2048 * Scan all active processes to see if any of them have a current
2049 * or root directory onto which the new filesystem has just been
2050 * mounted. If so, replace them with the new mount point.
2051 */
2052 static int
2053 checkdirs(vnode_t olddp, vfs_context_t ctx)
2054 {
2055 vnode_t newdp;
2056 vnode_t tvp;
2057 int err;
2058 struct cdirargs cdr;
2059
2060 if (olddp->v_usecount == 1) {
2061 return 0;
2062 }
2063 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
2064
2065 if (err != 0) {
2066 #if DIAGNOSTIC
2067 panic("mount: lost mount: error %d", err);
2068 #endif
2069 return err;
2070 }
2071
2072 cdr.olddp = olddp;
2073 cdr.newdp = newdp;
2074 /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
2075 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
2076
2077 if (rootvnode == olddp) {
2078 vnode_ref(newdp);
2079 tvp = rootvnode;
2080 rootvnode = newdp;
2081 vnode_rele(tvp);
2082 }
2083
2084 vnode_put(newdp);
2085 return 0;
2086 }
2087
2088 /*
2089 * Unmount a file system.
2090 *
2091 * Note: unmount takes a path to the vnode mounted on as argument,
2092 * not a special file (as before).
2093 */
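/*
 * Illustrative userspace sketch of this call (not part of this file; the
 * mount point path "/Volumes/Example" is hypothetical):
 *
 *	#include <sys/mount.h>
 *	#include <stdio.h>
 *
 *	if (unmount("/Volumes/Example", MNT_FORCE) == -1)
 *		perror("unmount");
 */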
2094 /* ARGSUSED */
2095 int
2096 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2097 {
2098 vnode_t vp;
2099 struct mount *mp;
2100 int error;
2101 struct nameidata nd;
2102 vfs_context_t ctx = vfs_context_current();
2103
2104 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2105 UIO_USERSPACE, uap->path, ctx);
2106 error = namei(&nd);
2107 if (error) {
2108 return error;
2109 }
2110 vp = nd.ni_vp;
2111 mp = vp->v_mount;
2112 nameidone(&nd);
2113
2114 #if CONFIG_MACF
2115 error = mac_mount_check_umount(ctx, mp);
2116 if (error != 0) {
2117 vnode_put(vp);
2118 return error;
2119 }
2120 #endif
2121 /*
2122 * Must be the root of the filesystem
2123 */
2124 if ((vp->v_flag & VROOT) == 0) {
2125 vnode_put(vp);
2126 return EINVAL;
2127 }
2128 mount_ref(mp, 0);
2129 vnode_put(vp);
2130 /* safedounmount consumes the mount ref */
2131 return safedounmount(mp, uap->flags, ctx);
2132 }
2133
2134 int
2135 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2136 {
2137 mount_t mp;
2138
2139 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2140 if (mp == (mount_t)0) {
2141 return ENOENT;
2142 }
2143 mount_ref(mp, 0);
2144 mount_iterdrop(mp);
2145 /* safedounmount consumes the mount ref */
2146 return safedounmount(mp, flags, ctx);
2147 }
2148
2149
2150 /*
2151 * The mount struct comes with a mount ref which will be consumed.
2152 * Do the actual file system unmount and prevent some common foot-shooting.
2153 */
2154 int
2155 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2156 {
2157 int error;
2158 proc_t p = vfs_context_proc(ctx);
2159
2160 /*
2161 * If the file system is not responding and MNT_NOBLOCK
2162 * is set and not a forced unmount then return EBUSY.
2163 */
2164 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2165 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2166 error = EBUSY;
2167 goto out;
2168 }
2169
2170 /*
2171 * Skip authorization if the mount is tagged as permissive and
2172 * this is not a forced-unmount attempt.
2173 */
2174 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
2175 /*
2176 * Only root, or the user that did the original mount is
2177 * permitted to unmount this filesystem.
2178 */
2179 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2180 (error = suser(kauth_cred_get(), &p->p_acflag))) {
2181 goto out;
2182 }
2183 }
2184 /*
2185 * Don't allow unmounting the root file system (or the associated VM or DATA mounts).
2186 */
2187 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2188 error = EBUSY; /* the root (or associated volumes) is always busy */
2189 goto out;
2190 }
2191
2192 #ifdef CONFIG_IMGSRC_ACCESS
2193 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2194 error = EBUSY;
2195 goto out;
2196 }
2197 #endif /* CONFIG_IMGSRC_ACCESS */
2198
2199 return dounmount(mp, flags, 1, ctx);
2200
2201 out:
2202 mount_drop(mp, 0);
2203 return error;
2204 }
2205
2206 /*
2207 * Do the actual file system unmount.
2208 */
2209 int
2210 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2211 {
2212 vnode_t coveredvp = (vnode_t)0;
2213 int error;
2214 int needwakeup = 0;
2215 int forcedunmount = 0;
2216 int lflags = 0;
2217 struct vnode *devvp = NULLVP;
2218 #if CONFIG_TRIGGERS
2219 proc_t p = vfs_context_proc(ctx);
2220 int did_vflush = 0;
2221 int pflags_save = 0;
2222 #endif /* CONFIG_TRIGGERS */
2223
2224 #if CONFIG_FSE
2225 if (!(flags & MNT_FORCE)) {
2226 fsevent_unmount(mp, ctx); /* has to come first! */
2227 }
2228 #endif
2229
2230 mount_lock(mp);
2231
2232 /*
2233 * If already an unmount in progress just return EBUSY.
2234 * Even a forced unmount cannot override.
2235 */
2236 if (mp->mnt_lflag & MNT_LUNMOUNT) {
2237 if (withref != 0) {
2238 mount_drop(mp, 1);
2239 }
2240 mount_unlock(mp);
2241 return EBUSY;
2242 }
2243
2244 if (flags & MNT_FORCE) {
2245 forcedunmount = 1;
2246 mp->mnt_lflag |= MNT_LFORCE;
2247 }
2248
2249 #if CONFIG_TRIGGERS
2250 if (flags & MNT_NOBLOCK && p != kernproc) {
2251 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2252 }
2253 #endif
2254
2255 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2256 mp->mnt_lflag |= MNT_LUNMOUNT;
2257 mp->mnt_flag &= ~MNT_ASYNC;
2258 /*
2259 * anyone currently in the fast path that
2260 * trips over the cached rootvp will be
2261 * dumped out and forced into the slow path
2262 * to regenerate a new cached value
2263 */
2264 mp->mnt_realrootvp = NULLVP;
2265 mount_unlock(mp);
2266
2267 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2268 /*
2269 * Force unmount any mounts in this filesystem.
2270 * If any unmounts fail, just leave them dangling.
2271 * Avoids recursion.
2272 */
2273 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2274 }
2275
2276 /*
2277 * taking the name_cache_lock exclusively will
2278 * ensure that everyone who might be trying to use a
2279 * now-stale copy of vp->v_mountedhere->mnt_realrootvp
2280 * is out of the fast path;
2281 * bumping mount_generation causes the cached values
2282 * to be invalidated
2283 */
2284 name_cache_lock();
2285 mount_generation++;
2286 name_cache_unlock();
2287
2288
2289 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2290 if (withref != 0) {
2291 mount_drop(mp, 0);
2292 }
2293 error = 0;
2294 if (forcedunmount == 0) {
2295 ubc_umount(mp); /* release cached vnodes */
2296 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2297 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2298 if (error) {
2299 mount_lock(mp);
2300 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2301 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2302 mp->mnt_lflag &= ~MNT_LFORCE;
2303 goto out;
2304 }
2305 }
2306 }
2307
2308 IOBSDMountChange(mp, kIOMountChangeUnmount);
2309
2310 #if CONFIG_TRIGGERS
2311 vfs_nested_trigger_unmounts(mp, flags, ctx);
2312 did_vflush = 1;
2313 #endif
2314 if (forcedunmount) {
2315 lflags |= FORCECLOSE;
2316 }
2317 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2318 if ((forcedunmount == 0) && error) {
2319 mount_lock(mp);
2320 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2321 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2322 mp->mnt_lflag &= ~MNT_LFORCE;
2323 goto out;
2324 }
2325
2326 /* make sure no one is in the mount iterations or lookup */
2327 mount_iterdrain(mp);
2328
2329 error = VFS_UNMOUNT(mp, flags, ctx);
2330 if (error) {
2331 mount_iterreset(mp);
2332 mount_lock(mp);
2333 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2334 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2335 mp->mnt_lflag &= ~MNT_LFORCE;
2336 goto out;
2337 }
2338
2339 /* increment the operations count */
2340 if (!error) {
2341 OSAddAtomic(1, &vfs_nummntops);
2342 }
2343
2344 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2345 /* hold an io reference and drop the usecount before close */
2346 devvp = mp->mnt_devvp;
2347 vnode_getalways(devvp);
2348 vnode_rele(devvp);
2349 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2350 ctx);
2351 vnode_clearmountedon(devvp);
2352 vnode_put(devvp);
2353 }
2354 lck_rw_done(&mp->mnt_rwlock);
2355 mount_list_remove(mp);
2356 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2357
2358 /* mark the mount point hook in the vp but do not drop the ref yet */
2359 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2360 /*
2361 * The covered vnode needs special handling. Trying to get an
2362 * iocount must not block here as this may lead to deadlocks
2363 * if the Filesystem to which the covered vnode belongs is
2364 * undergoing forced unmounts. Since we hold a usecount, the
2365 * vnode cannot be reused (it can, however, still be terminated)
2366 */
2367 vnode_getalways(coveredvp);
2368 vnode_lock_spin(coveredvp);
2369
2370 mp->mnt_crossref++;
2371 coveredvp->v_mountedhere = (struct mount *)0;
2372 CLR(coveredvp->v_flag, VMOUNT);
2373
2374 vnode_unlock(coveredvp);
2375 vnode_put(coveredvp);
2376 }
2377
2378 mount_list_lock();
2379 mp->mnt_vtable->vfc_refcount--;
2380 mount_list_unlock();
2381
2382 cache_purgevfs(mp); /* remove cache entries for this file sys */
2383 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2384 mount_lock(mp);
2385 mp->mnt_lflag |= MNT_LDEAD;
2386
2387 if (mp->mnt_lflag & MNT_LWAIT) {
2388 /*
2389 * do the wakeup here
2390 * in case we block in mount_refdrain
2391 * which will drop the mount lock
2392 * and allow anyone blocked in vfs_busy
2393 * to wakeup and see the LDEAD state
2394 */
2395 mp->mnt_lflag &= ~MNT_LWAIT;
2396 wakeup((caddr_t)mp);
2397 }
2398 mount_refdrain(mp);
2399
2400 /* free disk_conditioner_info structure for this mount */
2401 disk_conditioner_unmount(mp);
2402
2403 out:
2404 if (mp->mnt_lflag & MNT_LWAIT) {
2405 mp->mnt_lflag &= ~MNT_LWAIT;
2406 needwakeup = 1;
2407 }
2408
2409 #if CONFIG_TRIGGERS
2410 if (flags & MNT_NOBLOCK && p != kernproc) {
2411 // Restore P_NOREMOTEHANG bit to its previous value
2412 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2413 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2414 }
2415 }
2416
2417 /*
2418 * Callback and context are set together under the mount lock, and
2419 * never cleared, so we're safe to examine them here, drop the lock,
2420 * and call out.
2421 */
2422 if (mp->mnt_triggercallback != NULL) {
2423 mount_unlock(mp);
2424 if (error == 0) {
2425 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2426 } else if (did_vflush) {
2427 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2428 }
2429 } else {
2430 mount_unlock(mp);
2431 }
2432 #else
2433 mount_unlock(mp);
2434 #endif /* CONFIG_TRIGGERS */
2435
2436 lck_rw_done(&mp->mnt_rwlock);
2437
2438 if (needwakeup) {
2439 wakeup((caddr_t)mp);
2440 }
2441
2442 if (!error) {
2443 if ((coveredvp != NULLVP)) {
2444 vnode_t pvp = NULLVP;
2445
2446 /*
2447 * The covered vnode needs special handling. Trying to
2448 * get an iocount must not block here as this may lead
2449 * to deadlocks if the Filesystem to which the covered
2450 * vnode belongs is undergoing forced unmounts. Since we
2451 * hold a usecount, the vnode cannot be reused
2452 * (it can, however, still be terminated).
2453 */
2454 vnode_getalways(coveredvp);
2455
2456 mount_dropcrossref(mp, coveredvp, 0);
2457 /*
2458 * We'll _try_ to detect if this really needs to be
2459 * done. The coveredvp can only be in termination (or
2460 * terminated) if the coveredvp's mount point is in a
2461 * forced unmount (or has been) since we still hold the
2462 * ref.
2463 */
2464 if (!vnode_isrecycled(coveredvp)) {
2465 pvp = vnode_getparent(coveredvp);
2466 #if CONFIG_TRIGGERS
2467 if (coveredvp->v_resolve) {
2468 vnode_trigger_rearm(coveredvp, ctx);
2469 }
2470 #endif
2471 }
2472
2473 vnode_rele(coveredvp);
2474 vnode_put(coveredvp);
2475 coveredvp = NULLVP;
2476
2477 if (pvp) {
2478 lock_vnode_and_post(pvp, NOTE_WRITE);
2479 vnode_put(pvp);
2480 }
2481 } else if (mp->mnt_flag & MNT_ROOTFS) {
2482 mount_lock_destroy(mp);
2483 #if CONFIG_MACF
2484 mac_mount_label_destroy(mp);
2485 #endif
2486 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2487 } else {
2488 panic("dounmount: no coveredvp");
2489 }
2490 }
2491 return error;
2492 }
2493
2494 /*
2495 * Unmount any mounts in this filesystem.
2496 */
2497 void
2498 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2499 {
2500 mount_t smp;
2501 fsid_t *fsids, fsid;
2502 int fsids_sz;
2503 int count = 0, i, m = 0;
2504 vnode_t vp;
2505
2506 mount_list_lock();
2507
2508 // Get an array to hold the submounts' fsids.
2509 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2510 count++;
2511 fsids_sz = count * sizeof(fsid_t);
2512 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2513 if (fsids == NULL) {
2514 mount_list_unlock();
2515 goto out;
2516 }
2517 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2518
2519 /*
2520 * Fill the array with submount fsids.
2521 * Since mounts are always added to the tail of the mount list, the
2522 * list is always in mount order.
2523 * For each mount check if the mounted-on vnode belongs to a
2524 * mount that's already added to our array of mounts to be unmounted.
2525 */
2526 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2527 vp = smp->mnt_vnodecovered;
2528 if (vp == NULL) {
2529 continue;
2530 }
2531 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2532 for (i = 0; i <= m; i++) {
2533 if (fsids[i].val[0] == fsid.val[0] &&
2534 fsids[i].val[1] == fsid.val[1]) {
2535 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2536 break;
2537 }
2538 }
2539 }
2540 mount_list_unlock();
2541
2542 // Unmount the submounts in reverse order. Ignore errors.
2543 for (i = m; i > 0; i--) {
2544 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2545 if (smp) {
2546 mount_ref(smp, 0);
2547 mount_iterdrop(smp);
2548 (void) dounmount(smp, flags, 1, ctx);
2549 }
2550 }
2551 out:
2552 if (fsids) {
2553 FREE(fsids, M_TEMP);
2554 }
2555 }
2556
2557 void
2558 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2559 {
2560 vnode_lock(dp);
2561 mp->mnt_crossref--;
2562
2563 if (mp->mnt_crossref < 0) {
2564 panic("mount cross refs -ve");
2565 }
2566
2567 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2568 if (need_put) {
2569 vnode_put_locked(dp);
2570 }
2571 vnode_unlock(dp);
2572
2573 mount_lock_destroy(mp);
2574 #if CONFIG_MACF
2575 mac_mount_label_destroy(mp);
2576 #endif
2577 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2578 return;
2579 }
2580 if (need_put) {
2581 vnode_put_locked(dp);
2582 }
2583 vnode_unlock(dp);
2584 }
2585
2586
2587 /*
2588 * Sync each mounted filesystem.
2589 */
2590 #if DIAGNOSTIC
2591 int syncprt = 0;
2592 #endif
2593
2594 int print_vmpage_stat = 0;
2595
2596 /*
2597 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2598 * mounted read-write with the passed waitfor value.
2599 *
2600 * Parameters: mp mount-point descriptor per mounted file-system instance.
2601 * arg user argument (please see below)
2602 *
2603 * The user argument is a pointer to a 32-bit unsigned integer which describes
2604 * the type of waitfor value to set for calling VFS_SYNC(). If the user argument is
2605 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2606 * waitfor value.
2607 *
2608 * Returns: VFS_RETURNED
2609 */
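/*
 * For example, a kernel caller wanting a synchronous flush of every
 * writable volume could drive this callback as follows (illustrative
 * sketch only):
 *
 *	uint32_t waitfor = MNT_WAIT;
 *	vfs_iterate(LK_NOWAIT, sync_callback, &waitfor);
 */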
2610 static int
2611 sync_callback(mount_t mp, void *arg)
2612 {
2613 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2614 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2615 unsigned waitfor = MNT_NOWAIT;
2616
2617 if (arg) {
2618 waitfor = *(uint32_t*)arg;
2619 }
2620
2621 /* Sanity check for flags - these are the only valid combinations for the flag bits */
2622 if (waitfor != MNT_WAIT &&
2623 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2624 waitfor != MNT_NOWAIT &&
2625 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2626 waitfor != MNT_DWAIT &&
2627 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2628 panic("Passed inappropriate waitfor %u to "
2629 "sync_callback()", waitfor);
2630 }
2631
2632 mp->mnt_flag &= ~MNT_ASYNC;
2633 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2634 if (asyncflag) {
2635 mp->mnt_flag |= MNT_ASYNC;
2636 }
2637 }
2638
2639 return VFS_RETURNED;
2640 }
2641
2642 /* ARGSUSED */
2643 int
2644 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2645 {
2646 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2647
2648 if (print_vmpage_stat) {
2649 vm_countdirtypages();
2650 }
2651
2652 #if DIAGNOSTIC
2653 if (syncprt) {
2654 vfs_bufstats();
2655 }
2656 #endif /* DIAGNOSTIC */
2657 return 0;
2658 }
2659
2660 typedef enum {
2661 SYNC_ALL = 0,
2662 SYNC_ONLY_RELIABLE_MEDIA = 1,
2663 SYNC_ONLY_UNRELIABLE_MEDIA = 2
2664 } sync_type_t;
2665
2666 static int
2667 sync_internal_callback(mount_t mp, void *arg)
2668 {
2669 if (arg) {
2670 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2671 (mp->mnt_flag & MNT_LOCAL);
2672 sync_type_t sync_type = *((sync_type_t *)arg);
2673
2674 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2675 return VFS_RETURNED;
2676 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2677 return VFS_RETURNED;
2678 }
2679 }
2680
2681 (void)sync_callback(mp, NULL);
2682
2683 return VFS_RETURNED;
2684 }
2685
2686 int sync_thread_state = 0;
2687 int sync_timeout_seconds = 5;
2688
2689 #define SYNC_THREAD_RUN 0x0001
2690 #define SYNC_THREAD_RUNNING 0x0002
2691
2692 static void
2693 sync_thread(__unused void *arg, __unused wait_result_t wr)
2694 {
2695 sync_type_t sync_type;
2696
2697 lck_mtx_lock(sync_mtx_lck);
2698 while (sync_thread_state & SYNC_THREAD_RUN) {
2699 sync_thread_state &= ~SYNC_THREAD_RUN;
2700 lck_mtx_unlock(sync_mtx_lck);
2701
2702 sync_type = SYNC_ONLY_RELIABLE_MEDIA;
2703 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2704 sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
2705 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2706
2707 lck_mtx_lock(sync_mtx_lck);
2708 }
2709 /*
2710 * This wakeup _has_ to be issued before the lock is released otherwise
2711 * we may end up waking up a thread in sync_internal which is
2712 * expecting a wakeup from a thread it just created and not from this
2713 * thread which is about to exit.
2714 */
2715 wakeup(&sync_thread_state);
2716 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2717 lck_mtx_unlock(sync_mtx_lck);
2718
2719 if (print_vmpage_stat) {
2720 vm_countdirtypages();
2721 }
2722
2723 #if DIAGNOSTIC
2724 if (syncprt) {
2725 vfs_bufstats();
2726 }
2727 #endif /* DIAGNOSTIC */
2728 }
2729
2730 struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2731
2732 /*
2733 * An in-kernel sync for power management to call.
2734 * This function always returns within sync_timeout_seconds seconds.
2735 */
2736 __private_extern__ int
2737 sync_internal(void)
2738 {
2739 thread_t thd;
2740 int error;
2741 int thread_created = FALSE;
2742 struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
2743
2744 lck_mtx_lock(sync_mtx_lck);
2745 sync_thread_state |= SYNC_THREAD_RUN;
2746 if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
2747 int kr;
2748
2749 sync_thread_state |= SYNC_THREAD_RUNNING;
2750 kr = kernel_thread_start(sync_thread, NULL, &thd);
2751 if (kr != KERN_SUCCESS) {
2752 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2753 lck_mtx_unlock(sync_mtx_lck);
2754 printf("sync_thread failed\n");
2755 return 0;
2756 }
2757 thread_created = TRUE;
2758 }
2759
2760 error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
2761 (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2762 if (error) {
2763 struct timeval now;
2764
2765 microtime(&now);
2766 if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
2767 printf("sync timed out: %d sec\n", sync_timeout_seconds);
2768 sync_timeout_last_print.tv_sec = now.tv_sec;
2769 }
2770 }
2771
2772 if (thread_created) {
2773 thread_deallocate(thd);
2774 }
2775
2776 return 0;
2777 } /* end of sync_internal call */
2778
2779 /*
2780 * Change filesystem quotas.
2781 */
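/*
 * Illustrative userspace sketch (the path "/" and uid 501 are example
 * values); Q_GETQUOTA copies the limits and current usage into a
 * struct dqblk:
 *
 *	#include <sys/quota.h>
 *
 *	struct dqblk dq;
 *	if (quotactl("/", QCMD(Q_GETQUOTA, USRQUOTA), 501, (caddr_t)&dq) == 0)
 *		... dq.dqb_curbytes is the current usage in bytes ...
 */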
2782 #if QUOTA
2783 int
2784 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2785 {
2786 struct mount *mp;
2787 int error, quota_cmd, quota_status = 0;
2788 caddr_t datap;
2789 size_t fnamelen;
2790 struct nameidata nd;
2791 vfs_context_t ctx = vfs_context_current();
2792 struct dqblk my_dqblk = {};
2793
2794 AUDIT_ARG(uid, uap->uid);
2795 AUDIT_ARG(cmd, uap->cmd);
2796 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2797 uap->path, ctx);
2798 error = namei(&nd);
2799 if (error) {
2800 return error;
2801 }
2802 mp = nd.ni_vp->v_mount;
2803 mount_ref(mp, 0);
2804 vnode_put(nd.ni_vp);
2805 nameidone(&nd);
2806
2807 /* copyin any data we will need for downstream code */
2808 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2809
2810 switch (quota_cmd) {
2811 case Q_QUOTAON:
2812 /* uap->arg specifies a file from which to take the quotas */
2813 fnamelen = MAXPATHLEN;
2814 datap = kalloc(MAXPATHLEN);
2815 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2816 break;
2817 case Q_GETQUOTA:
2818 /* uap->arg is a pointer to a dqblk structure. */
2819 datap = (caddr_t) &my_dqblk;
2820 break;
2821 case Q_SETQUOTA:
2822 case Q_SETUSE:
2823 /* uap->arg is a pointer to a dqblk structure. */
2824 datap = (caddr_t) &my_dqblk;
2825 if (proc_is64bit(p)) {
2826 struct user_dqblk my_dqblk64;
2827 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
2828 if (error == 0) {
2829 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2830 }
2831 } else {
2832 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
2833 }
2834 break;
2835 case Q_QUOTASTAT:
2836 /* uap->arg is a pointer to an integer */
2837 datap = (caddr_t) &quota_status;
2838 break;
2839 default:
2840 datap = NULL;
2841 break;
2842 } /* switch */
2843
2844 if (error == 0) {
2845 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2846 }
2847
2848 switch (quota_cmd) {
2849 case Q_QUOTAON:
2850 if (datap != NULL) {
2851 kfree(datap, MAXPATHLEN);
2852 }
2853 break;
2854 case Q_GETQUOTA:
2855 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2856 if (error == 0) {
2857 if (proc_is64bit(p)) {
2858 struct user_dqblk my_dqblk64;
2859
2860 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
2861 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2862 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
2863 } else {
2864 error = copyout(datap, uap->arg, sizeof(struct dqblk));
2865 }
2866 }
2867 break;
2868 case Q_QUOTASTAT:
2869 /* uap->arg is a pointer to an integer */
2870 if (error == 0) {
2871 error = copyout(datap, uap->arg, sizeof(quota_status));
2872 }
2873 break;
2874 default:
2875 break;
2876 } /* switch */
2877
2878 mount_drop(mp, 0);
2879 return error;
2880 }
2881 #else
2882 int
2883 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2884 {
2885 return EOPNOTSUPP;
2886 }
2887 #endif /* QUOTA */
2888
2889 /*
2890 * Get filesystem statistics.
2891 *
2892 * Returns: 0 Success
2893 * namei:???
2894 * vfs_update_vfsstat:???
2895 * munge_statfs:EFAULT
2896 */
2897 /* ARGSUSED */
2898 int
2899 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2900 {
2901 struct mount *mp;
2902 struct vfsstatfs *sp;
2903 int error;
2904 struct nameidata nd;
2905 vfs_context_t ctx = vfs_context_current();
2906 vnode_t vp;
2907
2908 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2909 UIO_USERSPACE, uap->path, ctx);
2910 error = namei(&nd);
2911 if (error != 0) {
2912 return error;
2913 }
2914 vp = nd.ni_vp;
2915 mp = vp->v_mount;
2916 sp = &mp->mnt_vfsstat;
2917 nameidone(&nd);
2918
2919 #if CONFIG_MACF
2920 error = mac_mount_check_stat(ctx, mp);
2921 if (error != 0) {
2922 vnode_put(vp);
2923 return error;
2924 }
2925 #endif
2926
2927 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2928 if (error != 0) {
2929 vnode_put(vp);
2930 return error;
2931 }
2932
2933 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2934 vnode_put(vp);
2935 return error;
2936 }
2937
2938 /*
2939 * Get filesystem statistics.
2940 */
2941 /* ARGSUSED */
2942 int
2943 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2944 {
2945 vnode_t vp;
2946 struct mount *mp;
2947 struct vfsstatfs *sp;
2948 int error;
2949
2950 AUDIT_ARG(fd, uap->fd);
2951
2952 if ((error = file_vnode(uap->fd, &vp))) {
2953 return error;
2954 }
2955
2956 error = vnode_getwithref(vp);
2957 if (error) {
2958 file_drop(uap->fd);
2959 return error;
2960 }
2961
2962 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2963
2964 mp = vp->v_mount;
2965 if (!mp) {
2966 error = EBADF;
2967 goto out;
2968 }
2969
2970 #if CONFIG_MACF
2971 error = mac_mount_check_stat(vfs_context_current(), mp);
2972 if (error != 0) {
2973 goto out;
2974 }
2975 #endif
2976
2977 sp = &mp->mnt_vfsstat;
2978 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2979 goto out;
2980 }
2981
2982 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2983
2984 out:
2985 file_drop(uap->fd);
2986 vnode_put(vp);
2987
2988 return error;
2989 }
2990
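/*
 * Populate a struct statfs64 from the mount's cached vfsstatfs. Only the
 * cached values are copied; callers that need current numbers (for example
 * statfs64() and fstatfs64() below) refresh the cache with
 * vfs_update_vfsstat() first.
 */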
2991 void
2992 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
2993 {
2994 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
2995
2996 bzero(sfs, sizeof(*sfs));
2997
2998 sfs->f_bsize = vsfs->f_bsize;
2999 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3000 sfs->f_blocks = vsfs->f_blocks;
3001 sfs->f_bfree = vsfs->f_bfree;
3002 sfs->f_bavail = vsfs->f_bavail;
3003 sfs->f_files = vsfs->f_files;
3004 sfs->f_ffree = vsfs->f_ffree;
3005 sfs->f_fsid = vsfs->f_fsid;
3006 sfs->f_owner = vsfs->f_owner;
3007 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3008 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3009 sfs->f_fssubtype = vsfs->f_fssubtype;
3010 sfs->f_flags_ext = ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS)) ? MNT_EXT_ROOT_DATA_VOL : 0;
3011 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3012 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3013 } else {
3014 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3015 }
3016 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3017 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3018 }
3019
3020 /*
3021 * Get file system statistics in 64-bit mode
3022 */
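/*
 * Illustrative userspace sketch (assuming the 64-bit-inode variant of
 * statfs(2), whose struct statfs matches this layout; "/" is an example
 * path):
 *
 *	#include <sys/param.h>
 *	#include <sys/mount.h>
 *	#include <stdio.h>
 *
 *	struct statfs sb;
 *	if (statfs("/", &sb) == 0)
 *		printf("%s on %s (%s)\n", sb.f_mntfromname,
 *		    sb.f_mntonname, sb.f_fstypename);
 */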
3023 int
3024 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3025 {
3026 struct mount *mp;
3027 int error;
3028 struct nameidata nd;
3029 struct statfs64 sfs;
3030 vfs_context_t ctxp = vfs_context_current();
3031 vnode_t vp;
3032
3033 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3034 UIO_USERSPACE, uap->path, ctxp);
3035 error = namei(&nd);
3036 if (error != 0) {
3037 return error;
3038 }
3039 vp = nd.ni_vp;
3040 mp = vp->v_mount;
3041 nameidone(&nd);
3042
3043 #if CONFIG_MACF
3044 error = mac_mount_check_stat(ctxp, mp);
3045 if (error != 0) {
3046 vnode_put(vp);
3047 return error;
3048 }
3049 #endif
3050
3051 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3052 if (error != 0) {
3053 vnode_put(vp);
3054 return error;
3055 }
3056
3057 vfs_get_statfs64(mp, &sfs);
3058 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3059 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3060 /* This process does not want to see a separate data volume mountpoint */
3061 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3062 }
3063 error = copyout(&sfs, uap->buf, sizeof(sfs));
3064 vnode_put(vp);
3065
3066 return error;
3067 }
3068
3069 /*
3070 * Get file system statistics in 64-bit mode
3071 */
3072 int
3073 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3074 {
3075 struct vnode *vp;
3076 struct mount *mp;
3077 struct statfs64 sfs;
3078 int error;
3079
3080 AUDIT_ARG(fd, uap->fd);
3081
3082 if ((error = file_vnode(uap->fd, &vp))) {
3083 return error;
3084 }
3085
3086 error = vnode_getwithref(vp);
3087 if (error) {
3088 file_drop(uap->fd);
3089 return error;
3090 }
3091
3092 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3093
3094 mp = vp->v_mount;
3095 if (!mp) {
3096 error = EBADF;
3097 goto out;
3098 }
3099
3100 #if CONFIG_MACF
3101 error = mac_mount_check_stat(vfs_context_current(), mp);
3102 if (error != 0) {
3103 goto out;
3104 }
3105 #endif
3106
3107 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3108 goto out;
3109 }
3110
3111 vfs_get_statfs64(mp, &sfs);
3112 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3113 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3114 /* This process does not want to see a separate data volume mountpoint */
3115 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3116 }
3117 error = copyout(&sfs, uap->buf, sizeof(sfs));
3118
3119 out:
3120 file_drop(uap->fd);
3121 vnode_put(vp);
3122
3123 return error;
3124 }
3125
3126 struct getfsstat_struct {
3127 user_addr_t sfsp;
3128 user_addr_t *mp;
3129 int count;
3130 int maxcount;
3131 int flags;
3132 int error;
3133 };
3134
3135
3136 static int
3137 getfsstat_callback(mount_t mp, void * arg)
3138 {
3139 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3140 struct vfsstatfs *sp;
3141 int error, my_size;
3142 vfs_context_t ctx = vfs_context_current();
3143
3144 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3145 #if CONFIG_MACF
3146 error = mac_mount_check_stat(ctx, mp);
3147 if (error != 0) {
3148 fstp->error = error;
3149 return VFS_RETURNED_DONE;
3150 }
3151 #endif
3152 sp = &mp->mnt_vfsstat;
3153 /*
3154 * If MNT_NOWAIT is specified, do not refresh the
3155 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3156 */
3157 if ((mp->mnt_lflag & MNT_LDEAD) ||
3158 (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3159 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3160 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3161 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3162 return VFS_RETURNED;
3163 }
3164
3165 /*
3166 * Need to handle LP64 version of struct statfs
3167 */
3168 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3169 if (error) {
3170 fstp->error = error;
3171 return VFS_RETURNED_DONE;
3172 }
3173 fstp->sfsp += my_size;
3174
3175 if (fstp->mp) {
3176 #if CONFIG_MACF
3177 error = mac_mount_label_get(mp, *fstp->mp);
3178 if (error) {
3179 fstp->error = error;
3180 return VFS_RETURNED_DONE;
3181 }
3182 #endif
3183 fstp->mp++;
3184 }
3185 }
3186 fstp->count++;
3187 return VFS_RETURNED;
3188 }
3189
3190 /*
3191 * Get statistics on all filesystems.
3192 */
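/*
 * Illustrative userspace sketch: pass a NULL buffer first to learn how
 * many mounts exist, then fetch them with MNT_NOWAIT (which uses the
 * cached statistics):
 *
 *	#include <sys/param.h>
 *	#include <sys/ucred.h>
 *	#include <sys/mount.h>
 *	#include <stdlib.h>
 *
 *	int n = getfsstat(NULL, 0, MNT_NOWAIT);
 *	struct statfs *buf = calloc(n, sizeof(*buf));
 *	n = getfsstat(buf, n * sizeof(*buf), MNT_NOWAIT);
 */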
3193 int
3194 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3195 {
3196 struct __mac_getfsstat_args muap;
3197
3198 muap.buf = uap->buf;
3199 muap.bufsize = uap->bufsize;
3200 muap.mac = USER_ADDR_NULL;
3201 muap.macsize = 0;
3202 muap.flags = uap->flags;
3203
3204 return __mac_getfsstat(p, &muap, retval);
3205 }
3206
3207 /*
3208 * __mac_getfsstat: Get MAC-related file system statistics
3209 *
3210 * Parameters: p (ignored)
3211 * uap User argument descriptor (see below)
3212 * retval Count of file system statistics (N stats)
3213 *
3214 * Indirect: uap->bufsize Buffer size
3215 * uap->macsize MAC info size
3216 * uap->buf Buffer where information will be returned
3217 * uap->mac MAC info
3218 * uap->flags File system flags
3219 *
3220 *
3221 * Returns: 0 Success
3222 * !0 Not success
3223 *
3224 */
3225 int
3226 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3227 {
3228 user_addr_t sfsp;
3229 user_addr_t *mp;
3230 size_t count, maxcount, bufsize, macsize;
3231 struct getfsstat_struct fst;
3232
3233 if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3234 return EINVAL;
3235 }
3236
3237 bufsize = (size_t) uap->bufsize;
3238 macsize = (size_t) uap->macsize;
3239
3240 if (IS_64BIT_PROCESS(p)) {
3241 maxcount = bufsize / sizeof(struct user64_statfs);
3242 } else {
3243 maxcount = bufsize / sizeof(struct user32_statfs);
3244 }
3245 sfsp = uap->buf;
3246 count = 0;
3247
3248 mp = NULL;
3249
3250 #if CONFIG_MACF
3251 if (uap->mac != USER_ADDR_NULL) {
3252 u_int32_t *mp0;
3253 int error;
3254 unsigned int i;
3255
3256 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3257 if (count != maxcount) {
3258 return EINVAL;
3259 }
3260
3261 /* Copy in the array */
3262 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
3263 if (mp0 == NULL) {
3264 return ENOMEM;
3265 }
3266
3267 error = copyin(uap->mac, mp0, macsize);
3268 if (error) {
3269 FREE(mp0, M_MACTEMP);
3270 return error;
3271 }
3272
3273 /* Normalize to an array of user_addr_t */
3274 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
3275 if (mp == NULL) {
3276 FREE(mp0, M_MACTEMP);
3277 return ENOMEM;
3278 }
3279
3280 for (i = 0; i < count; i++) {
3281 if (IS_64BIT_PROCESS(p)) {
3282 mp[i] = ((user_addr_t *)mp0)[i];
3283 } else {
3284 mp[i] = (user_addr_t)mp0[i];
3285 }
3286 }
3287 FREE(mp0, M_MACTEMP);
3288 }
3289 #endif
3290
3291
3292 fst.sfsp = sfsp;
3293 fst.mp = mp;
3294 fst.flags = uap->flags;
3295 fst.count = 0;
3296 fst.error = 0;
3297 fst.maxcount = maxcount;
3298
3299
3300 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3301
3302 if (mp) {
3303 FREE(mp, M_MACTEMP);
3304 }
3305
3306 if (fst.error) {
3307 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3308 return fst.error;
3309 }
3310
3311 if (fst.sfsp && fst.count > fst.maxcount) {
3312 *retval = fst.maxcount;
3313 } else {
3314 *retval = fst.count;
3315 }
3316 return 0;
3317 }
3318
3319 static int
3320 getfsstat64_callback(mount_t mp, void * arg)
3321 {
3322 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3323 struct vfsstatfs *sp;
3324 struct statfs64 sfs;
3325 int error;
3326
3327 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3328 #if CONFIG_MACF
3329 error = mac_mount_check_stat(vfs_context_current(), mp);
3330 if (error != 0) {
3331 fstp->error = error;
3332 return VFS_RETURNED_DONE;
3333 }
3334 #endif
3335 sp = &mp->mnt_vfsstat;
3336 /*
3337 * If MNT_NOWAIT is specified, do not refresh the fsstat
3338 * cache. MNT_WAIT overrides MNT_NOWAIT.
3339 *
3340 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3341 * getfsstat, since the constants are out of the same
3342 * namespace.
3343 */
3344 if ((mp->mnt_lflag & MNT_LDEAD) ||
3345 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3346 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3347 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3348 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3349 return VFS_RETURNED;
3350 }
3351
3352 vfs_get_statfs64(mp, &sfs);
3353 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3354 if (error) {
3355 fstp->error = error;
3356 return VFS_RETURNED_DONE;
3357 }
3358 fstp->sfsp += sizeof(sfs);
3359 }
3360 fstp->count++;
3361 return VFS_RETURNED;
3362 }
3363
3364 /*
3365 * Get statistics on all file systems in 64 bit mode.
3366 */
3367 int
3368 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3369 {
3370 user_addr_t sfsp;
3371 int count, maxcount;
3372 struct getfsstat_struct fst;
3373
3374 maxcount = uap->bufsize / sizeof(struct statfs64);
3375
3376 sfsp = uap->buf;
3377 count = 0;
3378
3379 fst.sfsp = sfsp;
3380 fst.flags = uap->flags;
3381 fst.count = 0;
3382 fst.error = 0;
3383 fst.maxcount = maxcount;
3384
3385 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3386
3387 if (fst.error) {
3388 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3389 return fst.error;
3390 }
3391
3392 if (fst.sfsp && fst.count > fst.maxcount) {
3393 *retval = fst.maxcount;
3394 } else {
3395 *retval = fst.count;
3396 }
3397
3398 return 0;
3399 }
3400
3401 /*
3402 * Gets the vnode associated with the file descriptor passed
3403 * as input.
3404 *
3405 * INPUT
3406 * ctx - vfs context of caller
3407 * fd - file descriptor for which vnode is required.
3408 * vpp - Pointer to pointer to vnode to be returned.
3409 *
3410 * The vnode is returned with an iocount, so any vnode obtained
3411 * by this call needs a vnode_put().
3412 *
3413 */
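/*
 * Typical in-kernel usage (illustrative sketch):
 *
 *	vnode_t vp;
 *
 *	if (vnode_getfromfd(ctx, fd, &vp) == 0) {
 *		... operate on vp while holding the iocount ...
 *		vnode_put(vp);
 *	}
 */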
3414 int
3415 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3416 {
3417 int error;
3418 vnode_t vp;
3419 struct fileproc *fp;
3420 proc_t p = vfs_context_proc(ctx);
3421
3422 *vpp = NULLVP;
3423
3424 error = fp_getfvp(p, fd, &fp, &vp);
3425 if (error) {
3426 return error;
3427 }
3428
3429 error = vnode_getwithref(vp);
3430 if (error) {
3431 (void)fp_drop(p, fd, fp, 0);
3432 return error;
3433 }
3434
3435 (void)fp_drop(p, fd, fp, 0);
3436 *vpp = vp;
3437 return error;
3438 }
3439
3440 /*
3441 * Wrapper function around namei to start lookup from a directory
3442 * specified by a file descriptor ni_dirfd.
3443 *
3444 * In addition to all the errors returned by namei, this call can
3445 * return ENOTDIR if the file descriptor does not refer to a directory,
3446 * and EBADF if the file descriptor is not valid.
3447 */
3448 int
3449 nameiat(struct nameidata *ndp, int dirfd)
3450 {
3451 if ((dirfd != AT_FDCWD) &&
3452 !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
3453 !(ndp->ni_cnd.cn_flags & USEDVP)) {
3454 int error = 0;
3455 char c;
3456
3457 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3458 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3459 if (error) {
3460 return error;
3461 }
3462 } else {
3463 c = *((char *)(ndp->ni_dirp));
3464 }
3465
3466 if (c != '/') {
3467 vnode_t dvp_at;
3468
3469 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3470 &dvp_at);
3471 if (error) {
3472 return error;
3473 }
3474
3475 if (vnode_vtype(dvp_at) != VDIR) {
3476 vnode_put(dvp_at);
3477 return ENOTDIR;
3478 }
3479
3480 ndp->ni_dvp = dvp_at;
3481 ndp->ni_cnd.cn_flags |= USEDVP;
3482 error = namei(ndp);
3483 ndp->ni_cnd.cn_flags &= ~USEDVP;
3484 vnode_put(dvp_at);
3485 return error;
3486 }
3487 }
3488
3489 return namei(ndp);
3490 }
3491
3492 /*
3493 * Change current working directory to a given file descriptor.
3494 */
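/*
 * Illustrative userspace sketch of the fchdir(2) path serviced here
 * ("/tmp" is an example directory):
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int dfd = open("/tmp", O_RDONLY | O_DIRECTORY);
 *	if (dfd == -1 || fchdir(dfd) == -1)
 *		perror("fchdir");
 */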
3495 /* ARGSUSED */
3496 static int
3497 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3498 {
3499 struct filedesc *fdp = p->p_fd;
3500 vnode_t vp;
3501 vnode_t tdp;
3502 vnode_t tvp;
3503 struct mount *mp;
3504 int error;
3505 vfs_context_t ctx = vfs_context_current();
3506
3507 AUDIT_ARG(fd, uap->fd);
3508 if (per_thread && uap->fd == -1) {
3509 /*
3510 * Switching back from per-thread to per-process CWD; verify we
3511 * in fact have one before proceeding. The only success case
3512 * for this code path is to return 0 preemptively after zapping
3513 * the thread structure contents.
3514 */
3515 thread_t th = vfs_context_thread(ctx);
3516 if (th) {
3517 uthread_t uth = get_bsdthread_info(th);
3518 tvp = uth->uu_cdir;
3519 uth->uu_cdir = NULLVP;
3520 if (tvp != NULLVP) {
3521 vnode_rele(tvp);
3522 return 0;
3523 }
3524 }
3525 return EBADF;
3526 }
3527
3528 if ((error = file_vnode(uap->fd, &vp))) {
3529 return error;
3530 }
3531 if ((error = vnode_getwithref(vp))) {
3532 file_drop(uap->fd);
3533 return error;
3534 }
3535
3536 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3537
3538 if (vp->v_type != VDIR) {
3539 error = ENOTDIR;
3540 goto out;
3541 }
3542
3543 #if CONFIG_MACF
3544 error = mac_vnode_check_chdir(ctx, vp);
3545 if (error) {
3546 goto out;
3547 }
3548 #endif
3549 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3550 if (error) {
3551 goto out;
3552 }
3553
3554 while (!error && (mp = vp->v_mountedhere) != NULL) {
3555 if (vfs_busy(mp, LK_NOWAIT)) {
3556 error = EACCES;
3557 goto out;
3558 }
3559 error = VFS_ROOT(mp, &tdp, ctx);
3560 vfs_unbusy(mp);
3561 if (error) {
3562 break;
3563 }
3564 vnode_put(vp);
3565 vp = tdp;
3566 }
3567 if (error) {
3568 goto out;
3569 }
3570 if ((error = vnode_ref(vp))) {
3571 goto out;
3572 }
3573 vnode_put(vp);
3574
3575 if (per_thread) {
3576 thread_t th = vfs_context_thread(ctx);
3577 if (th) {
3578 uthread_t uth = get_bsdthread_info(th);
3579 tvp = uth->uu_cdir;
3580 uth->uu_cdir = vp;
3581 OSBitOrAtomic(P_THCWD, &p->p_flag);
3582 } else {
3583 vnode_rele(vp);
3584 return ENOENT;
3585 }
3586 } else {
3587 proc_fdlock(p);
3588 tvp = fdp->fd_cdir;
3589 fdp->fd_cdir = vp;
3590 proc_fdunlock(p);
3591 }
3592
3593 if (tvp) {
3594 vnode_rele(tvp);
3595 }
3596 file_drop(uap->fd);
3597
3598 return 0;
3599 out:
3600 vnode_put(vp);
3601 file_drop(uap->fd);
3602
3603 return error;
3604 }
3605
3606 int
3607 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3608 {
3609 return common_fchdir(p, uap, 0);
3610 }
3611
3612 int
3613 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3614 {
3615 return common_fchdir(p, (void *)uap, 1);
3616 }
3617
3618
3619 /*
3620 * Change current working directory (".").
3621 *
3622 * Returns: 0 Success
3623 * change_dir:ENOTDIR
3624 * change_dir:???
3625 * vnode_ref:ENOENT No such file or directory
3626 */
3627 /* ARGSUSED */
3628 int
3629 chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
3630 {
3631 struct filedesc *fdp = p->p_fd;
3632 int error;
3633 vnode_t tvp;
3634
3635 error = change_dir(ndp, ctx);
3636 if (error) {
3637 return error;
3638 }
3639 if ((error = vnode_ref(ndp->ni_vp))) {
3640 vnode_put(ndp->ni_vp);
3641 return error;
3642 }
3643 /*
3644 * drop the iocount we picked up in change_dir
3645 */
3646 vnode_put(ndp->ni_vp);
3647
3648 if (per_thread) {
3649 thread_t th = vfs_context_thread(ctx);
3650 if (th) {
3651 uthread_t uth = get_bsdthread_info(th);
3652 tvp = uth->uu_cdir;
3653 uth->uu_cdir = ndp->ni_vp;
3654 OSBitOrAtomic(P_THCWD, &p->p_flag);
3655 } else {
3656 vnode_rele(ndp->ni_vp);
3657 return ENOENT;
3658 }
3659 } else {
3660 proc_fdlock(p);
3661 tvp = fdp->fd_cdir;
3662 fdp->fd_cdir = ndp->ni_vp;
3663 proc_fdunlock(p);
3664 }
3665
3666 if (tvp) {
3667 vnode_rele(tvp);
3668 }
3669
3670 return 0;
3671 }
3672
3673
3674 /*
3675 * Change current working directory (".").
3676 *
3677 * Returns: 0 Success
3678 * chdir_internal:ENOTDIR
3679 * chdir_internal:ENOENT No such file or directory
3680 * chdir_internal:???
3681 */
3682 /* ARGSUSED */
3683 static int
3684 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3685 {
3686 struct nameidata nd;
3687 vfs_context_t ctx = vfs_context_current();
3688
3689 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3690 UIO_USERSPACE, uap->path, ctx);
3691
3692 return chdir_internal(p, ctx, &nd, per_thread);
3693 }
3694
3695
3696 /*
3697 * chdir
3698 *
3699 * Change current working directory (".") for the entire process
3700 *
3701 * Parameters: p Process requesting the call
3702 * uap User argument descriptor (see below)
3703 * retval (ignored)
3704 *
3705 * Indirect parameters: uap->path Directory path
3706 *
3707 * Returns: 0 Success
3708 * common_chdir: ENOTDIR
3709 * common_chdir: ENOENT No such file or directory
3710 * common_chdir: ???
3711 *
3712 */
3713 int
3714 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3715 {
3716 return common_chdir(p, (void *)uap, 0);
3717 }
3718
3719 /*
3720 * __pthread_chdir
3721 *
3722 * Change current working directory (".") for a single thread
3723 *
3724 * Parameters: p Process requesting the call
3725 * uap User argument descriptor (see below)
3726 * retval (ignored)
3727 *
3728 * Indirect parameters: uap->path Directory path
3729 *
3730 * Returns: 0 Success
3731 * common_chdir: ENOTDIR
3732 * common_chdir: ENOENT No such file or directory
3733 * common_chdir: ???
3734 *
3735 */
3736 int
3737 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3738 {
3739 return common_chdir(p, (void *)uap, 1);
3740 }
3741
3742
3743 /*
3744 * Change notion of root (``/'') directory.
3745 */
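/*
 * Illustrative userspace sketch (the jail path is hypothetical; the
 * caller must be superuser and typically chdir()s into the new root):
 *
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	if (chroot("/private/var/jail") == -1 || chdir("/") == -1)
 *		perror("chroot");
 */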
3746 /* ARGSUSED */
3747 int
3748 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3749 {
3750 struct filedesc *fdp = p->p_fd;
3751 int error;
3752 struct nameidata nd;
3753 vnode_t tvp;
3754 vfs_context_t ctx = vfs_context_current();
3755
3756 if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
3757 return error;
3758 }
3759
3760 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3761 UIO_USERSPACE, uap->path, ctx);
3762 error = change_dir(&nd, ctx);
3763 if (error) {
3764 return error;
3765 }
3766
3767 #if CONFIG_MACF
3768 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3769 &nd.ni_cnd);
3770 if (error) {
3771 vnode_put(nd.ni_vp);
3772 return error;
3773 }
3774 #endif
3775
3776 if ((error = vnode_ref(nd.ni_vp))) {
3777 vnode_put(nd.ni_vp);
3778 return error;
3779 }
3780 vnode_put(nd.ni_vp);
3781
3782 proc_fdlock(p);
3783 tvp = fdp->fd_rdir;
3784 fdp->fd_rdir = nd.ni_vp;
3785 fdp->fd_flags |= FD_CHROOT;
3786 proc_fdunlock(p);
3787
3788 if (tvp != NULL) {
3789 vnode_rele(tvp);
3790 }
3791
3792 return 0;
3793 }
3794
3795 /*
3796 * Common routine for chroot and chdir.
3797 *
3798 * Returns: 0 Success
3799 * ENOTDIR Not a directory
3800 * namei:??? [anything namei can return]
3801 * vnode_authorize:??? [anything vnode_authorize can return]
3802 */
3803 static int
3804 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3805 {
3806 vnode_t vp;
3807 int error;
3808
3809 if ((error = namei(ndp))) {
3810 return error;
3811 }
3812 nameidone(ndp);
3813 vp = ndp->ni_vp;
3814
3815 if (vp->v_type != VDIR) {
3816 vnode_put(vp);
3817 return ENOTDIR;
3818 }
3819
3820 #if CONFIG_MACF
3821 error = mac_vnode_check_chdir(ctx, vp);
3822 if (error) {
3823 vnode_put(vp);
3824 return error;
3825 }
3826 #endif
3827
3828 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3829 if (error) {
3830 vnode_put(vp);
3831 return error;
3832 }
3833
3834 return error;
3835 }
3836
3837 /*
3838 * Allocate the per-fd vnode data (for directories) associated with the file glob.
3839 */
3840 struct fd_vn_data *
3841 fg_vn_data_alloc(void)
3842 {
3843 struct fd_vn_data *fvdata;
3844
3845 /* Allocate per fd vnode data */
3846 MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3847 M_FD_VN_DATA, M_WAITOK | M_ZERO);
3848 lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3849 return fvdata;
3850 }
3851
3852 /*
3853 * Free the vnode data (for directories) associated with the file glob.
3854 */
3855 void
3856 fg_vn_data_free(void *fgvndata)
3857 {
3858 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3859
3860 if (fvdata->fv_buf) {
3861 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3862 }
3863 lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3864 FREE(fvdata, M_FD_VN_DATA);
3865 }
3866
3867 /*
3868 * Check permissions, allocate an open file structure,
3869 * and call the device open routine if any.
3870 *
3871 * Returns: 0 Success
3872 * EINVAL
3873 * EINTR
3874 * falloc:ENFILE
3875 * falloc:EMFILE
3876 * falloc:ENOMEM
3877 * vn_open_auth:???
3878 * dupfdopen:???
3879 * VNOP_ADVLOCK:???
3880 * vnode_setsize:???
3881 *
3882 * XXX Need to implement uid, gid
3883 */
3884 int
3885 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3886 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3887 int32_t *retval)
3888 {
3889 proc_t p = vfs_context_proc(ctx);
3890 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3891 struct fileproc *fp;
3892 vnode_t vp;
3893 int flags, oflags;
3894 int type, indx, error;
3895 struct flock lf;
3896 struct vfs_context context;
3897
3898 oflags = uflags;
3899
3900 if ((oflags & O_ACCMODE) == O_ACCMODE) {
3901 return EINVAL;
3902 }
3903
3904 flags = FFLAGS(uflags);
3905 CLR(flags, FENCRYPTED);
3906 CLR(flags, FUNENCRYPTED);
3907
3908 AUDIT_ARG(fflags, oflags);
3909 AUDIT_ARG(mode, vap->va_mode);
3910
3911 if ((error = falloc_withalloc(p,
3912 &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3913 return error;
3914 }
3915 uu->uu_dupfd = -indx - 1;
3916
3917 if ((error = vn_open_auth(ndp, &flags, vap))) {
3918 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) { /* XXX from fdopen */
3919 if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3920 fp_drop(p, indx, NULL, 0);
3921 *retval = indx;
3922 return 0;
3923 }
3924 }
3925 if (error == ERESTART) {
3926 error = EINTR;
3927 }
3928 fp_free(p, indx, fp);
3929 return error;
3930 }
3931 uu->uu_dupfd = 0;
3932 vp = ndp->ni_vp;
3933
3934 fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3935 fp->f_fglob->fg_ops = &vnops;
3936 fp->f_fglob->fg_data = (caddr_t)vp;
3937
3938 if (flags & (O_EXLOCK | O_SHLOCK)) {
3939 lf.l_whence = SEEK_SET;
3940 lf.l_start = 0;
3941 lf.l_len = 0;
3942 if (flags & O_EXLOCK) {
3943 lf.l_type = F_WRLCK;
3944 } else {
3945 lf.l_type = F_RDLCK;
3946 }
3947 type = F_FLOCK;
3948 if ((flags & FNONBLOCK) == 0) {
3949 type |= F_WAIT;
3950 }
3951 #if CONFIG_MACF
3952 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3953 F_SETLK, &lf);
3954 if (error) {
3955 goto bad;
3956 }
3957 #endif
3958 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
3959 goto bad;
3960 }
3961 fp->f_fglob->fg_flag |= FHASLOCK;
3962 }
3963
3964 /* try to truncate by setting the size attribute */
3965 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
3966 goto bad;
3967 }
3968
3969 /*
3970 * For directories we hold some additional information in the fd.
3971 */
3972 if (vnode_vtype(vp) == VDIR) {
3973 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3974 } else {
3975 fp->f_fglob->fg_vn_data = NULL;
3976 }
3977
3978 vnode_put(vp);
3979
3980 /*
3981 * The first terminal open (without O_NOCTTY) by a session leader
3982 * results in it being set as the controlling terminal.
3983 */
3984 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3985 !(flags & O_NOCTTY)) {
3986 int tmp = 0;
3987
3988 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3989 (caddr_t)&tmp, ctx);
3990 }
3991
3992 proc_fdlock(p);
3993 if (flags & O_CLOEXEC) {
3994 *fdflags(p, indx) |= UF_EXCLOSE;
3995 }
3996 if (flags & O_CLOFORK) {
3997 *fdflags(p, indx) |= UF_FORKCLOSE;
3998 }
3999 procfdtbl_releasefd(p, indx, NULL);
4000
4001 #if CONFIG_SECLUDED_MEMORY
4002 if (secluded_for_filecache &&
4003 FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
4004 vnode_vtype(vp) == VREG) {
4005 memory_object_control_t moc;
4006
4007 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
4008
4009 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
4010 /* nothing to do... */
4011 } else if (fp->f_fglob->fg_flag & FWRITE) {
4012 /* writable -> no longer eligible for secluded pages */
4013 memory_object_mark_eligible_for_secluded(moc,
4014 FALSE);
4015 } else if (secluded_for_filecache == 1) {
4016 char pathname[32] = { 0, };
4017 size_t copied;
4018 /* XXX FBDP: better way to detect /Applications/ ? */
4019 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4020 (void)copyinstr(ndp->ni_dirp,
4021 pathname,
4022 sizeof(pathname),
4023 &copied);
4024 } else {
4025 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4026 pathname,
4027 sizeof(pathname),
4028 &copied);
4029 }
4030 pathname[sizeof(pathname) - 1] = '\0';
4031 if (strncmp(pathname,
4032 "/Applications/",
4033 strlen("/Applications/")) == 0 &&
4034 strncmp(pathname,
4035 "/Applications/Camera.app/",
4036 strlen("/Applications/Camera.app/")) != 0) {
4037 /*
4038 * not writable
4039 * AND from "/Applications/"
4040 * AND not from "/Applications/Camera.app/"
4041 * ==> eligible for secluded
4042 */
4043 memory_object_mark_eligible_for_secluded(moc,
4044 TRUE);
4045 }
4046 } else if (secluded_for_filecache == 2) {
4047 #if __arm64__
4048 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
4049 #elif __arm__
4050 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
4051 #else
4052 /* not implemented... */
4053 #endif
4054 size_t len = strlen(vp->v_name);
4055 if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
4056 !strncmp(vp->v_name, "dyld", len) ||
4057 !strncmp(vp->v_name, "launchd", len) ||
4058 !strncmp(vp->v_name, "Camera", len) ||
4059 !strncmp(vp->v_name, "mediaserverd", len) ||
4060 !strncmp(vp->v_name, "SpringBoard", len) ||
4061 !strncmp(vp->v_name, "backboardd", len)) {
4062 /*
4063 * This file matters when launching Camera:
4064 * do not store its contents in the secluded
4065 * pool that will be drained on Camera launch.
4066 */
4067 memory_object_mark_eligible_for_secluded(moc,
4068 FALSE);
4069 }
4070 }
4071 }
4072 #endif /* CONFIG_SECLUDED_MEMORY */
4073
4074 fp_drop(p, indx, fp, 1);
4075 proc_fdunlock(p);
4076
4077 *retval = indx;
4078
4079 return 0;
4080 bad:
4081 context = *vfs_context_current();
4082 context.vc_ucred = fp->f_fglob->fg_cred;
4083
4084 if ((fp->f_fglob->fg_flag & FHASLOCK) &&
4085 (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
4086 lf.l_whence = SEEK_SET;
4087 lf.l_start = 0;
4088 lf.l_len = 0;
4089 lf.l_type = F_UNLCK;
4090
4091 (void)VNOP_ADVLOCK(
4092 vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
4093 }
4094
4095 vn_close(vp, fp->f_fglob->fg_flag, &context);
4096 vnode_put(vp);
4097 fp_free(p, indx, fp);
4098
4099 return error;
4100 }
4101
4102 /*
4103 * While most of the *at syscall handlers can call nameiat() which
4104 * is a wrapper around namei, the use of namei and initialisation
4105 * of nameidata are far removed and in different functions - namei
4106 * gets called in vn_open_auth for open1. So we'll just do here what
4107 * nameiat() does.
4108 */
4109 static int
4110 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4111 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
4112 int dirfd)
4113 {
4114 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4115 int error;
4116 char c;
4117
4118 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4119 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4120 if (error) {
4121 return error;
4122 }
4123 } else {
4124 c = *((char *)(ndp->ni_dirp));
4125 }
4126
4127 if (c != '/') {
4128 vnode_t dvp_at;
4129
4130 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4131 &dvp_at);
4132 if (error) {
4133 return error;
4134 }
4135
4136 if (vnode_vtype(dvp_at) != VDIR) {
4137 vnode_put(dvp_at);
4138 return ENOTDIR;
4139 }
4140
4141 ndp->ni_dvp = dvp_at;
4142 ndp->ni_cnd.cn_flags |= USEDVP;
4143 error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
4144 retval);
4145 vnode_put(dvp_at);
4146 return error;
4147 }
4148 }
4149
4150 return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
4151 }
4152
4153 /*
4154 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4155 *
4156 * Parameters: p Process requesting the open
4157 * uap User argument descriptor (see below)
4158 * retval Pointer to an area to receive the
4159 * return value from the system call
4160 *
4161 * Indirect: uap->path Path to open (same as 'open')
4162 * uap->flags Flags to open (same as 'open')
4163 * uap->uid UID to set, if creating
4164 * uap->gid GID to set, if creating
4165 * uap->mode File mode, if creating (same as 'open')
4166 * uap->xsecurity ACL to set, if creating
4167 *
4168 * Returns: 0 Success
4169 * !0 errno value
4170 *
4171 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4172 *
4173 * XXX: We should enumerate the possible errno values here, and where
4174 * in the code they originated.
4175 */
4176 int
4177 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4178 {
4179 struct filedesc *fdp = p->p_fd;
4180 int ciferror;
4181 kauth_filesec_t xsecdst;
4182 struct vnode_attr va;
4183 struct nameidata nd;
4184 int cmode;
4185
4186 AUDIT_ARG(owner, uap->uid, uap->gid);
4187
4188 xsecdst = NULL;
4189 if ((uap->xsecurity != USER_ADDR_NULL) &&
4190 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4191 return ciferror;
4192 }
4193
4194 VATTR_INIT(&va);
4195 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4196 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4197 if (uap->uid != KAUTH_UID_NONE) {
4198 VATTR_SET(&va, va_uid, uap->uid);
4199 }
4200 if (uap->gid != KAUTH_GID_NONE) {
4201 VATTR_SET(&va, va_gid, uap->gid);
4202 }
4203 if (xsecdst != NULL) {
4204 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4205 }
4206
4207 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4208 uap->path, vfs_context_current());
4209
4210 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4211 fileproc_alloc_init, NULL, retval);
4212 if (xsecdst != NULL) {
4213 kauth_filesec_free(xsecdst);
4214 }
4215
4216 return ciferror;
4217 }
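/*
 * Illustrative only (not part of the build): a minimal user-space sketch of
 * the creation-mode masking used above, i.e.
 * cmode = ((mode & ~cmask) & ALLPERMS) & ~S_ISTXT.  The umask value 022 and
 * the requested mode 01666 are arbitrary example inputs.
 *
 *     #include <stdio.h>
 *     #include <sys/stat.h>
 *
 *     int
 *     main(void)
 *     {
 *             mode_t requested = 01666;      // sticky bit + rw for all
 *             mode_t cmask = 022;            // typical umask
 *             mode_t cmode = ((requested & ~cmask) & ALLPERMS) & ~S_ISTXT;
 *
 *             printf("0%o\n", cmode);        // prints 0644: umask applied,
 *                                            // sticky bit stripped
 *             return 0;
 *     }
 */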
4218
4219 /*
4220 * Go through the data-protected, atomically controlled open(2)
4221 *
4222 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4223 */
4224 int
4225 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4226 {
4227 int flags = uap->flags;
4228 int class = uap->class;
4229 int dpflags = uap->dpflags;
4230
4231 /*
4232 * Follow the same path as normal open(2)
4233 * Look up the item if it exists, and acquire the vnode.
4234 */
4235 struct filedesc *fdp = p->p_fd;
4236 struct vnode_attr va;
4237 struct nameidata nd;
4238 int cmode;
4239 int error;
4240
4241 VATTR_INIT(&va);
4242 /* Mask off all but regular access permissions */
4243 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4244 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4245
4246 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4247 uap->path, vfs_context_current());
4248
4249 /*
4250 * Initialize the extra fields in vnode_attr to pass down our
4251 * extra fields.
4252 * 1. target cprotect class.
4253 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4254 */
4255 if (flags & O_CREAT) {
4256 /* lower level kernel code validates that the class is valid before applying it. */
4257 if (class != PROTECTION_CLASS_DEFAULT) {
4258 /*
4259 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4260 * file behave the same as open(2)
4261 */
4262 VATTR_SET(&va, va_dataprotect_class, class);
4263 }
4264 }
4265
4266 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4267 if (flags & (O_RDWR | O_WRONLY)) {
4268 /* Not allowed to write raw encrypted bytes */
4269 return EINVAL;
4270 }
4271 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4272 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4273 }
4274 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4275 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4276 }
4277 }
4278
4279 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4280 fileproc_alloc_init, NULL, retval);
4281
4282 return error;
4283 }
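/*
 * Illustrative only: a hedged user-space sketch of the call documented
 * above, assuming a libc wrapper with the same argument order as the
 * prototype in the comment (path, flags, class, dpflags, mode) and that
 * O_DP_GETRAWENCRYPTED is visible to the caller's SDK.  Note the code above
 * rejects raw-encrypted access combined with write flags, so the example
 * opens read-only.
 *
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *
 *     int
 *     read_raw_encrypted(const char *path)
 *     {
 *             // the class argument is only applied on O_CREAT, not used here
 *             int fd = open_dprotected_np(path, O_RDONLY, 0,
 *                 O_DP_GETRAWENCRYPTED, 0);
 *             if (fd < 0)
 *                     return -1;
 *             // reads on fd now return the raw, still-encrypted bytes
 *             close(fd);
 *             return 0;
 *     }
 */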
4284
4285 static int
4286 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4287 int fd, enum uio_seg segflg, int *retval)
4288 {
4289 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4290 struct vnode_attr va;
4291 struct nameidata nd;
4292 int cmode;
4293
4294 VATTR_INIT(&va);
4295 /* Mask off all but regular access permissions */
4296 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4297 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4298
4299 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4300 segflg, path, ctx);
4301
4302 return open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
4303 retval, fd);
4304 }
4305
4306 int
4307 open(proc_t p, struct open_args *uap, int32_t *retval)
4308 {
4309 __pthread_testcancel(1);
4310 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4311 }
4312
4313 int
4314 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4315 int32_t *retval)
4316 {
4317 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4318 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4319 }
4320
4321 int
4322 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4323 int32_t *retval)
4324 {
4325 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4326 uap->mode, uap->fd, UIO_USERSPACE, retval);
4327 }
4328
4329 int
4330 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4331 {
4332 __pthread_testcancel(1);
4333 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4334 }
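/*
 * Illustrative only: a minimal user-space sketch of the openat() dirfd
 * semantics implemented by open1at() above -- a relative path is resolved
 * against dirfd, an absolute path ignores it, and AT_FDCWD falls back to
 * the current working directory.  "/tmp" and "notes.txt" are arbitrary
 * example names.
 *
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *
 *     int
 *     open_in_dir(void)
 *     {
 *             int dirfd = open("/tmp", O_RDONLY | O_DIRECTORY);
 *             if (dirfd < 0)
 *                     return -1;
 *
 *             // resolved as /tmp/notes.txt
 *             int fd = openat(dirfd, "notes.txt", O_RDWR | O_CREAT, 0644);
 *
 *             // absolute path: dirfd is ignored, same as open("/etc/hosts")
 *             int fd2 = openat(dirfd, "/etc/hosts", O_RDONLY);
 *
 *             int ok = (fd >= 0 && fd2 >= 0);
 *             if (fd >= 0)
 *                     close(fd);
 *             if (fd2 >= 0)
 *                     close(fd2);
 *             close(dirfd);
 *             return ok ? 0 : -1;
 *     }
 */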
4335
4336 /*
4337 * openbyid_np: open a file given a file system id and a file system object id
4338 * the HFS file system object id is an fsobj_id_t {uint32, uint32};
4339 * for file systems that don't support object ids it is a node id (uint64_t).
4340 *
4341 * Parameters: p Process requesting the open
4342 * uap User argument descriptor (see below)
4343 * retval Pointer to an area to receive the
4344 * return value from the system call
4345 *
4346 * Indirect: uap->fsid id of target file system
4347 * uap->objid id of target file system object
4348 * uap->oflags Flags to open (same as 'open')
4351 *
4352 * Returns: 0 Success
4353 * !0 errno value
4354 *
4355 *
4356 * XXX: We should enumerate the possible errno values here, and where
4357 * in the code they originated.
4358 */
4359 int
4360 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
4361 {
4362 fsid_t fsid;
4363 uint64_t objid;
4364 int error;
4365 char *buf = NULL;
4366 int buflen = MAXPATHLEN;
4367 int pathlen = 0;
4368 vfs_context_t ctx = vfs_context_current();
4369
4370 if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
4371 return error;
4372 }
4373
4374 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
4375 return error;
4376 }
4377
4378 /* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
4379 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
4380 return error;
4381 }
4382
4383 AUDIT_ARG(value32, fsid.val[0]);
4384 AUDIT_ARG(value64, objid);
4385
4386 /* resolve path from fsid, objid */
4387 do {
4388 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
4389 if (buf == NULL) {
4390 return ENOMEM;
4391 }
4392
4393 error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
4394 buf, FSOPT_ISREALFSID, &pathlen);
4395
4396 if (error) {
4397 FREE(buf, M_TEMP);
4398 buf = NULL;
4399 }
4400 } while (error == ENOSPC && (buflen += MAXPATHLEN));
4401
4402 if (error) {
4403 return error;
4404 }
4405
4406 buf[pathlen] = 0;
4407
4408 error = openat_internal(
4409 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
4410
4411 FREE(buf, M_TEMP);
4412
4413 return error;
4414 }
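/*
 * Illustrative only: a hedged user-space sketch, assuming a wrapper of the
 * form openbyid_np(fsid_t *, fsobj_id_t *, int) matching the uap fields
 * above (fsid, objid, oflags).  Here the fsid is taken from statfs() and
 * the object id from stat()'s st_ino; both are example sources, and a
 * caller may obtain them elsewhere (e.g. getattrlist()).
 *
 *     #include <sys/param.h>
 *     #include <sys/mount.h>
 *     #include <sys/stat.h>
 *     #include <sys/attr.h>
 *     #include <fcntl.h>
 *
 *     int
 *     reopen_by_id(const char *path)
 *     {
 *             struct statfs sfs;
 *             struct stat sb;
 *
 *             if (statfs(path, &sfs) != 0 || stat(path, &sb) != 0)
 *                     return -1;
 *
 *             fsobj_id_t obj = {
 *                     .fid_objno = (u_int32_t)sb.st_ino,
 *                     .fid_generation = 0,
 *             };
 *             return openbyid_np(&sfs.f_fsid, &obj, O_RDONLY);
 *     }
 */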
4415
4416
4417 /*
4418 * Create a special file.
4419 */
4420 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4421
4422 int
4423 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
4424 {
4425 struct vnode_attr va;
4426 vfs_context_t ctx = vfs_context_current();
4427 int error;
4428 struct nameidata nd;
4429 vnode_t vp, dvp;
4430
4431 VATTR_INIT(&va);
4432 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4433 VATTR_SET(&va, va_rdev, uap->dev);
4434
4435 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
4436 if ((uap->mode & S_IFMT) == S_IFIFO) {
4437 return mkfifo1(ctx, uap->path, &va);
4438 }
4439
4440 AUDIT_ARG(mode, uap->mode);
4441 AUDIT_ARG(value32, uap->dev);
4442
4443 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
4444 return error;
4445 }
4446 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
4447 UIO_USERSPACE, uap->path, ctx);
4448 error = namei(&nd);
4449 if (error) {
4450 return error;
4451 }
4452 dvp = nd.ni_dvp;
4453 vp = nd.ni_vp;
4454
4455 if (vp != NULL) {
4456 error = EEXIST;
4457 goto out;
4458 }
4459
4460 switch (uap->mode & S_IFMT) {
4461 case S_IFCHR:
4462 VATTR_SET(&va, va_type, VCHR);
4463 break;
4464 case S_IFBLK:
4465 VATTR_SET(&va, va_type, VBLK);
4466 break;
4467 default:
4468 error = EINVAL;
4469 goto out;
4470 }
4471
4472 #if CONFIG_MACF
4473 error = mac_vnode_check_create(ctx,
4474 nd.ni_dvp, &nd.ni_cnd, &va);
4475 if (error) {
4476 goto out;
4477 }
4478 #endif
4479
4480 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
4481 goto out;
4482 }
4483
4484 if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
4485 goto out;
4486 }
4487
4488 if (vp) {
4489 int update_flags = 0;
4490
4491 // Make sure the name & parent pointers are hooked up
4492 if (vp->v_name == NULL) {
4493 update_flags |= VNODE_UPDATE_NAME;
4494 }
4495 if (vp->v_parent == NULLVP) {
4496 update_flags |= VNODE_UPDATE_PARENT;
4497 }
4498
4499 if (update_flags) {
4500 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4501 }
4502
4503 #if CONFIG_FSE
4504 add_fsevent(FSE_CREATE_FILE, ctx,
4505 FSE_ARG_VNODE, vp,
4506 FSE_ARG_DONE);
4507 #endif
4508 }
4509
4510 out:
4511 /*
4512 * nameidone has to happen before we vnode_put(dvp)
4513 * since it may need to release the fs_nodelock on the dvp
4514 */
4515 nameidone(&nd);
4516
4517 if (vp) {
4518 vnode_put(vp);
4519 }
4520 vnode_put(dvp);
4521
4522 return error;
4523 }
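/*
 * Illustrative only: a minimal user-space sketch of mknod().  As in the
 * handler above, an S_IFIFO mode behaves like mkfifo(), while creating the
 * character device below requires superuser privileges; the paths and the
 * major/minor numbers are arbitrary examples.
 *
 *     #include <sys/types.h>
 *     #include <sys/stat.h>
 *
 *     int
 *     make_nodes(void)
 *     {
 *             // equivalent to mkfifo("/tmp/example_fifo", 0600)
 *             if (mknod("/tmp/example_fifo", S_IFIFO | 0600, 0) != 0)
 *                     return -1;
 *
 *             // character device node; fails with EPERM unless root
 *             return mknod("/tmp/example_chr", S_IFCHR | 0666,
 *                 makedev(3, 2));
 *     }
 */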
4524
4525 /*
4526 * Create a named pipe.
4527 *
4528 * Returns: 0 Success
4529 * EEXIST
4530 * namei:???
4531 * vnode_authorize:???
4532 * vn_create:???
4533 */
4534 static int
4535 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4536 {
4537 vnode_t vp, dvp;
4538 int error;
4539 struct nameidata nd;
4540
4541 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4542 UIO_USERSPACE, upath, ctx);
4543 error = namei(&nd);
4544 if (error) {
4545 return error;
4546 }
4547 dvp = nd.ni_dvp;
4548 vp = nd.ni_vp;
4549
4550 /* check that this is a new file and authorize addition */
4551 if (vp != NULL) {
4552 error = EEXIST;
4553 goto out;
4554 }
4555 VATTR_SET(vap, va_type, VFIFO);
4556
4557 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
4558 goto out;
4559 }
4560
4561 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4562 out:
4563 /*
4564 * nameidone has to happen before we vnode_put(dvp)
4565 * since it may need to release the fs_nodelock on the dvp
4566 */
4567 nameidone(&nd);
4568
4569 if (vp) {
4570 vnode_put(vp);
4571 }
4572 vnode_put(dvp);
4573
4574 return error;
4575 }
4576
4577
4578 /*
4579 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4580 *
4581 * Parameters: p Process requesting the open
4582 * uap User argument descriptor (see below)
4583 * retval (Ignored)
4584 *
4585 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4586 * uap->uid UID to set
4587 * uap->gid GID to set
4588 * uap->mode File mode to set (same as 'mkfifo')
4589 * uap->xsecurity ACL to set, if creating
4590 *
4591 * Returns: 0 Success
4592 * !0 errno value
4593 *
4594 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4595 *
4596 * XXX: We should enumerate the possible errno values here, and where
4597 * in the code they originated.
4598 */
4599 int
4600 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4601 {
4602 int ciferror;
4603 kauth_filesec_t xsecdst;
4604 struct vnode_attr va;
4605
4606 AUDIT_ARG(owner, uap->uid, uap->gid);
4607
4608 xsecdst = KAUTH_FILESEC_NONE;
4609 if (uap->xsecurity != USER_ADDR_NULL) {
4610 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4611 return ciferror;
4612 }
4613 }
4614
4615 VATTR_INIT(&va);
4616 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4617 if (uap->uid != KAUTH_UID_NONE) {
4618 VATTR_SET(&va, va_uid, uap->uid);
4619 }
4620 if (uap->gid != KAUTH_GID_NONE) {
4621 VATTR_SET(&va, va_gid, uap->gid);
4622 }
4623 if (xsecdst != KAUTH_FILESEC_NONE) {
4624 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4625 }
4626
4627 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4628
4629 if (xsecdst != KAUTH_FILESEC_NONE) {
4630 kauth_filesec_free(xsecdst);
4631 }
4632 return ciferror;
4633 }
4634
4635 /* ARGSUSED */
4636 int
4637 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4638 {
4639 struct vnode_attr va;
4640
4641 VATTR_INIT(&va);
4642 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4643
4644 return mkfifo1(vfs_context_current(), uap->path, &va);
4645 }
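/*
 * Illustrative only: a short user-space sketch creating and using a named
 * pipe via mkfifo(); the path and message are arbitrary examples.  Opening
 * the read end with O_NONBLOCK avoids blocking until a writer appears.
 *
 *     #include <sys/stat.h>
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *
 *     int
 *     fifo_roundtrip(void)
 *     {
 *             const char *path = "/tmp/example_fifo2";
 *             char buf[6] = { 0 };
 *
 *             if (mkfifo(path, 0600) != 0)
 *                     return -1;
 *
 *             int rfd = open(path, O_RDONLY | O_NONBLOCK);
 *             int wfd = open(path, O_WRONLY);
 *             (void)write(wfd, "hello", 5);
 *             (void)read(rfd, buf, 5);
 *
 *             close(wfd);
 *             close(rfd);
 *             return unlink(path);
 *     }
 */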
4646
4647
4648 static char *
4649 my_strrchr(char *p, int ch)
4650 {
4651 char *save;
4652
4653 for (save = NULL;; ++p) {
4654 if (*p == ch) {
4655 save = p;
4656 }
4657 if (!*p) {
4658 return save;
4659 }
4660 }
4661 /* NOTREACHED */
4662 }
4663
4664 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4665 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4666 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4667
4668 int
4669 safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
4670 {
4671 int ret, len = _len;
4672
4673 *truncated_path = 0;
4674
4675 if (firmlink) {
4676 ret = vn_getpath(dvp, path, &len);
4677 } else {
4678 ret = vn_getpath_no_firmlink(dvp, path, &len);
4679 }
4680 if (ret == 0 && len < (MAXPATHLEN - 1)) {
4681 if (leafname) {
4682 path[len - 1] = '/';
4683 len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
4684 if (len > MAXPATHLEN) {
4685 char *ptr;
4686
4687 // the string got truncated!
4688 *truncated_path = 1;
4689 ptr = my_strrchr(path, '/');
4690 if (ptr) {
4691 *ptr = '\0'; // chop off the string at the last directory component
4692 }
4693 len = strlen(path) + 1;
4694 }
4695 }
4696 } else if (ret == 0) {
4697 *truncated_path = 1;
4698 } else if (ret != 0) {
4699 struct vnode *mydvp = dvp;
4700
4701 if (ret != ENOSPC) {
4702 printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4703 dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4704 }
4705 *truncated_path = 1;
4706
4707 do {
4708 if (mydvp->v_parent != NULL) {
4709 mydvp = mydvp->v_parent;
4710 } else if (mydvp->v_mount) {
4711 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4712 break;
4713 } else {
4714 // no parent and no mount point? only thing is to punt and say "/" changed
4715 strlcpy(path, "/", _len);
4716 len = 2;
4717 mydvp = NULL;
4718 }
4719
4720 if (mydvp == NULL) {
4721 break;
4722 }
4723
4724 len = _len;
4725 if (firmlink) {
4726 ret = vn_getpath(mydvp, path, &len);
4727 } else {
4728 ret = vn_getpath_no_firmlink(mydvp, path, &len);
4729 }
4730 } while (ret == ENOSPC);
4731 }
4732
4733 return len;
4734 }
4735
4736 int
4737 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4738 {
4739 return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
4740 }
4741
4742 int
4743 safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4744 {
4745 return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
4746 }
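/*
 * Illustrative only: a condensed sketch of how callers in this file use
 * safe_getpath() (see linkat_internal() and unlinkat_internal() below) --
 * grab a scratch path buffer, build "<dvp path>/<leafname>", and note
 * whether truncation occurred so the fsevent can be flagged.
 *
 *     char *path = NULL;
 *     int len, truncated = 0;
 *
 *     GET_PATH(path);
 *     if (path == NULL)
 *             return ENOMEM;
 *     len = safe_getpath(dvp, cnp->cn_nameptr, path, MAXPATHLEN, &truncated);
 *     // ... use (path, len); set FSE_TRUNCATED_PATH if truncated ...
 *     RELEASE_PATH(path);
 */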
4747
4748 /*
4749 * Make a hard file link.
4750 *
4751 * Returns: 0 Success
4752 * EPERM
4753 * EEXIST
4754 * EXDEV
4755 * namei:???
4756 * vnode_authorize:???
4757 * VNOP_LINK:???
4758 */
4759 /* ARGSUSED */
4760 static int
4761 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4762 user_addr_t link, int flag, enum uio_seg segflg)
4763 {
4764 vnode_t vp, pvp, dvp, lvp;
4765 struct nameidata nd;
4766 int follow;
4767 int error;
4768 #if CONFIG_FSE
4769 fse_info finfo;
4770 #endif
4771 int need_event, has_listeners, need_kpath2;
4772 char *target_path = NULL;
4773 int truncated = 0;
4774
4775 vp = dvp = lvp = NULLVP;
4776
4777 /* look up the object we are linking to */
4778 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4779 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4780 segflg, path, ctx);
4781
4782 error = nameiat(&nd, fd1);
4783 if (error) {
4784 return error;
4785 }
4786 vp = nd.ni_vp;
4787
4788 nameidone(&nd);
4789
4790 /*
4791 * Normally, linking to directories is not supported.
4792 * However, some file systems may have limited support.
4793 */
4794 if (vp->v_type == VDIR) {
4795 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
4796 error = EPERM; /* POSIX */
4797 goto out;
4798 }
4799
4800 /* Linking to a directory requires ownership. */
4801 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4802 struct vnode_attr dva;
4803
4804 VATTR_INIT(&dva);
4805 VATTR_WANTED(&dva, va_uid);
4806 if (vnode_getattr(vp, &dva, ctx) != 0 ||
4807 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4808 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4809 error = EACCES;
4810 goto out;
4811 }
4812 }
4813 }
4814
4815 /* lookup the target node */
4816 #if CONFIG_TRIGGERS
4817 nd.ni_op = OP_LINK;
4818 #endif
4819 nd.ni_cnd.cn_nameiop = CREATE;
4820 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4821 nd.ni_dirp = link;
4822 error = nameiat(&nd, fd2);
4823 if (error != 0) {
4824 goto out;
4825 }
4826 dvp = nd.ni_dvp;
4827 lvp = nd.ni_vp;
4828
4829 #if CONFIG_MACF
4830 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
4831 goto out2;
4832 }
4833 #endif
4834
4835 /* or to anything that kauth doesn't want us to (eg. immutable items) */
4836 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
4837 goto out2;
4838 }
4839
4840 /* target node must not exist */
4841 if (lvp != NULLVP) {
4842 error = EEXIST;
4843 goto out2;
4844 }
4845 /* cannot link across mountpoints */
4846 if (vnode_mount(vp) != vnode_mount(dvp)) {
4847 error = EXDEV;
4848 goto out2;
4849 }
4850
4851 /* authorize creation of the target node */
4852 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
4853 goto out2;
4854 }
4855
4856 /* and finally make the link */
4857 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4858 if (error) {
4859 goto out2;
4860 }
4861
4862 #if CONFIG_MACF
4863 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4864 #endif
4865
4866 #if CONFIG_FSE
4867 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4868 #else
4869 need_event = 0;
4870 #endif
4871 has_listeners = kauth_authorize_fileop_has_listeners();
4872
4873 need_kpath2 = 0;
4874 #if CONFIG_AUDIT
4875 if (AUDIT_RECORD_EXISTS()) {
4876 need_kpath2 = 1;
4877 }
4878 #endif
4879
4880 if (need_event || has_listeners || need_kpath2) {
4881 char *link_to_path = NULL;
4882 int len, link_name_len;
4883
4884 /* build the path to the new link file */
4885 GET_PATH(target_path);
4886 if (target_path == NULL) {
4887 error = ENOMEM;
4888 goto out2;
4889 }
4890
4891 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4892
4893 AUDIT_ARG(kpath, target_path, ARG_KPATH2);
4894
4895 if (has_listeners) {
4896 /* build the path to file we are linking to */
4897 GET_PATH(link_to_path);
4898 if (link_to_path == NULL) {
4899 error = ENOMEM;
4900 goto out2;
4901 }
4902
4903 link_name_len = MAXPATHLEN;
4904 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4905 /*
4906 * Call out to allow 3rd party notification of the link.
4907 * Ignore result of kauth_authorize_fileop call.
4908 */
4909 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4910 (uintptr_t)link_to_path,
4911 (uintptr_t)target_path);
4912 }
4913 if (link_to_path != NULL) {
4914 RELEASE_PATH(link_to_path);
4915 }
4916 }
4917 #if CONFIG_FSE
4918 if (need_event) {
4919 /* construct fsevent */
4920 if (get_fse_info(vp, &finfo, ctx) == 0) {
4921 if (truncated) {
4922 finfo.mode |= FSE_TRUNCATED_PATH;
4923 }
4924
4925 // build the path to the destination of the link
4926 add_fsevent(FSE_CREATE_FILE, ctx,
4927 FSE_ARG_STRING, len, target_path,
4928 FSE_ARG_FINFO, &finfo,
4929 FSE_ARG_DONE);
4930 }
4931
4932 pvp = vp->v_parent;
4933 // need an iocount on pvp in this case
4934 if (pvp && pvp != dvp) {
4935 error = vnode_get(pvp);
4936 if (error) {
4937 pvp = NULLVP;
4938 error = 0;
4939 }
4940 }
4941 if (pvp) {
4942 add_fsevent(FSE_STAT_CHANGED, ctx,
4943 FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
4944 }
4945 if (pvp && pvp != dvp) {
4946 vnode_put(pvp);
4947 }
4948 }
4949 #endif
4950 }
4951 out2:
4952 /*
4953 * nameidone has to happen before we vnode_put(dvp)
4954 * since it may need to release the fs_nodelock on the dvp
4955 */
4956 nameidone(&nd);
4957 if (target_path != NULL) {
4958 RELEASE_PATH(target_path);
4959 }
4960 out:
4961 if (lvp) {
4962 vnode_put(lvp);
4963 }
4964 if (dvp) {
4965 vnode_put(dvp);
4966 }
4967 vnode_put(vp);
4968 return error;
4969 }
4970
4971 int
4972 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4973 {
4974 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4975 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
4976 }
4977
4978 int
4979 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4980 {
4981 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
4982 return EINVAL;
4983 }
4984
4985 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4986 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
4987 }
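/*
 * Illustrative only: a minimal user-space sketch of link()/linkat().  As
 * enforced above, AT_SYMLINK_FOLLOW is the only flag linkat() accepts, and
 * passing it makes a symlink source resolve to its target before the hard
 * link is created.  Paths are arbitrary examples.
 *
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *
 *     int
 *     make_links(void)
 *     {
 *             // plain hard link: /tmp/a and /tmp/b name the same file
 *             if (link("/tmp/a", "/tmp/b") != 0)
 *                     return -1;
 *
 *             // if /tmp/a is a symlink, hard-link its target as /tmp/c
 *             return linkat(AT_FDCWD, "/tmp/a", AT_FDCWD, "/tmp/c",
 *                 AT_SYMLINK_FOLLOW);
 *     }
 */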
4988
4989 /*
4990 * Make a symbolic link.
4991 *
4992 * We could add support for ACLs here too...
4993 */
4994 /* ARGSUSED */
4995 static int
4996 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4997 user_addr_t link, enum uio_seg segflg)
4998 {
4999 struct vnode_attr va;
5000 char *path;
5001 int error;
5002 struct nameidata nd;
5003 vnode_t vp, dvp;
5004 size_t dummy = 0;
5005 proc_t p;
5006
5007 error = 0;
5008 if (UIO_SEG_IS_USER_SPACE(segflg)) {
5009 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
5010 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
5011 } else {
5012 path = (char *)path_data;
5013 }
5014 if (error) {
5015 goto out;
5016 }
5017 AUDIT_ARG(text, path); /* This is the link string */
5018
5019 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
5020 segflg, link, ctx);
5021
5022 error = nameiat(&nd, fd);
5023 if (error) {
5024 goto out;
5025 }
5026 dvp = nd.ni_dvp;
5027 vp = nd.ni_vp;
5028
5029 p = vfs_context_proc(ctx);
5030 VATTR_INIT(&va);
5031 VATTR_SET(&va, va_type, VLNK);
5032 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
5033
5034 #if CONFIG_MACF
5035 error = mac_vnode_check_create(ctx,
5036 dvp, &nd.ni_cnd, &va);
5037 #endif
5038 if (error != 0) {
5039 goto skipit;
5040 }
5041
5042 if (vp != NULL) {
5043 error = EEXIST;
5044 goto skipit;
5045 }
5046
5047 /* authorize */
5048 if (error == 0) {
5049 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
5050 }
5051 /* get default ownership, etc. */
5052 if (error == 0) {
5053 error = vnode_authattr_new(dvp, &va, 0, ctx);
5054 }
5055 if (error == 0) {
5056 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
5057 }
5058
5059 #if CONFIG_MACF
5060 if (error == 0 && vp) {
5061 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
5062 }
5063 #endif
5064
5065 /* do fallback attribute handling */
5066 if (error == 0 && vp) {
5067 error = vnode_setattr_fallback(vp, &va, ctx);
5068 }
5069
5070 if (error == 0) {
5071 int update_flags = 0;
5072
5073 /*check if a new vnode was created, else try to get one*/
5074 if (vp == NULL) {
5075 nd.ni_cnd.cn_nameiop = LOOKUP;
5076 #if CONFIG_TRIGGERS
5077 nd.ni_op = OP_LOOKUP;
5078 #endif
5079 nd.ni_cnd.cn_flags = 0;
5080 error = nameiat(&nd, fd);
5081 vp = nd.ni_vp;
5082
5083 if (vp == NULL) {
5084 goto skipit;
5085 }
5086 }
5087
5088 #if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
5089 /* call out to allow 3rd party notification of rename.
5090 * Ignore result of kauth_authorize_fileop call.
5091 */
5092 if (kauth_authorize_fileop_has_listeners() &&
5093 namei(&nd) == 0) {
5094 char *new_link_path = NULL;
5095 int len;
5096
5097 /* build the path to the new link file */
5098 new_link_path = get_pathbuff();
5099 len = MAXPATHLEN;
5100 vn_getpath(dvp, new_link_path, &len);
5101 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
5102 new_link_path[len - 1] = '/';
5103 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
5104 }
5105
5106 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
5107 (uintptr_t)path, (uintptr_t)new_link_path);
5108 if (new_link_path != NULL) {
5109 release_pathbuff(new_link_path);
5110 }
5111 }
5112 #endif
5113 // Make sure the name & parent pointers are hooked up
5114 if (vp->v_name == NULL) {
5115 update_flags |= VNODE_UPDATE_NAME;
5116 }
5117 if (vp->v_parent == NULLVP) {
5118 update_flags |= VNODE_UPDATE_PARENT;
5119 }
5120
5121 if (update_flags) {
5122 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
5123 }
5124
5125 #if CONFIG_FSE
5126 add_fsevent(FSE_CREATE_FILE, ctx,
5127 FSE_ARG_VNODE, vp,
5128 FSE_ARG_DONE);
5129 #endif
5130 }
5131
5132 skipit:
5133 /*
5134 * nameidone has to happen before we vnode_put(dvp)
5135 * since it may need to release the fs_nodelock on the dvp
5136 */
5137 nameidone(&nd);
5138
5139 if (vp) {
5140 vnode_put(vp);
5141 }
5142 vnode_put(dvp);
5143 out:
5144 if (path && (path != (char *)path_data)) {
5145 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
5146 }
5147
5148 return error;
5149 }
5150
5151 int
5152 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5153 {
5154 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5155 uap->link, UIO_USERSPACE);
5156 }
5157
5158 int
5159 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5160 __unused int32_t *retval)
5161 {
5162 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5163 uap->path2, UIO_USERSPACE);
5164 }
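/*
 * Illustrative only: a minimal user-space sketch of symlink()/symlinkat().
 * The link contents are stored verbatim and, as in symlinkat_internal()
 * above, the target need not exist when the link is created.  Paths are
 * arbitrary examples.
 *
 *     #include <unistd.h>
 *
 *     int
 *     make_symlinks(int dirfd)
 *     {
 *             // link created in the cwd, pointing at an absolute path
 *             if (symlink("/tmp/target.txt", "alias-abs") != 0)
 *                     return -1;
 *
 *             // relative link created inside the directory open at dirfd
 *             return symlinkat("target.txt", dirfd, "alias-rel");
 *     }
 */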
5165
5166 /*
5167 * Delete a whiteout from the filesystem.
5168 * No longer supported.
5169 */
5170 int
5171 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5172 {
5173 return ENOTSUP;
5174 }
5175
5176 /*
5177 * Delete a name from the filesystem.
5178 */
5179 /* ARGSUSED */
5180 static int
5181 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5182 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5183 {
5184 struct nameidata nd;
5185 vnode_t vp, dvp;
5186 int error;
5187 struct componentname *cnp;
5188 char *path = NULL;
5189 char *no_firmlink_path = NULL;
5190 int len_path = 0;
5191 int len_no_firmlink_path = 0;
5192 #if CONFIG_FSE
5193 fse_info finfo;
5194 struct vnode_attr va;
5195 #endif
5196 int flags;
5197 int need_event;
5198 int has_listeners;
5199 int truncated_path;
5200 int truncated_no_firmlink_path;
5201 int batched;
5202 struct vnode_attr *vap;
5203 int do_retry;
5204 int retry_count = 0;
5205 int cn_flags;
5206
5207 cn_flags = LOCKPARENT;
5208 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
5209 cn_flags |= AUDITVNPATH1;
5210 }
5211 /* If a starting dvp is passed, it trumps any fd passed. */
5212 if (start_dvp) {
5213 cn_flags |= USEDVP;
5214 }
5215
5216 #if NAMEDRSRCFORK
5217 /* unlink or delete is allowed on rsrc forks and named streams */
5218 cn_flags |= CN_ALLOWRSRCFORK;
5219 #endif
5220
5221 retry:
5222 do_retry = 0;
5223 flags = 0;
5224 need_event = 0;
5225 has_listeners = 0;
5226 truncated_path = 0;
5227 truncated_no_firmlink_path = 0;
5228 vap = NULL;
5229
5230 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
5231
5232 nd.ni_dvp = start_dvp;
5233 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
5234 cnp = &nd.ni_cnd;
5235
5236 continue_lookup:
5237 error = nameiat(&nd, fd);
5238 if (error) {
5239 return error;
5240 }
5241
5242 dvp = nd.ni_dvp;
5243 vp = nd.ni_vp;
5244
5245
5246 /* With Carbon delete semantics, busy files cannot be deleted */
5247 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
5248 flags |= VNODE_REMOVE_NODELETEBUSY;
5249 }
5250
5251 /* Skip any potential upcalls if told to. */
5252 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
5253 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
5254 }
5255
5256 if (vp) {
5257 batched = vnode_compound_remove_available(vp);
5258 /*
5259 * The root of a mounted filesystem cannot be deleted.
5260 */
5261 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
5262 error = EBUSY;
5263 goto out;
5264 }
5265
5266 #if DEVELOPMENT || DEBUG
5267 /*
5268 * XXX VSWAP: Check for entitlements or special flag here
5269 * so we can restrict access appropriately.
5270 */
5271 #else /* DEVELOPMENT || DEBUG */
5272
5273 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
5274 error = EPERM;
5275 goto out;
5276 }
5277 #endif /* DEVELOPMENT || DEBUG */
5278
5279 if (!batched) {
5280 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
5281 if (error) {
5282 if (error == ENOENT) {
5283 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5284 do_retry = 1;
5285 retry_count++;
5286 }
5287 }
5288 goto out;
5289 }
5290 }
5291 } else {
5292 batched = 1;
5293
5294 if (!vnode_compound_remove_available(dvp)) {
5295 panic("No vp, but no compound remove?");
5296 }
5297 }
5298
5299 #if CONFIG_FSE
5300 need_event = need_fsevent(FSE_DELETE, dvp);
5301 if (need_event) {
5302 if (!batched) {
5303 if ((vp->v_flag & VISHARDLINK) == 0) {
5304 /* XXX need to get these data in batched VNOP */
5305 get_fse_info(vp, &finfo, ctx);
5306 }
5307 } else {
5308 error = vfs_get_notify_attributes(&va);
5309 if (error) {
5310 goto out;
5311 }
5312
5313 vap = &va;
5314 }
5315 }
5316 #endif
5317 has_listeners = kauth_authorize_fileop_has_listeners();
5318 if (need_event || has_listeners) {
5319 if (path == NULL) {
5320 GET_PATH(path);
5321 if (path == NULL) {
5322 error = ENOMEM;
5323 goto out;
5324 }
5325 }
5326 len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
5327 if (no_firmlink_path == NULL) {
5328 GET_PATH(no_firmlink_path);
5329 if (no_firmlink_path == NULL) {
5330 error = ENOMEM;
5331 goto out;
5332 }
5333 }
5334 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5335 }
5336
5337 #if NAMEDRSRCFORK
5338 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
5339 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
5340 } else
5341 #endif
5342 {
5343 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
5344 vp = nd.ni_vp;
5345 if (error == EKEEPLOOKING) {
5346 if (!batched) {
5347 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
5348 }
5349
5350 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
5351 panic("EKEEPLOOKING, but continue flag not set?");
5352 }
5353
5354 if (vnode_isdir(vp)) {
5355 error = EISDIR;
5356 goto out;
5357 }
5358 goto continue_lookup;
5359 } else if (error == ENOENT && batched) {
5360 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5361 /*
5362 * For compound VNOPs, the authorization callback may
5363 * return ENOENT in case of racing hardlink lookups
5364 * hitting the name cache, redrive the lookup.
5365 */
5366 do_retry = 1;
5367 retry_count += 1;
5368 goto out;
5369 }
5370 }
5371 }
5372
5373 /*
5374 * Call out to allow 3rd party notification of delete.
5375 * Ignore result of kauth_authorize_fileop call.
5376 */
5377 if (!error) {
5378 if (has_listeners) {
5379 kauth_authorize_fileop(vfs_context_ucred(ctx),
5380 KAUTH_FILEOP_DELETE,
5381 (uintptr_t)vp,
5382 (uintptr_t)path);
5383 }
5384
5385 if (vp->v_flag & VISHARDLINK) {
5386 //
5387 // if a hardlink gets deleted we want to blow away the
5388 // v_parent link because the path that got us to this
5389 // instance of the link is no longer valid. this will
5390 // force the next call to get the path to ask the file
5391 // system instead of just following the v_parent link.
5392 //
5393 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
5394 }
5395
5396 #if CONFIG_FSE
5397 if (need_event) {
5398 if (vp->v_flag & VISHARDLINK) {
5399 get_fse_info(vp, &finfo, ctx);
5400 } else if (vap) {
5401 vnode_get_fse_info_from_vap(vp, &finfo, vap);
5402 }
5403 if (truncated_path) {
5404 finfo.mode |= FSE_TRUNCATED_PATH;
5405 }
5406 add_fsevent(FSE_DELETE, ctx,
5407 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5408 FSE_ARG_FINFO, &finfo,
5409 FSE_ARG_DONE);
5410 }
5411 #endif
5412 }
5413
5414 out:
5415 if (path != NULL) {
5416 RELEASE_PATH(path);
5417 path = NULL;
5418 }
5419
5420 if (no_firmlink_path != NULL) {
5421 RELEASE_PATH(no_firmlink_path);
5422 no_firmlink_path = NULL;
5423 }
5424 #if NAMEDRSRCFORK
5425 /* recycle the deleted rsrc fork vnode to force a reclaim, which
5426 * will cause its shadow file to go away if necessary.
5427 */
5428 if (vp && (vnode_isnamedstream(vp)) &&
5429 (vp->v_parent != NULLVP) &&
5430 vnode_isshadow(vp)) {
5431 vnode_recycle(vp);
5432 }
5433 #endif
5434 /*
5435 * nameidone has to happen before we vnode_put(dvp)
5436 * since it may need to release the fs_nodelock on the dvp
5437 */
5438 nameidone(&nd);
5439 vnode_put(dvp);
5440 if (vp) {
5441 vnode_put(vp);
5442 }
5443
5444 if (do_retry) {
5445 goto retry;
5446 }
5447
5448 return error;
5449 }
5450
5451 int
5452 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5453 enum uio_seg segflg, int unlink_flags)
5454 {
5455 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5456 unlink_flags);
5457 }
5458
5459 /*
5460 * Delete a name from the filesystem using Carbon semantics.
5461 */
5462 int
5463 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5464 {
5465 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5466 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5467 }
5468
5469 /*
5470 * Delete a name from the filesystem using POSIX semantics.
5471 */
5472 int
5473 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5474 {
5475 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5476 uap->path, UIO_USERSPACE, 0);
5477 }
5478
5479 int
5480 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5481 {
5482 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5483 return EINVAL;
5484 }
5485
5486 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5487 int unlink_flags = 0;
5488
5489 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5490 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5491 }
5492 return rmdirat_internal(vfs_context_current(), uap->fd,
5493 uap->path, UIO_USERSPACE, unlink_flags);
5494 } else {
5495 return unlinkat_internal(vfs_context_current(), uap->fd,
5496 NULLVP, uap->path, UIO_USERSPACE, 0);
5497 }
5498 }
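/*
 * Illustrative only: a minimal user-space sketch of unlinkat().  As in the
 * handler above, AT_REMOVEDIR routes the call to the rmdir path, while no
 * flags gives plain unlink semantics.  Names are arbitrary examples.
 *
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *
 *     int
 *     remove_both(int dirfd)
 *     {
 *             // remove a regular file relative to dirfd
 *             if (unlinkat(dirfd, "old.log", 0) != 0)
 *                     return -1;
 *
 *             // remove an (empty) directory relative to dirfd
 *             return unlinkat(dirfd, "old-dir", AT_REMOVEDIR);
 *     }
 */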
5499
5500 /*
5501 * Reposition read/write file offset.
5502 */
5503 int
5504 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
5505 {
5506 struct fileproc *fp;
5507 vnode_t vp;
5508 struct vfs_context *ctx;
5509 off_t offset = uap->offset, file_size;
5510 int error;
5511
5512 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
5513 if (error == ENOTSUP) {
5514 return ESPIPE;
5515 }
5516 return error;
5517 }
5518 if (vnode_isfifo(vp)) {
5519 file_drop(uap->fd);
5520 return ESPIPE;
5521 }
5522
5523
5524 ctx = vfs_context_current();
5525 #if CONFIG_MACF
5526 if (uap->whence == L_INCR && uap->offset == 0) {
5527 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
5528 fp->f_fglob);
5529 } else {
5530 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
5531 fp->f_fglob);
5532 }
5533 if (error) {
5534 file_drop(uap->fd);
5535 return error;
5536 }
5537 #endif
5538 if ((error = vnode_getwithref(vp))) {
5539 file_drop(uap->fd);
5540 return error;
5541 }
5542
5543 switch (uap->whence) {
5544 case L_INCR:
5545 offset += fp->f_fglob->fg_offset;
5546 break;
5547 case L_XTND:
5548 if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
5549 break;
5550 }
5551 offset += file_size;
5552 break;
5553 case L_SET:
5554 break;
5555 case SEEK_HOLE:
5556 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5557 break;
5558 case SEEK_DATA:
5559 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5560 break;
5561 default:
5562 error = EINVAL;
5563 }
5564 if (error == 0) {
5565 if (uap->offset > 0 && offset < 0) {
5566 /* Incremented/relative move past max size */
5567 error = EOVERFLOW;
5568 } else {
5569 /*
5570 * Allow negative offsets on character devices, per
5571 * POSIX 1003.1-2001. Most likely for writing disk
5572 * labels.
5573 */
5574 if (offset < 0 && vp->v_type != VCHR) {
5575 /* Decremented/relative move before start */
5576 error = EINVAL;
5577 } else {
5578 /* Success */
5579 fp->f_fglob->fg_offset = offset;
5580 *retval = fp->f_fglob->fg_offset;
5581 }
5582 }
5583 }
5584
5585 /*
5586 * An lseek can affect whether data is "available to read." Use
5587 * hint of NOTE_NONE so no EVFILT_VNODE events fire.
5588 */
5589 post_event_if_success(vp, error, NOTE_NONE);
5590 (void)vnode_put(vp);
5591 file_drop(uap->fd);
5592 return error;
5593 }
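/*
 * Illustrative only: a user-space sketch of the whence values handled
 * above, assuming SEEK_HOLE and SEEK_DATA are exposed by the SDK headers.
 * It finds the first data region of a (possibly sparse) file open at fd.
 *
 *     #include <sys/types.h>
 *     #include <unistd.h>
 *
 *     int
 *     first_data_region(int fd, off_t *start, off_t *end)
 *     {
 *             off_t data = lseek(fd, 0, SEEK_DATA);    // first byte of data
 *             if (data < 0)
 *                     return -1;                       // e.g. entirely a hole
 *             off_t hole = lseek(fd, data, SEEK_HOLE); // hole after that data
 *             if (hole < 0)
 *                     return -1;
 *             *start = data;
 *             *end = hole;
 *             return 0;
 *     }
 */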
5594
5595
5596 /*
5597 * Check access permissions.
5598 *
5599 * Returns: 0 Success
5600 * vnode_authorize:???
5601 */
5602 static int
5603 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5604 {
5605 kauth_action_t action;
5606 int error;
5607
5608 /*
5609 * If just the regular access bits, convert them to something
5610 * that vnode_authorize will understand.
5611 */
5612 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5613 action = 0;
5614 if (uflags & R_OK) {
5615 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5616 }
5617 if (uflags & W_OK) {
5618 if (vnode_isdir(vp)) {
5619 action |= KAUTH_VNODE_ADD_FILE |
5620 KAUTH_VNODE_ADD_SUBDIRECTORY;
5621 /* might want delete rights here too */
5622 } else {
5623 action |= KAUTH_VNODE_WRITE_DATA;
5624 }
5625 }
5626 if (uflags & X_OK) {
5627 if (vnode_isdir(vp)) {
5628 action |= KAUTH_VNODE_SEARCH;
5629 } else {
5630 action |= KAUTH_VNODE_EXECUTE;
5631 }
5632 }
5633 } else {
5634 /* take advantage of definition of uflags */
5635 action = uflags >> 8;
5636 }
5637
5638 #if CONFIG_MACF
5639 error = mac_vnode_check_access(ctx, vp, uflags);
5640 if (error) {
5641 return error;
5642 }
5643 #endif /* MAC */
5644
5645 /* action == 0 means only check for existence */
5646 if (action != 0) {
5647 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5648 } else {
5649 error = 0;
5650 }
5651
5652 return error;
5653 }
5654
5655
5656
5657 /*
5658 * access_extended: Check access permissions in bulk.
5659 *
5660 * Description: uap->entries Pointer to an array of accessx
5661 * descriptor structs, plus one or
5662 * more NULL terminated strings (see
5663 * "Notes" section below).
5664 * uap->size Size of the area pointed to by
5665 * uap->entries.
5666 * uap->results Pointer to the results array.
5667 *
5668 * Returns: 0 Success
5669 * ENOMEM Insufficient memory
5670 * EINVAL Invalid arguments
5671 * namei:EFAULT Bad address
5672 * namei:ENAMETOOLONG Filename too long
5673 * namei:ENOENT No such file or directory
5674 * namei:ELOOP Too many levels of symbolic links
5675 * namei:EBADF Bad file descriptor
5676 * namei:ENOTDIR Not a directory
5677 * namei:???
5678 * access1:
5679 *
5680 * Implicit returns:
5681 * uap->results Array contents modified
5682 *
5683 * Notes: The uap->entries are structured as an arbitrary length array
5684 * of accessx descriptors, followed by one or more NULL terminated
5685 * strings
5686 *
5687 * struct accessx_descriptor[0]
5688 * ...
5689 * struct accessx_descriptor[n]
5690 * char name_data[0];
5691 *
5692 * We determine the entry count by walking the buffer containing
5693 * the uap->entries argument descriptor. For each descriptor we
5694 * see, the valid values for the offset ad_name_offset will be
5695 * in the byte range:
5696 *
5697 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5698 * to
5699 * [ uap->entries + uap->size - 2 ]
5700 *
5701 * since we must have at least one string, and the string must
5702 * be at least one character plus the NULL terminator in length.
5703 *
5704 * XXX: Need to support the check-as uid argument
5705 */
5706 int
5707 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5708 {
5709 struct accessx_descriptor *input = NULL;
5710 errno_t *result = NULL;
5711 errno_t error = 0;
5712 int wantdelete = 0;
5713 unsigned int desc_max, desc_actual, i, j;
5714 struct vfs_context context;
5715 struct nameidata nd;
5716 int niopts;
5717 vnode_t vp = NULL;
5718 vnode_t dvp = NULL;
5719 #define ACCESSX_MAX_DESCR_ON_STACK 10
5720 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5721
5722 context.vc_ucred = NULL;
5723
5724 /*
5725 * Validate parameters; if valid, copy the descriptor array and string
5726 * arguments into local memory. Before proceeding, the following
5727 * conditions must have been met:
5728 *
5729 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5730 * o There must be sufficient room in the request for at least one
5731 * descriptor and a one byte NUL terminated string.
5732 * o The allocation of local storage must not fail.
5733 */
5734 if (uap->size > ACCESSX_MAX_TABLESIZE) {
5735 return ENOMEM;
5736 }
5737 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
5738 return EINVAL;
5739 }
5740 if (uap->size <= sizeof(stack_input)) {
5741 input = stack_input;
5742 } else {
5743 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5744 if (input == NULL) {
5745 error = ENOMEM;
5746 goto out;
5747 }
5748 }
5749 error = copyin(uap->entries, input, uap->size);
5750 if (error) {
5751 goto out;
5752 }
5753
5754 AUDIT_ARG(opaque, input, uap->size);
5755
5756 /*
5757 * Force NUL termination of the copyin buffer to avoid namei() running
5758 * off the end. If the caller passes us bogus data, they may get a
5759 * bogus result.
5760 */
5761 ((char *)input)[uap->size - 1] = 0;
5762
5763 /*
5764 * Access is defined as checking against the process' real identity,
5765 * even if operations are checking the effective identity. This
5766 * requires that we use a local vfs context.
5767 */
5768 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5769 context.vc_thread = current_thread();
5770
5771 /*
5772 * Find out how many entries we have, so we can allocate the result
5773 * array by walking the list and adjusting the count downward by the
5774 * earliest string offset we see.
5775 */
5776 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5777 desc_actual = desc_max;
5778 for (i = 0; i < desc_actual; i++) {
5779 /*
5780 * Take the offset to the name string for this entry and
5781 * convert to an input array index, which would be one off
5782 * the end of the array if this entry was the lowest-addressed
5783 * name string.
5784 */
5785 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5786
5787 /*
5788 * An offset greater than the max allowable offset is an error.
5789 * It is also an error for any valid entry to point
5790 * to a location prior to the end of the current entry, if
5791 * it's not a reference to the string of the previous entry.
5792 */
5793 if (j > desc_max || (j != 0 && j <= i)) {
5794 error = EINVAL;
5795 goto out;
5796 }
5797
5798 /* Also do not let ad_name_offset point to something beyond the size of the input */
5799 if (input[i].ad_name_offset >= uap->size) {
5800 error = EINVAL;
5801 goto out;
5802 }
5803
5804 /*
5805 * An offset of 0 means use the previous descriptor's offset;
5806 * this is used to chain multiple requests for the same file
5807 * to avoid multiple lookups.
5808 */
5809 if (j == 0) {
5810 /* This is not valid for the first entry */
5811 if (i == 0) {
5812 error = EINVAL;
5813 goto out;
5814 }
5815 continue;
5816 }
5817
5818 /*
5819 * If the offset of the string for this descriptor is before
5820 * what we believe is the current actual last descriptor,
5821 * then we need to adjust our estimate downward; this permits
5822 * the string table following the last descriptor to be out
5823 * of order relative to the descriptor list.
5824 */
5825 if (j < desc_actual) {
5826 desc_actual = j;
5827 }
5828 }
5829
5830 /*
5831 * We limit the actual number of descriptors we are willing to process
5832 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5833 * requested exceeds this limit, we fail the request with ENOMEM.
5834 */
5835 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5836 error = ENOMEM;
5837 goto out;
5838 }
5839 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5840 if (result == NULL) {
5841 error = ENOMEM;
5842 goto out;
5843 }
5844
5845 /*
5846 * Do the work by iterating over the descriptor entries we know to
5847 * at least appear to contain valid data.
5848 */
5849 error = 0;
5850 for (i = 0; i < desc_actual; i++) {
5851 /*
5852 * If the ad_name_offset is 0, then we use the previous
5853 * results to make the check; otherwise, we are looking up
5854 * a new file name.
5855 */
5856 if (input[i].ad_name_offset != 0) {
5857 /* discard old vnodes */
5858 if (vp) {
5859 vnode_put(vp);
5860 vp = NULL;
5861 }
5862 if (dvp) {
5863 vnode_put(dvp);
5864 dvp = NULL;
5865 }
5866
5867 /*
5868 * Scan forward in the descriptor list to see if we
5869 * need the parent vnode. We will need it if we are
5870 * deleting, since we must have rights to remove
5871 * entries in the parent directory, as well as the
5872 * rights to delete the object itself.
5873 */
5874 wantdelete = input[i].ad_flags & _DELETE_OK;
5875 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
5876 if (input[j].ad_flags & _DELETE_OK) {
5877 wantdelete = 1;
5878 }
5879 }
5880
5881 niopts = FOLLOW | AUDITVNPATH1;
5882
5883 /* need parent for vnode_authorize for deletion test */
5884 if (wantdelete) {
5885 niopts |= WANTPARENT;
5886 }
5887
5888 /* do the lookup */
5889 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5890 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5891 &context);
5892 error = namei(&nd);
5893 if (!error) {
5894 vp = nd.ni_vp;
5895 if (wantdelete) {
5896 dvp = nd.ni_dvp;
5897 }
5898 }
5899 nameidone(&nd);
5900 }
5901
5902 /*
5903 * Handle lookup errors.
5904 */
5905 switch (error) {
5906 case ENOENT:
5907 case EACCES:
5908 case EPERM:
5909 case ENOTDIR:
5910 result[i] = error;
5911 break;
5912 case 0:
5913 /* run this access check */
5914 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5915 break;
5916 default:
5917 /* fatal lookup error */
5918
5919 goto out;
5920 }
5921 }
5922
5923 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5924
5925 /* copy out results */
5926 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5927
5928 out:
5929 if (input && input != stack_input) {
5930 FREE(input, M_TEMP);
5931 }
5932 if (result) {
5933 FREE(result, M_TEMP);
5934 }
5935 if (vp) {
5936 vnode_put(vp);
5937 }
5938 if (dvp) {
5939 vnode_put(dvp);
5940 }
5941 if (IS_VALID_CRED(context.vc_ucred)) {
5942 kauth_cred_unref(&context.vc_ucred);
5943 }
5944 return error;
5945 }
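/*
 * Illustrative only: a hedged user-space sketch of the buffer layout
 * described in the Notes above, assuming the struct accessx_descriptor
 * definition in <sys/unistd.h> and an accessx_np() wrapper taking
 * (entries, size, results, uid).  The second descriptor uses an
 * ad_name_offset of 0 to reuse the first entry's name, the chaining
 * behaviour handled in the loop above.
 *
 *     #include <unistd.h>
 *     #include <sys/unistd.h>
 *     #include <string.h>
 *
 *     int
 *     check_read_and_write(const char *path)   // e.g. "/etc/hosts"
 *     {
 *             char buf[2 * sizeof(struct accessx_descriptor) + 256];
 *             struct accessx_descriptor *d = (struct accessx_descriptor *)buf;
 *             unsigned int names = 2 * sizeof(struct accessx_descriptor);
 *             int results[2];
 *
 *             memset(buf, 0, sizeof(buf));
 *             d[0].ad_name_offset = names;     // offset of the name string
 *             d[0].ad_flags = R_OK;
 *             d[1].ad_name_offset = 0;         // 0 => reuse previous name
 *             d[1].ad_flags = W_OK;
 *             strlcpy(buf + names, path, sizeof(buf) - names);
 *
 *             return accessx_np(d, names + strlen(buf + names) + 1,
 *                 results, getuid());
 *     }
 */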
5946
5947
5948 /*
5949 * Returns: 0 Success
5950 * namei:EFAULT Bad address
5951 * namei:ENAMETOOLONG Filename too long
5952 * namei:ENOENT No such file or directory
5953 * namei:ELOOP Too many levels of symbolic links
5954 * namei:EBADF Bad file descriptor
5955 * namei:ENOTDIR Not a directory
5956 * namei:???
5957 * access1:
5958 */
5959 static int
5960 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5961 int flag, enum uio_seg segflg)
5962 {
5963 int error;
5964 struct nameidata nd;
5965 int niopts;
5966 struct vfs_context context;
5967 #if NAMEDRSRCFORK
5968 int is_namedstream = 0;
5969 #endif
5970
5971 /*
5972 * Unless the AT_EACCESS option is used, Access is defined as checking
5973 * against the process' real identity, even if operations are checking
5974 * the effective identity. So we need to tweak the credential
5975 * in the context for that case.
5976 */
5977 if (!(flag & AT_EACCESS)) {
5978 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5979 } else {
5980 context.vc_ucred = ctx->vc_ucred;
5981 }
5982 context.vc_thread = ctx->vc_thread;
5983
5984
5985 niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
5986 /* need parent for vnode_authorize for deletion test */
5987 if (amode & _DELETE_OK) {
5988 niopts |= WANTPARENT;
5989 }
5990 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5991 path, &context);
5992
5993 #if NAMEDRSRCFORK
5994 /* access(F_OK) calls are allowed for resource forks. */
5995 if (amode == F_OK) {
5996 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5997 }
5998 #endif
5999 error = nameiat(&nd, fd);
6000 if (error) {
6001 goto out;
6002 }
6003
6004 #if NAMEDRSRCFORK
6005 /* Grab reference on the shadow stream file vnode to
6006 * force an inactive on release which will mark it
6007 * for recycle.
6008 */
6009 if (vnode_isnamedstream(nd.ni_vp) &&
6010 (nd.ni_vp->v_parent != NULLVP) &&
6011 vnode_isshadow(nd.ni_vp)) {
6012 is_namedstream = 1;
6013 vnode_ref(nd.ni_vp);
6014 }
6015 #endif
6016
6017 error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
6018
6019 #if NAMEDRSRCFORK
6020 if (is_namedstream) {
6021 vnode_rele(nd.ni_vp);
6022 }
6023 #endif
6024
6025 vnode_put(nd.ni_vp);
6026 if (amode & _DELETE_OK) {
6027 vnode_put(nd.ni_dvp);
6028 }
6029 nameidone(&nd);
6030
6031 out:
6032 if (!(flag & AT_EACCESS)) {
6033 kauth_cred_unref(&context.vc_ucred);
6034 }
6035 return error;
6036 }
6037
6038 int
6039 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6040 {
6041 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6042 uap->path, uap->flags, 0, UIO_USERSPACE);
6043 }
6044
6045 int
6046 faccessat(__unused proc_t p, struct faccessat_args *uap,
6047 __unused int32_t *retval)
6048 {
6049 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6050 return EINVAL;
6051 }
6052
6053 return faccessat_internal(vfs_context_current(), uap->fd,
6054 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6055 }
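/*
 * Illustrative only: a minimal user-space sketch of access()/faccessat().
 * As in the handlers above, the default check uses the caller's real
 * uid/gid; AT_EACCESS switches to the effective identity, and
 * AT_SYMLINK_NOFOLLOW checks the link itself.  The path is an arbitrary
 * example.
 *
 *     #include <unistd.h>
 *     #include <fcntl.h>
 *
 *     int
 *     can_write_config(void)
 *     {
 *             // real-uid check, following symlinks
 *             if (access("/etc/example.conf", W_OK) == 0)
 *                     return 1;
 *
 *             // effective-uid check, not following a trailing symlink
 *             return faccessat(AT_FDCWD, "/etc/example.conf", W_OK,
 *                 AT_EACCESS | AT_SYMLINK_NOFOLLOW) == 0;
 *     }
 */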
6056
6057 /*
6058 * Returns: 0 Success
6059 * EFAULT
6060 * copyout:EFAULT
6061 * namei:???
6062 * vn_stat:???
6063 */
6064 static int
6065 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6066 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6067 enum uio_seg segflg, int fd, int flag)
6068 {
6069 struct nameidata nd;
6070 int follow;
6071 union {
6072 struct stat sb;
6073 struct stat64 sb64;
6074 } source = {};
6075 union {
6076 struct user64_stat user64_sb;
6077 struct user32_stat user32_sb;
6078 struct user64_stat64 user64_sb64;
6079 struct user32_stat64 user32_sb64;
6080 } dest = {};
6081 caddr_t sbp;
6082 int error, my_size;
6083 kauth_filesec_t fsec;
6084 size_t xsecurity_bufsize;
6085 void * statptr;
6086 struct fileproc *fp = NULL;
6087 int needsrealdev = 0;
6088
6089 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6090 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6091 segflg, path, ctx);
6092
6093 #if NAMEDRSRCFORK
6094 int is_namedstream = 0;
6095 /* stat calls are allowed for resource forks. */
6096 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6097 #endif
6098
6099 if (flag & AT_FDONLY) {
6100 vnode_t fvp;
6101
6102 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6103 if (error) {
6104 return error;
6105 }
6106 if ((error = vnode_getwithref(fvp))) {
6107 file_drop(fd);
6108 return error;
6109 }
6110 nd.ni_vp = fvp;
6111 } else {
6112 error = nameiat(&nd, fd);
6113 if (error) {
6114 return error;
6115 }
6116 }
6117 fsec = KAUTH_FILESEC_NONE;
6118
6119 statptr = (void *)&source;
6120
6121 #if NAMEDRSRCFORK
6122 /* Grab reference on the shadow stream file vnode to
6123 * force an inactive on release which will mark it
6124 * for recycle.
6125 */
6126 if (vnode_isnamedstream(nd.ni_vp) &&
6127 (nd.ni_vp->v_parent != NULLVP) &&
6128 vnode_isshadow(nd.ni_vp)) {
6129 is_namedstream = 1;
6130 vnode_ref(nd.ni_vp);
6131 }
6132 #endif
6133
6134 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6135 if (fp && (xsecurity == USER_ADDR_NULL)) {
6136 /*
6137 * If the caller has the file open, and is not
6138 * requesting extended security information, we are
6139 * going to let them get the basic stat information.
6140 */
6141 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6142 fp->f_fglob->fg_cred);
6143 } else {
6144 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6145 isstat64, needsrealdev, ctx);
6146 }
6147
6148 #if NAMEDRSRCFORK
6149 if (is_namedstream) {
6150 vnode_rele(nd.ni_vp);
6151 }
6152 #endif
6153 vnode_put(nd.ni_vp);
6154 nameidone(&nd);
6155 if (fp) {
6156 file_drop(fd);
6157 fp = NULL;
6158 }
6159
6160 if (error) {
6161 return error;
6162 }
6163 /* Zap spare fields */
6164 if (isstat64 != 0) {
6165 source.sb64.st_lspare = 0;
6166 source.sb64.st_qspare[0] = 0LL;
6167 source.sb64.st_qspare[1] = 0LL;
6168 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6169 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6170 my_size = sizeof(dest.user64_sb64);
6171 sbp = (caddr_t)&dest.user64_sb64;
6172 } else {
6173 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6174 my_size = sizeof(dest.user32_sb64);
6175 sbp = (caddr_t)&dest.user32_sb64;
6176 }
6177 /*
6178 * Check if we raced (post lookup) against the last unlink of a file.
6179 */
6180 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6181 source.sb64.st_nlink = 1;
6182 }
6183 } else {
6184 source.sb.st_lspare = 0;
6185 source.sb.st_qspare[0] = 0LL;
6186 source.sb.st_qspare[1] = 0LL;
6187 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6188 munge_user64_stat(&source.sb, &dest.user64_sb);
6189 my_size = sizeof(dest.user64_sb);
6190 sbp = (caddr_t)&dest.user64_sb;
6191 } else {
6192 munge_user32_stat(&source.sb, &dest.user32_sb);
6193 my_size = sizeof(dest.user32_sb);
6194 sbp = (caddr_t)&dest.user32_sb;
6195 }
6196
6197 /*
6198 * Check if we raced (post lookup) against the last unlink of a file.
6199 */
6200 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6201 source.sb.st_nlink = 1;
6202 }
6203 }
6204 if ((error = copyout(sbp, ub, my_size)) != 0) {
6205 goto out;
6206 }
6207
6208 /* caller wants extended security information? */
6209 if (xsecurity != USER_ADDR_NULL) {
6210 /* did we get any? */
6211 if (fsec == KAUTH_FILESEC_NONE) {
6212 if (susize(xsecurity_size, 0) != 0) {
6213 error = EFAULT;
6214 goto out;
6215 }
6216 } else {
6217 /* find the user buffer size */
6218 xsecurity_bufsize = fusize(xsecurity_size);
6219
6220 /* copy out the actual data size */
6221 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6222 error = EFAULT;
6223 goto out;
6224 }
6225
6226 /* if the caller supplied enough room, copy out to it */
6227 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6228 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6229 }
6230 }
6231 }
6232 out:
6233 if (fsec != KAUTH_FILESEC_NONE) {
6234 kauth_filesec_free(fsec);
6235 }
6236 return error;
6237 }
6238
6239 /*
6240 * stat_extended: Get file status; with extended security (ACL).
6241 *
6242 * Parameters: p (ignored)
6243 * uap User argument descriptor (see below)
6244 * retval (ignored)
6245 *
6246 * Indirect: uap->path Path of file to get status from
6247 * uap->ub User buffer (holds file status info)
6248 * uap->xsecurity ACL to get (extended security)
6249 * uap->xsecurity_size Size of ACL
6250 *
6251 * Returns: 0 Success
6252 * !0 errno value
6253 *
6254 */
6255 int
6256 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6257 __unused int32_t *retval)
6258 {
6259 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6260 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6261 0);
6262 }
6263
6264 /*
6265 * Returns: 0 Success
6266 * fstatat_internal:??? [see fstatat_internal() in this file]
6267 */
6268 int
6269 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6270 {
6271 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6272 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6273 }
6274
6275 int
6276 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6277 {
6278 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6279 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6280 }
6281
6282 /*
6283 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6284 *
6285 * Parameters: p (ignored)
6286 * uap User argument descriptor (see below)
6287 * retval (ignored)
6288 *
6289 * Indirect: uap->path Path of file to get status from
6290 * uap->ub User buffer (holds file status info)
6291 * uap->xsecurity ACL to get (extended security)
6292 * uap->xsecurity_size Size of ACL
6293 *
6294 * Returns: 0 Success
6295 * !0 errno value
6296 *
6297 */
6298 int
6299 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6300 {
6301 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6302 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6303 0);
6304 }
6305
6306 /*
6307 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6308 *
6309 * Parameters: p (ignored)
6310 * uap User argument descriptor (see below)
6311 * retval (ignored)
6312 *
6313 * Indirect: uap->path Path of file to get status from
6314 * uap->ub User buffer (holds file status info)
6315 * uap->xsecurity ACL to get (extended security)
6316 * uap->xsecurity_size Size of ACL
6317 *
6318 * Returns: 0 Success
6319 * !0 errno value
6320 *
6321 */
6322 int
6323 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6324 {
6325 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6326 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6327 AT_SYMLINK_NOFOLLOW);
6328 }
6329
6330 /*
6331 * Get file status; this version does not follow links.
6332 */
6333 int
6334 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6335 {
6336 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6337 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6338 }
6339
6340 int
6341 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6342 {
6343 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6344 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6345 }
6346
6347 /*
6348 * lstat64_extended: Get file status; can handle large inode numbers; does not
6349 * follow links; with extended security (ACL).
6350 *
6351 * Parameters: p (ignored)
6352 * uap User argument descriptor (see below)
6353 * retval (ignored)
6354 *
6355 * Indirect: uap->path Path of file to get status from
6356 * uap->ub User buffer (holds file status info)
6357 * uap->xsecurity ACL to get (extended security)
6358 * uap->xsecurity_size Size of ACL
6359 *
6360 * Returns: 0 Success
6361 * !0 errno value
6362 *
6363 */
6364 int
6365 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6366 {
6367 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6368 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6369 AT_SYMLINK_NOFOLLOW);
6370 }
6371
6372 int
6373 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6374 {
6375 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6376 return EINVAL;
6377 }
6378
6379 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6380 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6381 }
6382
6383 int
6384 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6385 __unused int32_t *retval)
6386 {
6387 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6388 return EINVAL;
6389 }
6390
6391 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6392 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6393 }
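/*
 * Usage sketch (userspace, illustrative; the path below is hypothetical):
 * fstatat()/fstatat64() accept AT_SYMLINK_NOFOLLOW, AT_REALDEV and
 * AT_FDONLY, which map onto the flag argument of fstatat_internal() above.
 * A minimal caller that stats a symlink itself rather than its target:
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct stat sb;
 *
 *		if (fstatat(AT_FDCWD, "/tmp/somelink", &sb,
 *		    AT_SYMLINK_NOFOLLOW) == -1) {
 *			perror("fstatat");
 *			return 1;
 *		}
 *		printf("mode 0%o size %lld\n", sb.st_mode & 07777,
 *		    (long long)sb.st_size);
 *		return 0;
 *	}
 */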
6394
6395 /*
6396 * Get configurable pathname variables.
6397 *
6398 * Returns: 0 Success
6399 * namei:???
6400 * vn_pathconf:???
6401 *
6402 * Notes: Global implementation constants are intended to be
6403 * implemented in this function directly; all other constants
6404 * are per-FS implementation, and therefore must be handled in
6405 * each respective FS, instead.
6406 *
6407 * XXX We implement some things globally right now that should actually be
6408 * XXX per-FS; we will need to deal with this at some point.
6409 */
6410 /* ARGSUSED */
6411 int
6412 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6413 {
6414 int error;
6415 struct nameidata nd;
6416 vfs_context_t ctx = vfs_context_current();
6417
6418 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6419 UIO_USERSPACE, uap->path, ctx);
6420 error = namei(&nd);
6421 if (error) {
6422 return error;
6423 }
6424
6425 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6426
6427 vnode_put(nd.ni_vp);
6428 nameidone(&nd);
6429 return error;
6430 }
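/*
 * Usage sketch (userspace, illustrative; the path is hypothetical): the
 * pathconf() wrapper above resolves the path and defers per-filesystem
 * selectors to vn_pathconf().  For limit-style selectors, -1 with errno
 * left unchanged means "no limit":
 *
 *	#include <unistd.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		long name_max;
 *
 *		errno = 0;
 *		name_max = pathconf("/tmp", _PC_NAME_MAX);
 *		if (name_max == -1 && errno != 0) {
 *			perror("pathconf");
 *			return 1;
 *		}
 *		printf("_PC_NAME_MAX = %ld\n", name_max);
 *		return 0;
 *	}
 */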
6431
6432 /*
6433 * Return target name of a symbolic link.
6434 */
6435 /* ARGSUSED */
6436 static int
6437 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
6438 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6439 int *retval)
6440 {
6441 vnode_t vp;
6442 uio_t auio;
6443 int error;
6444 struct nameidata nd;
6445 char uio_buf[UIO_SIZEOF(1)];
6446
6447 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6448 seg, path, ctx);
6449
6450 error = nameiat(&nd, fd);
6451 if (error) {
6452 return error;
6453 }
6454 vp = nd.ni_vp;
6455
6456 nameidone(&nd);
6457
6458 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6459 &uio_buf[0], sizeof(uio_buf));
6460 uio_addiov(auio, buf, bufsize);
6461 if (vp->v_type != VLNK) {
6462 error = EINVAL;
6463 } else {
6464 #if CONFIG_MACF
6465 error = mac_vnode_check_readlink(ctx, vp);
6466 #endif
6467 if (error == 0) {
6468 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6469 ctx);
6470 }
6471 if (error == 0) {
6472 error = VNOP_READLINK(vp, auio, ctx);
6473 }
6474 }
6475 vnode_put(vp);
6476
6477 *retval = bufsize - (int)uio_resid(auio);
6478 return error;
6479 }
6480
6481 int
6482 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6483 {
6484 enum uio_seg procseg;
6485
6486 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6487 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6488 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6489 uap->count, procseg, retval);
6490 }
6491
6492 int
6493 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6494 {
6495 enum uio_seg procseg;
6496
6497 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6498 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6499 procseg, uap->buf, uap->bufsize, procseg, retval);
6500 }
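/*
 * Usage sketch (userspace, illustrative; the link path is hypothetical):
 * readlink()/readlinkat() return the number of bytes placed in the buffer
 * (bufsize minus the residual, as computed above) and do not NUL-terminate
 * the result, so the caller terminates it:
 *
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		char buf[1024];
 *		ssize_t n;
 *
 *		n = readlink("/tmp/somelink", buf, sizeof(buf) - 1);
 *		if (n == -1) {
 *			perror("readlink");
 *			return 1;
 *		}
 *		buf[n] = '\0';
 *		printf("-> %s\n", buf);
 *		return 0;
 *	}
 */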
6501
6502 /*
6503 * Change file flags, the deep inner layer.
6504 */
6505 static int
6506 chflags0(vnode_t vp, struct vnode_attr *va,
6507 int (*setattr)(vnode_t, void *, vfs_context_t),
6508 void *arg, vfs_context_t ctx)
6509 {
6510 kauth_action_t action = 0;
6511 int error;
6512
6513 #if CONFIG_MACF
6514 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
6515 if (error) {
6516 goto out;
6517 }
6518 #endif
6519
6520 /* request authorisation, disregard immutability */
6521 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
6522 goto out;
6523 }
6524 /*
6525 * Request that the auth layer disregard those file flags it's allowed to when
6526 * authorizing this operation; we need to do this in order to be able to
6527 * clear immutable flags.
6528 */
6529 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
6530 goto out;
6531 }
6532 error = (*setattr)(vp, arg, ctx);
6533
6534 #if CONFIG_MACF
6535 if (error == 0) {
6536 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
6537 }
6538 #endif
6539
6540 out:
6541 return error;
6542 }
6543
6544 /*
6545 * Change file flags.
6546 *
6547 * NOTE: this will vnode_put() `vp'
6548 */
6549 static int
6550 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6551 {
6552 struct vnode_attr va;
6553 int error;
6554
6555 VATTR_INIT(&va);
6556 VATTR_SET(&va, va_flags, flags);
6557
6558 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6559 vnode_put(vp);
6560
6561 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6562 error = ENOTSUP;
6563 }
6564
6565 return error;
6566 }
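/*
 * Usage sketch (userspace, illustrative; the path is hypothetical):
 * chflags0() above asks the auth layer to disregard immutability
 * (KAUTH_VNODE_NOIMMUTABLE) precisely so that a flag set like the one
 * below can later be cleared by the owner:
 *
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (chflags("/tmp/lockme", UF_IMMUTABLE) == -1) {
 *			perror("chflags set");
 *			return 1;
 *		}
 *		if (chflags("/tmp/lockme", 0) == -1) {
 *			perror("chflags clear");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */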
6567
6568 /*
6569 * Change flags of a file given a path name.
6570 */
6571 /* ARGSUSED */
6572 int
6573 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6574 {
6575 vnode_t vp;
6576 vfs_context_t ctx = vfs_context_current();
6577 int error;
6578 struct nameidata nd;
6579
6580 AUDIT_ARG(fflags, uap->flags);
6581 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6582 UIO_USERSPACE, uap->path, ctx);
6583 error = namei(&nd);
6584 if (error) {
6585 return error;
6586 }
6587 vp = nd.ni_vp;
6588 nameidone(&nd);
6589
6590 /* we don't vnode_put() here because chflags1 does so internally */
6591 error = chflags1(vp, uap->flags, ctx);
6592
6593 return error;
6594 }
6595
6596 /*
6597 * Change flags of a file given a file descriptor.
6598 */
6599 /* ARGSUSED */
6600 int
6601 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6602 {
6603 vnode_t vp;
6604 int error;
6605
6606 AUDIT_ARG(fd, uap->fd);
6607 AUDIT_ARG(fflags, uap->flags);
6608 if ((error = file_vnode(uap->fd, &vp))) {
6609 return error;
6610 }
6611
6612 if ((error = vnode_getwithref(vp))) {
6613 file_drop(uap->fd);
6614 return error;
6615 }
6616
6617 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6618
6619 /* we don't vnode_put() here because chflags1 does so internally */
6620 error = chflags1(vp, uap->flags, vfs_context_current());
6621
6622 file_drop(uap->fd);
6623 return error;
6624 }
6625
6626 /*
6627 * Change security information on a filesystem object.
6628 *
6629 * Returns: 0 Success
6630 * EPERM Operation not permitted
6631 * vnode_authattr:??? [anything vnode_authattr can return]
6632 * vnode_authorize:??? [anything vnode_authorize can return]
6633 * vnode_setattr:??? [anything vnode_setattr can return]
6634 *
6635 * Notes: If vnode_authattr or vnode_authorize returns EACCES, it will be
6636 * translated to EPERM before being returned.
6637 */
6638 static int
6639 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
6640 {
6641 kauth_action_t action;
6642 int error;
6643
6644 AUDIT_ARG(mode, vap->va_mode);
6645 /* XXX audit new args */
6646
6647 #if NAMEDSTREAMS
6648 /* chmod calls are not allowed for resource forks. */
6649 if (vp->v_flag & VISNAMEDSTREAM) {
6650 return EPERM;
6651 }
6652 #endif
6653
6654 #if CONFIG_MACF
6655 if (VATTR_IS_ACTIVE(vap, va_mode) &&
6656 (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
6657 return error;
6658 }
6659
6660 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6661 if ((error = mac_vnode_check_setowner(ctx, vp,
6662 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6663 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
6664 return error;
6665 }
6666 }
6667
6668 if (VATTR_IS_ACTIVE(vap, va_acl) &&
6669 (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
6670 return error;
6671 }
6672 #endif
6673
6674 /* make sure that the caller is allowed to set this security information */
6675 if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
6676 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6677 if (error == EACCES) {
6678 error = EPERM;
6679 }
6680 return error;
6681 }
6682
6683 if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
6684 return error;
6685 }
6686
6687 #if CONFIG_MACF
6688 if (VATTR_IS_ACTIVE(vap, va_mode)) {
6689 mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
6690 }
6691
6692 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6693 mac_vnode_notify_setowner(ctx, vp,
6694 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6695 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
6696 }
6697
6698 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6699 mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
6700 }
6701 #endif
6702
6703 return error;
6704 }
6705
6706
6707 /*
6708 * Change mode of a file given a path name.
6709 *
6710 * Returns: 0 Success
6711 * namei:??? [anything namei can return]
6712 * chmod_vnode:??? [anything chmod_vnode can return]
6713 */
6714 static int
6715 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6716 int fd, int flag, enum uio_seg segflg)
6717 {
6718 struct nameidata nd;
6719 int follow, error;
6720
6721 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6722 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6723 segflg, path, ctx);
6724 if ((error = nameiat(&nd, fd))) {
6725 return error;
6726 }
6727 error = chmod_vnode(ctx, nd.ni_vp, vap);
6728 vnode_put(nd.ni_vp);
6729 nameidone(&nd);
6730 return error;
6731 }
6732
6733 /*
6734 * chmod_extended: Change the mode of a file given a path name; with extended
6735 * argument list (including extended security (ACL)).
6736 *
6737 * Parameters: p Process requesting the open
6738 * uap User argument descriptor (see below)
6739 * retval (ignored)
6740 *
6741 * Indirect: uap->path Path to object (same as 'chmod')
6742 * uap->uid UID to set
6743 * uap->gid GID to set
6744 * uap->mode File mode to set (same as 'chmod')
6745 * uap->xsecurity ACL to set (or delete)
6746 *
6747 * Returns: 0 Success
6748 * !0 errno value
6749 *
6750 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6751 *
6752 * XXX: We should enummerate the possible errno values here, and where
6753 * in the code they originated.
6754 */
6755 int
6756 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
6757 {
6758 int error;
6759 struct vnode_attr va;
6760 kauth_filesec_t xsecdst;
6761
6762 AUDIT_ARG(owner, uap->uid, uap->gid);
6763
6764 VATTR_INIT(&va);
6765 if (uap->mode != -1) {
6766 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6767 }
6768 if (uap->uid != KAUTH_UID_NONE) {
6769 VATTR_SET(&va, va_uid, uap->uid);
6770 }
6771 if (uap->gid != KAUTH_GID_NONE) {
6772 VATTR_SET(&va, va_gid, uap->gid);
6773 }
6774
6775 xsecdst = NULL;
6776 switch (uap->xsecurity) {
6777 /* explicit remove request */
6778 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6779 VATTR_SET(&va, va_acl, NULL);
6780 break;
6781 /* not being set */
6782 case USER_ADDR_NULL:
6783 break;
6784 default:
6785 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6786 return error;
6787 }
6788 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6789 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
6790 }
6791
6792 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
6793 UIO_USERSPACE);
6794
6795 if (xsecdst != NULL) {
6796 kauth_filesec_free(xsecdst);
6797 }
6798 return error;
6799 }
6800
6801 /*
6802 * Returns: 0 Success
6803 * chmodat:??? [anything chmodat can return]
6804 */
6805 static int
6806 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6807 int flag, enum uio_seg segflg)
6808 {
6809 struct vnode_attr va;
6810
6811 VATTR_INIT(&va);
6812 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6813
6814 return chmodat(ctx, path, &va, fd, flag, segflg);
6815 }
6816
6817 int
6818 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6819 {
6820 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6821 AT_FDCWD, 0, UIO_USERSPACE);
6822 }
6823
6824 int
6825 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6826 {
6827 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6828 return EINVAL;
6829 }
6830
6831 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6832 uap->fd, uap->flag, UIO_USERSPACE);
6833 }
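/*
 * Usage sketch (userspace, illustrative; the path and mode are hypothetical):
 * fchmodat() accepts only AT_SYMLINK_NOFOLLOW; any other flag bit fails the
 * check above with EINVAL, and the mode is masked with ALLPERMS by
 * fchmodat_internal():
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (fchmodat(AT_FDCWD, "/tmp/afile", 0640, 0) == -1) {
 *			perror("fchmodat");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */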
6834
6835 /*
6836 * Change mode of a file given a file descriptor.
6837 */
6838 static int
6839 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6840 {
6841 vnode_t vp;
6842 int error;
6843
6844 AUDIT_ARG(fd, fd);
6845
6846 if ((error = file_vnode(fd, &vp)) != 0) {
6847 return error;
6848 }
6849 if ((error = vnode_getwithref(vp)) != 0) {
6850 file_drop(fd);
6851 return error;
6852 }
6853 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6854
6855 error = chmod_vnode(vfs_context_current(), vp, vap);
6856 (void)vnode_put(vp);
6857 file_drop(fd);
6858
6859 return error;
6860 }
6861
6862 /*
6863 * fchmod_extended: Change mode of a file given a file descriptor; with
6864 * extended argument list (including extended security (ACL)).
6865 *
6866 * Parameters: p Process requesting to change file mode
6867 * uap User argument descriptor (see below)
6868 * retval (ignored)
6869 *
6870 * Indirect: uap->mode File mode to set (same as 'chmod')
6871 * uap->uid UID to set
6872 * uap->gid GID to set
6873 * uap->xsecurity ACL to set (or delete)
6874 * uap->fd File descriptor of file to change mode
6875 *
6876 * Returns: 0 Success
6877 * !0 errno value
6878 *
6879 */
6880 int
6881 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6882 {
6883 int error;
6884 struct vnode_attr va;
6885 kauth_filesec_t xsecdst;
6886
6887 AUDIT_ARG(owner, uap->uid, uap->gid);
6888
6889 VATTR_INIT(&va);
6890 if (uap->mode != -1) {
6891 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6892 }
6893 if (uap->uid != KAUTH_UID_NONE) {
6894 VATTR_SET(&va, va_uid, uap->uid);
6895 }
6896 if (uap->gid != KAUTH_GID_NONE) {
6897 VATTR_SET(&va, va_gid, uap->gid);
6898 }
6899
6900 xsecdst = NULL;
6901 switch (uap->xsecurity) {
6902 case USER_ADDR_NULL:
6903 VATTR_SET(&va, va_acl, NULL);
6904 break;
6905 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6906 VATTR_SET(&va, va_acl, NULL);
6907 break;
6908 /* not being set */
6909 case CAST_USER_ADDR_T(-1):
6910 break;
6911 default:
6912 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6913 return error;
6914 }
6915 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6916 }
6917
6918 error = fchmod1(p, uap->fd, &va);
6919
6920
6921 switch (uap->xsecurity) {
6922 case USER_ADDR_NULL:
6923 case CAST_USER_ADDR_T(-1):
6924 break;
6925 default:
6926 if (xsecdst != NULL) {
6927 kauth_filesec_free(xsecdst);
6928 }
6929 }
6930 return error;
6931 }
6932
6933 int
6934 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6935 {
6936 struct vnode_attr va;
6937
6938 VATTR_INIT(&va);
6939 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6940
6941 return fchmod1(p, uap->fd, &va);
6942 }
6943
6944
6945 /*
6946 * Set ownership given a path name.
6947 */
6948 /* ARGSUSED */
6949 static int
6950 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6951 gid_t gid, int flag, enum uio_seg segflg)
6952 {
6953 vnode_t vp;
6954 struct vnode_attr va;
6955 int error;
6956 struct nameidata nd;
6957 int follow;
6958 kauth_action_t action;
6959
6960 AUDIT_ARG(owner, uid, gid);
6961
6962 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6963 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6964 path, ctx);
6965 error = nameiat(&nd, fd);
6966 if (error) {
6967 return error;
6968 }
6969 vp = nd.ni_vp;
6970
6971 nameidone(&nd);
6972
6973 VATTR_INIT(&va);
6974 if (uid != (uid_t)VNOVAL) {
6975 VATTR_SET(&va, va_uid, uid);
6976 }
6977 if (gid != (gid_t)VNOVAL) {
6978 VATTR_SET(&va, va_gid, gid);
6979 }
6980
6981 #if CONFIG_MACF
6982 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6983 if (error) {
6984 goto out;
6985 }
6986 #endif
6987
6988 /* preflight and authorize attribute changes */
6989 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6990 goto out;
6991 }
6992 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6993 goto out;
6994 }
6995 error = vnode_setattr(vp, &va, ctx);
6996
6997 #if CONFIG_MACF
6998 if (error == 0) {
6999 mac_vnode_notify_setowner(ctx, vp, uid, gid);
7000 }
7001 #endif
7002
7003 out:
7004 /*
7005 * EACCES is only allowed from namei(); permissions failure should
7006 * return EPERM, so we need to translate the error code.
7007 */
7008 if (error == EACCES) {
7009 error = EPERM;
7010 }
7011
7012 vnode_put(vp);
7013 return error;
7014 }
7015
7016 int
7017 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7018 {
7019 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7020 uap->uid, uap->gid, 0, UIO_USERSPACE);
7021 }
7022
7023 int
7024 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7025 {
7026 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7027 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7028 }
7029
7030 int
7031 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7032 {
7033 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7034 return EINVAL;
7035 }
7036
7037 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7038 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7039 }
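/*
 * Usage sketch (userspace, illustrative; the path and gid are hypothetical):
 * passing (uid_t)-1 or (gid_t)-1 (VNOVAL) leaves that id unchanged, as in
 * fchownat_internal() above; AT_SYMLINK_NOFOLLOW operates on the link
 * itself:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (fchownat(AT_FDCWD, "/tmp/afile", (uid_t)-1, (gid_t)20,
 *		    AT_SYMLINK_NOFOLLOW) == -1) {
 *			perror("fchownat");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */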
7040
7041 /*
7042 * Set ownership given a file descriptor.
7043 */
7044 /* ARGSUSED */
7045 int
7046 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7047 {
7048 struct vnode_attr va;
7049 vfs_context_t ctx = vfs_context_current();
7050 vnode_t vp;
7051 int error;
7052 kauth_action_t action;
7053
7054 AUDIT_ARG(owner, uap->uid, uap->gid);
7055 AUDIT_ARG(fd, uap->fd);
7056
7057 if ((error = file_vnode(uap->fd, &vp))) {
7058 return error;
7059 }
7060
7061 if ((error = vnode_getwithref(vp))) {
7062 file_drop(uap->fd);
7063 return error;
7064 }
7065 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7066
7067 VATTR_INIT(&va);
7068 if (uap->uid != VNOVAL) {
7069 VATTR_SET(&va, va_uid, uap->uid);
7070 }
7071 if (uap->gid != VNOVAL) {
7072 VATTR_SET(&va, va_gid, uap->gid);
7073 }
7074
7075 #if NAMEDSTREAMS
7076 /* chown calls are not allowed for resource forks. */
7077 if (vp->v_flag & VISNAMEDSTREAM) {
7078 error = EPERM;
7079 goto out;
7080 }
7081 #endif
7082
7083 #if CONFIG_MACF
7084 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
7085 if (error) {
7086 goto out;
7087 }
7088 #endif
7089
7090 /* preflight and authorize attribute changes */
7091 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7092 goto out;
7093 }
7094 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7095 if (error == EACCES) {
7096 error = EPERM;
7097 }
7098 goto out;
7099 }
7100 error = vnode_setattr(vp, &va, ctx);
7101
7102 #if CONFIG_MACF
7103 if (error == 0) {
7104 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
7105 }
7106 #endif
7107
7108 out:
7109 (void)vnode_put(vp);
7110 file_drop(uap->fd);
7111 return error;
7112 }
7113
7114 static int
7115 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7116 {
7117 int error;
7118
7119 if (usrtvp == USER_ADDR_NULL) {
7120 struct timeval old_tv;
7121 /* XXX Y2038 bug because of microtime argument */
7122 microtime(&old_tv);
7123 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7124 tsp[1] = tsp[0];
7125 } else {
7126 if (IS_64BIT_PROCESS(current_proc())) {
7127 struct user64_timeval tv[2];
7128 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7129 if (error) {
7130 return error;
7131 }
7132 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7133 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7134 } else {
7135 struct user32_timeval tv[2];
7136 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7137 if (error) {
7138 return error;
7139 }
7140 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7141 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7142 }
7143 }
7144 return 0;
7145 }
7146
7147 static int
7148 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
7149 int nullflag)
7150 {
7151 int error;
7152 struct vnode_attr va;
7153 kauth_action_t action;
7154
7155 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7156
7157 VATTR_INIT(&va);
7158 VATTR_SET(&va, va_access_time, ts[0]);
7159 VATTR_SET(&va, va_modify_time, ts[1]);
7160 if (nullflag) {
7161 va.va_vaflags |= VA_UTIMES_NULL;
7162 }
7163
7164 #if NAMEDSTREAMS
7165 /* utimes calls are not allowed for resource forks. */
7166 if (vp->v_flag & VISNAMEDSTREAM) {
7167 error = EPERM;
7168 goto out;
7169 }
7170 #endif
7171
7172 #if CONFIG_MACF
7173 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
7174 if (error) {
7175 goto out;
7176 }
7177 #endif
7178 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7179 if (!nullflag && error == EACCES) {
7180 error = EPERM;
7181 }
7182 goto out;
7183 }
7184
7185 /* since we may not need to auth anything, check here */
7186 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7187 if (!nullflag && error == EACCES) {
7188 error = EPERM;
7189 }
7190 goto out;
7191 }
7192 error = vnode_setattr(vp, &va, ctx);
7193
7194 #if CONFIG_MACF
7195 if (error == 0) {
7196 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
7197 }
7198 #endif
7199
7200 out:
7201 return error;
7202 }
7203
7204 /*
7205 * Set the access and modification times of a file.
7206 */
7207 /* ARGSUSED */
7208 int
7209 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
7210 {
7211 struct timespec ts[2];
7212 user_addr_t usrtvp;
7213 int error;
7214 struct nameidata nd;
7215 vfs_context_t ctx = vfs_context_current();
7216
7217 /*
7218 * AUDIT: Needed to change the order of operations to do the
7219 * name lookup first because auditing wants the path.
7220 */
7221 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
7222 UIO_USERSPACE, uap->path, ctx);
7223 error = namei(&nd);
7224 if (error) {
7225 return error;
7226 }
7227 nameidone(&nd);
7228
7229 /*
7230 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
7231 * the current time instead.
7232 */
7233 usrtvp = uap->tptr;
7234 if ((error = getutimes(usrtvp, ts)) != 0) {
7235 goto out;
7236 }
7237
7238 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
7239
7240 out:
7241 vnode_put(nd.ni_vp);
7242 return error;
7243 }
7244
7245 /*
7246 * Set the access and modification times of a file.
7247 */
7248 /* ARGSUSED */
7249 int
7250 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7251 {
7252 struct timespec ts[2];
7253 vnode_t vp;
7254 user_addr_t usrtvp;
7255 int error;
7256
7257 AUDIT_ARG(fd, uap->fd);
7258 usrtvp = uap->tptr;
7259 if ((error = getutimes(usrtvp, ts)) != 0) {
7260 return error;
7261 }
7262 if ((error = file_vnode(uap->fd, &vp)) != 0) {
7263 return error;
7264 }
7265 if ((error = vnode_getwithref(vp))) {
7266 file_drop(uap->fd);
7267 return error;
7268 }
7269
7270 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7271 vnode_put(vp);
7272 file_drop(uap->fd);
7273 return error;
7274 }
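/*
 * Usage sketch (userspace, illustrative; the path is hypothetical): a NULL
 * times pointer corresponds to the usrtvp == USER_ADDR_NULL case above, so
 * both timestamps are set to the current time under VA_UTIMES_NULL
 * semantics; an explicit array sets them individually:
 *
 *	#include <sys/time.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct timeval tv[2] = {
 *			{ .tv_sec = 0, .tv_usec = 0 },		// access time
 *			{ .tv_sec = 1000000000, .tv_usec = 0 },	// modify time
 *		};
 *
 *		if (utimes("/tmp/afile", NULL) == -1) {	// "set to now"
 *			perror("utimes");
 *			return 1;
 *		}
 *		if (utimes("/tmp/afile", tv) == -1) {
 *			perror("utimes");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */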
7275
7276 /*
7277 * Truncate a file given its path name.
7278 */
7279 /* ARGSUSED */
7280 int
7281 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
7282 {
7283 vnode_t vp;
7284 struct vnode_attr va;
7285 vfs_context_t ctx = vfs_context_current();
7286 int error;
7287 struct nameidata nd;
7288 kauth_action_t action;
7289
7290 if (uap->length < 0) {
7291 return EINVAL;
7292 }
7293 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
7294 UIO_USERSPACE, uap->path, ctx);
7295 if ((error = namei(&nd))) {
7296 return error;
7297 }
7298 vp = nd.ni_vp;
7299
7300 nameidone(&nd);
7301
7302 VATTR_INIT(&va);
7303 VATTR_SET(&va, va_data_size, uap->length);
7304
7305 #if CONFIG_MACF
7306 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
7307 if (error) {
7308 goto out;
7309 }
7310 #endif
7311
7312 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7313 goto out;
7314 }
7315 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7316 goto out;
7317 }
7318 error = vnode_setattr(vp, &va, ctx);
7319
7320 #if CONFIG_MACF
7321 if (error == 0) {
7322 mac_vnode_notify_truncate(ctx, NOCRED, vp);
7323 }
7324 #endif
7325
7326 out:
7327 vnode_put(vp);
7328 return error;
7329 }
7330
7331 /*
7332 * Truncate a file given a file descriptor.
7333 */
7334 /* ARGSUSED */
7335 int
7336 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
7337 {
7338 vfs_context_t ctx = vfs_context_current();
7339 struct vnode_attr va;
7340 vnode_t vp;
7341 struct fileproc *fp;
7342 int error;
7343 int fd = uap->fd;
7344
7345 AUDIT_ARG(fd, uap->fd);
7346 if (uap->length < 0) {
7347 return EINVAL;
7348 }
7349
7350 if ((error = fp_lookup(p, fd, &fp, 0))) {
7351 return error;
7352 }
7353
7354 switch (FILEGLOB_DTYPE(fp->f_fglob)) {
7355 case DTYPE_PSXSHM:
7356 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
7357 goto out;
7358 case DTYPE_VNODE:
7359 break;
7360 default:
7361 error = EINVAL;
7362 goto out;
7363 }
7364
7365 vp = (vnode_t)fp->f_fglob->fg_data;
7366
7367 if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
7368 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7369 error = EINVAL;
7370 goto out;
7371 }
7372
7373 if ((error = vnode_getwithref(vp)) != 0) {
7374 goto out;
7375 }
7376
7377 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7378
7379 #if CONFIG_MACF
7380 error = mac_vnode_check_truncate(ctx,
7381 fp->f_fglob->fg_cred, vp);
7382 if (error) {
7383 (void)vnode_put(vp);
7384 goto out;
7385 }
7386 #endif
7387 VATTR_INIT(&va);
7388 VATTR_SET(&va, va_data_size, uap->length);
7389 error = vnode_setattr(vp, &va, ctx);
7390
7391 #if CONFIG_MACF
7392 if (error == 0) {
7393 mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
7394 }
7395 #endif
7396
7397 (void)vnode_put(vp);
7398 out:
7399 file_drop(fd);
7400 return error;
7401 }
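/*
 * Usage sketch (userspace, illustrative; the path is hypothetical): the
 * descriptor must be open for writing or the FWRITE check above fails with
 * EINVAL, and negative lengths are rejected before the descriptor is even
 * looked up:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = open("/tmp/afile", O_WRONLY | O_CREAT, 0644);
 *
 *		if (fd == -1) {
 *			perror("open");
 *			return 1;
 *		}
 *		if (ftruncate(fd, 4096) == -1) {
 *			perror("ftruncate");
 *			close(fd);
 *			return 1;
 *		}
 *		close(fd);
 *		return 0;
 *	}
 */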
7402
7403
7404 /*
7405 * Sync an open file with synchronized I/O _file_ integrity completion
7406 */
7407 /* ARGSUSED */
7408 int
7409 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7410 {
7411 __pthread_testcancel(1);
7412 return fsync_common(p, uap, MNT_WAIT);
7413 }
7414
7415
7416 /*
7417 * Sync an open file with synchronized I/O _file_ integrity completion
7418 *
7419 * Notes: This is a legacy support function that does not test for
7420 * thread cancellation points.
7421 */
7422 /* ARGSUSED */
7423 int
7424 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7425 {
7426 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7427 }
7428
7429
7430 /*
7431 * Sync an open file with synchronized I/O _data_ integrity completion
7432 */
7433 /* ARGSUSED */
7434 int
7435 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7436 {
7437 __pthread_testcancel(1);
7438 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7439 }
7440
7441
7442 /*
7443 * fsync_common
7444 *
7445 * Common fsync code to support both synchronized I/O file integrity completion
7446 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7447 *
7448 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7449 * will only guarantee that the file data contents are retrievable. If
7450 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
7451 * additionally requires that metadata not needed for retrieving the file
7452 * data contents, such as atime, mtime, ctime, etc., also be committed
7453 * to stable storage.
7454 *
7455 * Parameters: p The process
7456 * uap->fd The descriptor to synchronize
7457 * flags The data integrity flags
7458 *
7459 * Returns: 0 Success
7460 * fp_getfvp:EBADF Bad file descriptor
7461 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7462 * VNOP_FSYNC:??? unspecified
7463 *
7464 * Notes: We use struct fsync_args because it is a short name, and all
7465 * caller argument structures are otherwise identical.
7466 */
7467 static int
7468 fsync_common(proc_t p, struct fsync_args *uap, int flags)
7469 {
7470 vnode_t vp;
7471 struct fileproc *fp;
7472 vfs_context_t ctx = vfs_context_current();
7473 int error;
7474
7475 AUDIT_ARG(fd, uap->fd);
7476
7477 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
7478 return error;
7479 }
7480 if ((error = vnode_getwithref(vp))) {
7481 file_drop(uap->fd);
7482 return error;
7483 }
7484
7485 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7486
7487 error = VNOP_FSYNC(vp, flags, ctx);
7488
7489 #if NAMEDRSRCFORK
7490 /* Sync resource fork shadow file if necessary. */
7491 if ((error == 0) &&
7492 (vp->v_flag & VISNAMEDSTREAM) &&
7493 (vp->v_parent != NULLVP) &&
7494 vnode_isshadow(vp) &&
7495 (fp->f_flags & FP_WRITTEN)) {
7496 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
7497 }
7498 #endif
7499
7500 (void)vnode_put(vp);
7501 file_drop(uap->fd);
7502 return error;
7503 }
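/*
 * Usage sketch (userspace, illustrative; the path is hypothetical, and the
 * fdatasync(2) wrapper is assumed to be available in libc): fdatasync()
 * reaches fsync_common() with MNT_DWAIT (data integrity only), while
 * fsync() uses MNT_WAIT (file integrity, including metadata):
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = open("/tmp/afile", O_WRONLY | O_CREAT | O_APPEND, 0644);
 *
 *		if (fd == -1) {
 *			perror("open");
 *			return 1;
 *		}
 *		if (write(fd, "hello\n", 6) != 6) {
 *			perror("write");
 *		} else if (fdatasync(fd) == -1) {	// data integrity
 *			perror("fdatasync");
 *		} else if (fsync(fd) == -1) {		// file integrity
 *			perror("fsync");
 *		}
 *		close(fd);
 *		return 0;
 *	}
 */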
7504
7505 /*
7506 * Duplicate files. Source must be a file, target must be a file or
7507 * must not exist.
7508 *
7509 * XXX Copyfile authorisation checking is woefully inadequate, and will not
7510 * perform inheritance correctly.
7511 */
7512 /* ARGSUSED */
7513 int
7514 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
7515 {
7516 vnode_t tvp, fvp, tdvp, sdvp;
7517 struct nameidata fromnd, tond;
7518 int error;
7519 vfs_context_t ctx = vfs_context_current();
7520 #if CONFIG_MACF
7521 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
7522 struct vnode_attr va;
7523 #endif
7524
7525 /* Check that the flags are valid. */
7526
7527 if (uap->flags & ~CPF_MASK) {
7528 return EINVAL;
7529 }
7530
7531 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
7532 UIO_USERSPACE, uap->from, ctx);
7533 if ((error = namei(&fromnd))) {
7534 return error;
7535 }
7536 fvp = fromnd.ni_vp;
7537
7538 NDINIT(&tond, CREATE, OP_LINK,
7539 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7540 UIO_USERSPACE, uap->to, ctx);
7541 if ((error = namei(&tond))) {
7542 goto out1;
7543 }
7544 tdvp = tond.ni_dvp;
7545 tvp = tond.ni_vp;
7546
7547 if (tvp != NULL) {
7548 if (!(uap->flags & CPF_OVERWRITE)) {
7549 error = EEXIST;
7550 goto out;
7551 }
7552 }
7553
7554 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
7555 error = EISDIR;
7556 goto out;
7557 }
7558
7559 /* This calls existing MAC hooks for open */
7560 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
7561 NULL))) {
7562 goto out;
7563 }
7564
7565 if (tvp) {
7566 /*
7567 * See unlinkat_internal for an explanation of the potential
7568 * ENOENT from the MAC hook but the gist is that the MAC hook
7569 * can fail because vn_getpath isn't able to return the full
7570 * path. We choose to ignore this failure.
7571 */
7572 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
7573 if (error && error != ENOENT) {
7574 goto out;
7575 }
7576 error = 0;
7577 }
7578
7579 #if CONFIG_MACF
7580 VATTR_INIT(&va);
7581 VATTR_SET(&va, va_type, fvp->v_type);
7582 /* Mask off all but regular access permissions */
7583 VATTR_SET(&va, va_mode,
7584 ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
7585 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
7586 if (error) {
7587 goto out;
7588 }
7589 #endif /* CONFIG_MACF */
7590
7591 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
7592 goto out;
7593 }
7594
7595 if (fvp == tdvp) {
7596 error = EINVAL;
7597 }
7598 /*
7599 * If source is the same as the destination (that is the
7600 * same inode number) then there is nothing to do.
7601 * (fixed to have POSIX semantics - CSM 3/2/98)
7602 */
7603 if (fvp == tvp) {
7604 error = -1;
7605 }
7606 if (!error) {
7607 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
7608 }
7609 out:
7610 sdvp = tond.ni_startdir;
7611 /*
7612 * nameidone has to happen before we vnode_put(tdvp)
7613 * since it may need to release the fs_nodelock on the tdvp
7614 */
7615 nameidone(&tond);
7616
7617 if (tvp) {
7618 vnode_put(tvp);
7619 }
7620 vnode_put(tdvp);
7621 vnode_put(sdvp);
7622 out1:
7623 vnode_put(fvp);
7624
7625 nameidone(&fromnd);
7626
7627 if (error == -1) {
7628 return 0;
7629 }
7630 return error;
7631 }
7632
7633 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7634
7635 /*
7636 * Helper function for doing clones. The caller is expected to provide an
7637 * iocounted source vnode and release it.
7638 */
7639 static int
7640 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
7641 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
7642 {
7643 vnode_t tvp, tdvp;
7644 struct nameidata tond;
7645 int error;
7646 int follow;
7647 boolean_t free_src_acl;
7648 boolean_t attr_cleanup;
7649 enum vtype v_type;
7650 kauth_action_t action;
7651 struct componentname *cnp;
7652 uint32_t defaulted;
7653 struct vnode_attr va;
7654 struct vnode_attr nva;
7655 uint32_t vnop_flags;
7656
7657 v_type = vnode_vtype(fvp);
7658 switch (v_type) {
7659 case VLNK:
7660 /* FALLTHRU */
7661 case VREG:
7662 action = KAUTH_VNODE_ADD_FILE;
7663 break;
7664 case VDIR:
7665 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
7666 fvp->v_mountedhere) {
7667 return EINVAL;
7668 }
7669 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
7670 break;
7671 default:
7672 return EINVAL;
7673 }
7674
7675 AUDIT_ARG(fd2, dst_dirfd);
7676 AUDIT_ARG(value32, flags);
7677
7678 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7679 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
7680 UIO_USERSPACE, dst, ctx);
7681 if ((error = nameiat(&tond, dst_dirfd))) {
7682 return error;
7683 }
7684 cnp = &tond.ni_cnd;
7685 tdvp = tond.ni_dvp;
7686 tvp = tond.ni_vp;
7687
7688 free_src_acl = FALSE;
7689 attr_cleanup = FALSE;
7690
7691 if (tvp != NULL) {
7692 error = EEXIST;
7693 goto out;
7694 }
7695
7696 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
7697 error = EXDEV;
7698 goto out;
7699 }
7700
7701 #if CONFIG_MACF
7702 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
7703 goto out;
7704 }
7705 #endif
7706 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
7707 goto out;
7708 }
7709
7710 action = KAUTH_VNODE_GENERIC_READ_BITS;
7711 if (data_read_authorised) {
7712 action &= ~KAUTH_VNODE_READ_DATA;
7713 }
7714 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
7715 goto out;
7716 }
7717
7718 /*
7719 * Certain attributes may need to be changed from the source; we ask for
7720 * those here.
7721 */
7722 VATTR_INIT(&va);
7723 VATTR_WANTED(&va, va_uid);
7724 VATTR_WANTED(&va, va_gid);
7725 VATTR_WANTED(&va, va_mode);
7726 VATTR_WANTED(&va, va_flags);
7727 VATTR_WANTED(&va, va_acl);
7728
7729 if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
7730 goto out;
7731 }
7732
7733 VATTR_INIT(&nva);
7734 VATTR_SET(&nva, va_type, v_type);
7735 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
7736 VATTR_SET(&nva, va_acl, va.va_acl);
7737 free_src_acl = TRUE;
7738 }
7739
7740 /* Handle ACL inheritance, initialize vap. */
7741 if (v_type == VLNK) {
7742 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
7743 } else {
7744 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
7745 if (error) {
7746 goto out;
7747 }
7748 attr_cleanup = TRUE;
7749 }
7750
7751 vnop_flags = VNODE_CLONEFILE_DEFAULT;
7752 /*
7753 * We've got initial values for all security parameters.
7754 * If we are superuser, then we can change owners to be the
7755 * same as the source. Both superuser and the owner have default
7756 * WRITE_SECURITY privileges so all other fields can be taken
7757 * from source as well.
7758 */
7759 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
7760 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
7761 VATTR_SET(&nva, va_uid, va.va_uid);
7762 }
7763 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
7764 VATTR_SET(&nva, va_gid, va.va_gid);
7765 }
7766 } else {
7767 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
7768 }
7769
7770 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
7771 VATTR_SET(&nva, va_mode, va.va_mode);
7772 }
7773 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
7774 VATTR_SET(&nva, va_flags,
7775 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
7776 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
7777 }
7778
7779 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
7780
7781 if (!error && tvp) {
7782 int update_flags = 0;
7783 #if CONFIG_FSE
7784 int fsevent;
7785 #endif /* CONFIG_FSE */
7786
7787 #if CONFIG_MACF
7788 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
7789 VNODE_LABEL_CREATE, ctx);
7790 #endif
7791 /*
7792 * If some of the requested attributes weren't handled by the
7793 * VNOP, use our fallback code.
7794 */
7795 if (!VATTR_ALL_SUPPORTED(&va)) {
7796 (void)vnode_setattr_fallback(tvp, &nva, ctx);
7797 }
7798
7799 // Make sure the name & parent pointers are hooked up
7800 if (tvp->v_name == NULL) {
7801 update_flags |= VNODE_UPDATE_NAME;
7802 }
7803 if (tvp->v_parent == NULLVP) {
7804 update_flags |= VNODE_UPDATE_PARENT;
7805 }
7806
7807 if (update_flags) {
7808 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
7809 cnp->cn_namelen, cnp->cn_hash, update_flags);
7810 }
7811
7812 #if CONFIG_FSE
7813 switch (vnode_vtype(tvp)) {
7814 case VLNK:
7815 /* FALLTHRU */
7816 case VREG:
7817 fsevent = FSE_CREATE_FILE;
7818 break;
7819 case VDIR:
7820 fsevent = FSE_CREATE_DIR;
7821 break;
7822 default:
7823 goto out;
7824 }
7825
7826 if (need_fsevent(fsevent, tvp)) {
7827 /*
7828 * The following is a sequence of three explicit events.
7829 * A pair of FSE_CLONE events representing the source and destination
7830 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
7831 * fseventsd may coalesce the destination clone and create events
7832 * into a single event resulting in the following sequence for a client
7833 * FSE_CLONE (src)
7834 * FSE_CLONE | FSE_CREATE (dst)
7835 */
7836 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
7837 FSE_ARG_DONE);
7838 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
7839 FSE_ARG_DONE);
7840 }
7841 #endif /* CONFIG_FSE */
7842 }
7843
7844 out:
7845 if (attr_cleanup) {
7846 vn_attribute_cleanup(&nva, defaulted);
7847 }
7848 if (free_src_acl && va.va_acl) {
7849 kauth_acl_free(va.va_acl);
7850 }
7851 nameidone(&tond);
7852 if (tvp) {
7853 vnode_put(tvp);
7854 }
7855 vnode_put(tdvp);
7856 return error;
7857 }
7858
7859 /*
7860 * clone files or directories, target must not exist.
7861 */
7862 /* ARGSUSED */
7863 int
7864 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7865 __unused int32_t *retval)
7866 {
7867 vnode_t fvp;
7868 struct nameidata fromnd;
7869 int follow;
7870 int error;
7871 vfs_context_t ctx = vfs_context_current();
7872
7873 /* Check that the flags are valid. */
7874 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7875 return EINVAL;
7876 }
7877
7878 AUDIT_ARG(fd, uap->src_dirfd);
7879
7880 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7881 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7882 UIO_USERSPACE, uap->src, ctx);
7883 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
7884 return error;
7885 }
7886
7887 fvp = fromnd.ni_vp;
7888 nameidone(&fromnd);
7889
7890 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7891 uap->flags, ctx);
7892
7893 vnode_put(fvp);
7894 return error;
7895 }
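/*
 * Usage sketch (userspace, illustrative; the paths are hypothetical, and the
 * wrapper is assumed to be declared in <sys/clonefile.h>): both paths must
 * be on the same mount (EXDEV otherwise) and the destination must not
 * already exist (EEXIST), per clonefile_internal() above:
 *
 *	#include <sys/clonefile.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (clonefileat(AT_FDCWD, "/tmp/src.dat",
 *		    AT_FDCWD, "/tmp/dst.dat", CLONE_NOFOLLOW) == -1) {
 *			perror("clonefileat");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */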
7896
7897 int
7898 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7899 __unused int32_t *retval)
7900 {
7901 vnode_t fvp;
7902 struct fileproc *fp;
7903 int error;
7904 vfs_context_t ctx = vfs_context_current();
7905
7906 /* Check that the flags are valid. */
7907 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7908 return EINVAL;
7909 }
7910
7911 AUDIT_ARG(fd, uap->src_fd);
7912 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7913 if (error) {
7914 return error;
7915 }
7916
7917 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7918 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7919 error = EBADF;
7920 goto out;
7921 }
7922
7923 if ((error = vnode_getwithref(fvp))) {
7924 goto out;
7925 }
7926
7927 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7928
7929 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7930 uap->flags, ctx);
7931
7932 vnode_put(fvp);
7933 out:
7934 file_drop(uap->src_fd);
7935 return error;
7936 }
7937
7938 static int
7939 rename_submounts_callback(mount_t mp, void *arg)
7940 {
7941 int error = 0;
7942 mount_t pmp = (mount_t)arg;
7943 int prefix_len = strlen(pmp->mnt_vfsstat.f_mntonname);
7944
7945 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
7946 return 0;
7947 }
7948
7949 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
7950 return 0;
7951 }
7952
7953 if ((error = vfs_busy(mp, LK_NOWAIT))) {
7954 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
7955 return -1;
7956 }
7957
7958 int pathlen = MAXPATHLEN;
7959 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
7960 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
7961 }
7962
7963 vfs_unbusy(mp);
7964
7965 return error;
7966 }
7967
7968 /*
7969 * Rename files. Source and destination must either both be directories,
7970 * or both not be directories. If target is a directory, it must be empty.
7971 */
7972 /* ARGSUSED */
7973 static int
7974 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7975 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7976 {
7977 if (flags & ~VFS_RENAME_FLAGS_MASK) {
7978 return EINVAL;
7979 }
7980
7981 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
7982 return EINVAL;
7983 }
7984
7985 vnode_t tvp, tdvp;
7986 vnode_t fvp, fdvp;
7987 struct nameidata *fromnd, *tond;
7988 int error;
7989 int do_retry;
7990 int retry_count;
7991 int mntrename;
7992 int need_event;
7993 int need_kpath2;
7994 int has_listeners;
7995 const char *oname = NULL;
7996 char *from_name = NULL, *to_name = NULL;
7997 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
7998 int from_len = 0, to_len = 0;
7999 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8000 int holding_mntlock;
8001 mount_t locked_mp = NULL;
8002 vnode_t oparent = NULLVP;
8003 #if CONFIG_FSE
8004 fse_info from_finfo, to_finfo;
8005 #endif
8006 int from_truncated = 0, to_truncated = 0;
8007 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8008 int batched = 0;
8009 struct vnode_attr *fvap, *tvap;
8010 int continuing = 0;
8011 /* carving out a chunk for structs that are too big to be on stack. */
8012 struct {
8013 struct nameidata from_node, to_node;
8014 struct vnode_attr fv_attr, tv_attr;
8015 } * __rename_data;
8016 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
8017 fromnd = &__rename_data->from_node;
8018 tond = &__rename_data->to_node;
8019
8020 holding_mntlock = 0;
8021 do_retry = 0;
8022 retry_count = 0;
8023 retry:
8024 fvp = tvp = NULL;
8025 fdvp = tdvp = NULL;
8026 fvap = tvap = NULL;
8027 mntrename = FALSE;
8028
8029 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8030 segflg, from, ctx);
8031 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8032
8033 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8034 segflg, to, ctx);
8035 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8036
8037 continue_lookup:
8038 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8039 if ((error = nameiat(fromnd, fromfd))) {
8040 goto out1;
8041 }
8042 fdvp = fromnd->ni_dvp;
8043 fvp = fromnd->ni_vp;
8044
8045 if (fvp && fvp->v_type == VDIR) {
8046 tond->ni_cnd.cn_flags |= WILLBEDIR;
8047 }
8048 }
8049
8050 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8051 if ((error = nameiat(tond, tofd))) {
8052 /*
8053 * Translate error code for rename("dir1", "dir2/.").
8054 */
8055 if (error == EISDIR && fvp->v_type == VDIR) {
8056 error = EINVAL;
8057 }
8058 goto out1;
8059 }
8060 tdvp = tond->ni_dvp;
8061 tvp = tond->ni_vp;
8062 }
8063
8064 #if DEVELOPMENT || DEBUG
8065 /*
8066 * XXX VSWAP: Check for entitlements or special flag here
8067 * so we can restrict access appropriately.
8068 */
8069 #else /* DEVELOPMENT || DEBUG */
8070
8071 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8072 error = EPERM;
8073 goto out1;
8074 }
8075
8076 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8077 error = EPERM;
8078 goto out1;
8079 }
8080 #endif /* DEVELOPMENT || DEBUG */
8081
8082 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8083 error = ENOENT;
8084 goto out1;
8085 }
8086
8087 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8088 error = EEXIST;
8089 goto out1;
8090 }
8091
8092 batched = vnode_compound_rename_available(fdvp);
8093
8094 #if CONFIG_FSE
8095 need_event = need_fsevent(FSE_RENAME, fdvp);
8096 if (need_event) {
8097 if (fvp) {
8098 get_fse_info(fvp, &from_finfo, ctx);
8099 } else {
8100 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8101 if (error) {
8102 goto out1;
8103 }
8104
8105 fvap = &__rename_data->fv_attr;
8106 }
8107
8108 if (tvp) {
8109 get_fse_info(tvp, &to_finfo, ctx);
8110 } else if (batched) {
8111 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8112 if (error) {
8113 goto out1;
8114 }
8115
8116 tvap = &__rename_data->tv_attr;
8117 }
8118 }
8119 #else
8120 need_event = 0;
8121 #endif /* CONFIG_FSE */
8122
8123 has_listeners = kauth_authorize_fileop_has_listeners();
8124
8125 need_kpath2 = 0;
8126 #if CONFIG_AUDIT
8127 if (AUDIT_RECORD_EXISTS()) {
8128 need_kpath2 = 1;
8129 }
8130 #endif
8131
8132 if (need_event || has_listeners) {
8133 if (from_name == NULL) {
8134 GET_PATH(from_name);
8135 if (from_name == NULL) {
8136 error = ENOMEM;
8137 goto out1;
8138 }
8139 }
8140
8141 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8142
8143 if (from_name_no_firmlink == NULL) {
8144 GET_PATH(from_name_no_firmlink);
8145 if (from_name_no_firmlink == NULL) {
8146 error = ENOMEM;
8147 goto out1;
8148 }
8149 }
8150
8151 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8152 }
8153
8154 if (need_event || need_kpath2 || has_listeners) {
8155 if (to_name == NULL) {
8156 GET_PATH(to_name);
8157 if (to_name == NULL) {
8158 error = ENOMEM;
8159 goto out1;
8160 }
8161 }
8162
8163 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8164
8165 if (to_name_no_firmlink == NULL) {
8166 GET_PATH(to_name_no_firmlink);
8167 if (to_name_no_firmlink == NULL) {
8168 error = ENOMEM;
8169 goto out1;
8170 }
8171 }
8172
8173 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8174 if (to_name && need_kpath2) {
8175 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8176 }
8177 }
8178 if (!fvp) {
8179 /*
8180 * Claim: this check will never reject a valid rename.
8181 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8182 * Suppose fdvp and tdvp are not on the same mount.
8183 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8184 * then you can't move it to within another dir on the same mountpoint.
8185 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8186 *
8187 * If this check passes, then we are safe to pass these vnodes to the same FS.
8188 */
8189 if (fdvp->v_mount != tdvp->v_mount) {
8190 error = EXDEV;
8191 goto out1;
8192 }
8193 goto skipped_lookup;
8194 }
8195
8196 if (!batched) {
8197 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
8198 if (error) {
8199 if (error == ENOENT) {
8200 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8201 /*
8202 * We encountered a race where after doing the namei, tvp stops
8203 * being valid. If so, simply re-drive the rename call from the
8204 * top.
8205 */
8206 do_retry = 1;
8207 retry_count += 1;
8208 }
8209 }
8210 goto out1;
8211 }
8212 }
8213
8214 /*
8215 * If the source and destination are the same (i.e. they're
8216 * links to the same vnode) and the target file system is
8217 * case sensitive, then there is nothing to do.
8218 *
8219 * XXX Come back to this.
8220 */
8221 if (fvp == tvp) {
8222 int pathconf_val;
8223
8224 /*
8225 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8226 * then assume that this file system is case sensitive.
8227 */
8228 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8229 pathconf_val != 0) {
8230 goto out1;
8231 }
8232 }
8233
8234 /*
8235 * Allow the renaming of mount points.
8236 * - target must not exist
8237 * - target must reside in the same directory as source
8238 * - union mounts cannot be renamed
8239 * - "/" cannot be renamed
8240 *
8241 * XXX Handle this in VFS after a continued lookup (if we missed
8242 * in the cache to start off)
8243 *
8244 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8245 * we'll skip past here. The file system is responsible for
8246 * checking that @tvp is not a descendent of @fvp and vice versa
8247 * so it should always return EINVAL if either @tvp or @fvp is the
8248 * root of a volume.
8249 */
8250 if ((fvp->v_flag & VROOT) &&
8251 (fvp->v_type == VDIR) &&
8252 (tvp == NULL) &&
8253 (fvp->v_mountedhere == NULL) &&
8254 (fdvp == tdvp) &&
8255 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8256 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8257 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8258 vnode_t coveredvp;
8259
8260 /* switch fvp to the covered vnode */
8261 coveredvp = fvp->v_mount->mnt_vnodecovered;
8262 if ((vnode_getwithref(coveredvp))) {
8263 error = ENOENT;
8264 goto out1;
8265 }
8266 vnode_put(fvp);
8267
8268 fvp = coveredvp;
8269 mntrename = TRUE;
8270 }
8271 /*
8272 * Check for cross-device rename.
8273 */
8274 if ((fvp->v_mount != tdvp->v_mount) ||
8275 (tvp && (fvp->v_mount != tvp->v_mount))) {
8276 error = EXDEV;
8277 goto out1;
8278 }
8279
8280 /*
8281 * If source is the same as the destination (that is the
8282 * same inode number) then there is nothing to do...
8283 * EXCEPT if the underlying file system supports case
8284 * insensitivity and is case preserving. In this case
8285 * the file system needs to handle the special case of
8286 * getting the same vnode as target (fvp) and source (tvp).
8287 *
8288 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8289 * and _PC_CASE_PRESERVING can have this exception, and they need to
8290 * handle the special case of getting the same vnode as target and
8291 * source. NOTE: Then the target is unlocked going into vnop_rename,
8292 * so as not to cause locking problems. There is a single reference on tvp.
8293 *
8294 * NOTE - that fvp == tvp also occurs if they are hard linked and
8295 * that correct behaviour then is just to return success without doing
8296 * anything.
8297 *
8298 * XXX filesystem should take care of this itself, perhaps...
8299 */
8300 if (fvp == tvp && fdvp == tdvp) {
8301 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8302 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8303 fromnd->ni_cnd.cn_namelen)) {
8304 goto out1;
8305 }
8306 }
8307
8308 if (holding_mntlock && fvp->v_mount != locked_mp) {
8309 /*
8310 * we're holding a reference and lock
8311 * on locked_mp, but it no longer matches
8312 * what we want to do... so drop our hold
8313 */
8314 mount_unlock_renames(locked_mp);
8315 mount_drop(locked_mp, 0);
8316 holding_mntlock = 0;
8317 }
8318 if (tdvp != fdvp && fvp->v_type == VDIR) {
8319 /*
8320 * serialize renames that re-shape
8321 * the tree... if holding_mntlock is
8322 * set, then we're ready to go...
8323 * otherwise we
8324 * first need to drop the iocounts
8325 * we picked up, second take the
8326 * lock to serialize the access,
8327 * then finally start the lookup
8328 * process over with the lock held
8329 */
8330 if (!holding_mntlock) {
8331 /*
8332 * need to grab a reference on
8333 * the mount point before we
8334 * drop all the iocounts... once
8335 * the iocounts are gone, the mount
8336 * could follow
8337 */
8338 locked_mp = fvp->v_mount;
8339 mount_ref(locked_mp, 0);
8340
8341 /*
8342 * nameidone has to happen before we vnode_put(tvp)
8343 * since it may need to release the fs_nodelock on the tvp
8344 */
8345 nameidone(tond);
8346
8347 if (tvp) {
8348 vnode_put(tvp);
8349 }
8350 vnode_put(tdvp);
8351
8352 /*
8353 * nameidone has to happen before we vnode_put(fdvp)
8354 * since it may need to release the fs_nodelock on the fvp
8355 */
8356 nameidone(fromnd);
8357
8358 vnode_put(fvp);
8359 vnode_put(fdvp);
8360
8361 mount_lock_renames(locked_mp);
8362 holding_mntlock = 1;
8363
8364 goto retry;
8365 }
8366 } else {
8367 /*
8368 * when we dropped the iocounts to take
8369 * the lock, we allowed the identity of
8370 * the various vnodes to change... if they did,
8371 * we may no longer be dealing with a rename
8372 * that reshapes the tree... once we're holding
8373 * the iocounts, the vnodes can't change type
8374 * so we're free to drop the lock at this point
8375 * and continue on
8376 */
8377 if (holding_mntlock) {
8378 mount_unlock_renames(locked_mp);
8379 mount_drop(locked_mp, 0);
8380 holding_mntlock = 0;
8381 }
8382 }
8383
8384 // save these off so we can later verify that fvp is the same
8385 oname = fvp->v_name;
8386 oparent = fvp->v_parent;
8387
8388 skipped_lookup:
8389 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8390 tdvp, &tvp, &tond->ni_cnd, tvap,
8391 flags, ctx);
8392
8393 if (holding_mntlock) {
8394 /*
8395 * we can drop our serialization
8396 * lock now
8397 */
8398 mount_unlock_renames(locked_mp);
8399 mount_drop(locked_mp, 0);
8400 holding_mntlock = 0;
8401 }
8402 if (error) {
8403 if (error == EDATALESS) {
8404 /*
8405 * If we've been here before, something has gone
8406 * horribly wrong and we should just get out lest
8407 * we spiral around the drain forever.
8408 */
8409 if (flags & VFS_RENAME_DATALESS) {
8410 error = EIO;
8411 goto out1;
8412 }
8413
8414 /*
8415 * The object we're renaming is dataless (or has a
8416 * dataless descendant) and requires materialization
8417 * before the rename occurs. But we're holding the
8418 * mount point's rename lock, so it's not safe to
8419 * make the upcall.
8420 *
8421 * In this case, we release the lock, perform the
8422 * materialization, and start the whole thing over.
8423 */
8424 error = vnode_materialize_dataless_file(fvp,
8425 NAMESPACE_HANDLER_RENAME_OP);
8426
8427 if (error == 0) {
8428 /*
8429 * The next time around we need to tell the
8430 * file system that the materialization has
8431 * been performed.
8432 */
8433 flags |= VFS_RENAME_DATALESS;
8434 do_retry = 1;
8435 }
8436 goto out1;
8437 }
8438 if (error == EKEEPLOOKING) {
8439 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8440 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8441 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8442 }
8443 }
8444
8445 fromnd->ni_vp = fvp;
8446 tond->ni_vp = tvp;
8447
8448 goto continue_lookup;
8449 }
8450
8451 /*
8452 * We may encounter a race in the VNOP where the destination didn't
8453 * exist when we did the namei, but it does by the time we go and
8454 * try to create the entry. In this case, we should re-drive this rename
8455 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8456 * but other filesystems susceptible to this race could return it, too.
8457 */
8458 if (error == ERECYCLE) {
8459 do_retry = 1;
8460 }
8461
8462 /*
8463 * For compound VNOPs, the authorization callback may return
8464 * ENOENT in case of racing hardlink lookups hitting the name
8465 * cache; redrive the lookup.
8466 */
8467 if (batched && error == ENOENT) {
8468 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8469 do_retry = 1;
8470 retry_count += 1;
8471 }
8472 }
8473
8474 goto out1;
8475 }
8476
8477 /* call out to allow 3rd party notification of rename.
8478 * Ignore result of kauth_authorize_fileop call.
8479 */
8480 kauth_authorize_fileop(vfs_context_ucred(ctx),
8481 KAUTH_FILEOP_RENAME,
8482 (uintptr_t)from_name, (uintptr_t)to_name);
8483 if (flags & VFS_RENAME_SWAP) {
8484 kauth_authorize_fileop(vfs_context_ucred(ctx),
8485 KAUTH_FILEOP_RENAME,
8486 (uintptr_t)to_name, (uintptr_t)from_name);
8487 }
8488
8489 #if CONFIG_FSE
8490 if (from_name != NULL && to_name != NULL) {
8491 if (from_truncated || to_truncated) {
8492 // set it here since only the from_finfo gets reported up to user space
8493 from_finfo.mode |= FSE_TRUNCATED_PATH;
8494 }
8495
8496 if (tvap && tvp) {
8497 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8498 }
8499 if (fvap) {
8500 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8501 }
8502
8503 if (tvp) {
8504 add_fsevent(FSE_RENAME, ctx,
8505 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8506 FSE_ARG_FINFO, &from_finfo,
8507 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8508 FSE_ARG_FINFO, &to_finfo,
8509 FSE_ARG_DONE);
8510 if (flags & VFS_RENAME_SWAP) {
8511 /*
8512 * Strictly speaking, swap is the equivalent of
8513 * *three* renames. FSEvents clients should only take
8514 * the events as a hint, so we only bother reporting
8515 * two.
8516 */
8517 add_fsevent(FSE_RENAME, ctx,
8518 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8519 FSE_ARG_FINFO, &to_finfo,
8520 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8521 FSE_ARG_FINFO, &from_finfo,
8522 FSE_ARG_DONE);
8523 }
8524 } else {
8525 add_fsevent(FSE_RENAME, ctx,
8526 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8527 FSE_ARG_FINFO, &from_finfo,
8528 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8529 FSE_ARG_DONE);
8530 }
8531 }
8532 #endif /* CONFIG_FSE */
8533
8534 /*
8535 * update filesystem's mount point data
8536 */
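/*
 * Illustrative example (not from the original source): if the covered
 * mount's f_mntonname was recorded as "/Volumes/Old" and the rename
 * target path is "/Volumes/New", the code below replaces the last
 * component of f_mntonname with the last component of the target,
 * yielding "/Volumes/New".  Sub-mount paths are adjusted first via
 * rename_submounts_callback().
 */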
8537 if (mntrename) {
8538 char *cp, *pathend, *mpname;
8539 char * tobuf;
8540 struct mount *mp;
8541 int maxlen;
8542 size_t len = 0;
8543
8544 mp = fvp->v_mountedhere;
8545
8546 if (vfs_busy(mp, LK_NOWAIT)) {
8547 error = EBUSY;
8548 goto out1;
8549 }
8550 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
8551
8552 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8553 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8554 } else {
8555 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8556 }
8557 if (!error) {
8558 /* find current mount point prefix */
8559 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8560 for (cp = pathend; *cp != '\0'; ++cp) {
8561 if (*cp == '/') {
8562 pathend = cp + 1;
8563 }
8564 }
8565 /* find last component of target name */
8566 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8567 if (*cp == '/') {
8568 mpname = cp + 1;
8569 }
8570 }
8571
8572 /* Update f_mntonname of sub mounts */
8573 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8574
8575 /* append name to prefix */
8576 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
8577 bzero(pathend, maxlen);
8578
8579 strlcpy(pathend, mpname, maxlen);
8580 }
8581 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
8582
8583 vfs_unbusy(mp);
8584
8585 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8586 }
8587 /*
8588 * fix up name & parent pointers. note that we first
8589 * check that fvp has the same name/parent pointers it
8590 * had before the rename call... this is a 'weak' check
8591 * at best...
8592 *
8593 * XXX oparent and oname may not be set in the compound vnop case
8594 */
8595 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8596 int update_flags;
8597
8598 update_flags = VNODE_UPDATE_NAME;
8599
8600 if (fdvp != tdvp) {
8601 update_flags |= VNODE_UPDATE_PARENT;
8602 }
8603
8604 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8605 }
8606 out1:
8607 if (to_name != NULL) {
8608 RELEASE_PATH(to_name);
8609 to_name = NULL;
8610 }
8611 if (to_name_no_firmlink != NULL) {
8612 RELEASE_PATH(to_name_no_firmlink);
8613 to_name_no_firmlink = NULL;
8614 }
8615 if (from_name != NULL) {
8616 RELEASE_PATH(from_name);
8617 from_name = NULL;
8618 }
8619 if (from_name_no_firmlink != NULL) {
8620 RELEASE_PATH(from_name_no_firmlink);
8621 from_name_no_firmlink = NULL;
8622 }
8623 if (holding_mntlock) {
8624 mount_unlock_renames(locked_mp);
8625 mount_drop(locked_mp, 0);
8626 holding_mntlock = 0;
8627 }
8628 if (tdvp) {
8629 /*
8630 * nameidone has to happen before we vnode_put(tdvp)
8631 * since it may need to release the fs_nodelock on the tdvp
8632 */
8633 nameidone(tond);
8634
8635 if (tvp) {
8636 vnode_put(tvp);
8637 }
8638 vnode_put(tdvp);
8639 }
8640 if (fdvp) {
8641 /*
8642 * nameidone has to happen before we vnode_put(fdvp)
8643 * since it may need to release the fs_nodelock on the fdvp
8644 */
8645 nameidone(fromnd);
8646
8647 if (fvp) {
8648 vnode_put(fvp);
8649 }
8650 vnode_put(fdvp);
8651 }
8652
8653 /*
8654 * If things changed after we did the namei, then we will re-drive
8655 * this rename call from the top.
8656 */
8657 if (do_retry) {
8658 do_retry = 0;
8659 goto retry;
8660 }
8661
8662 FREE(__rename_data, M_TEMP);
8663 return error;
8664 }
8665
8666 int
8667 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
8668 {
8669 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
8670 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
8671 }
8672
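/*
 * renameatx_np() is the extended variant that forwards per-call flags to
 * renameat_internal() (e.g. the swap behaviour surfaced above as
 * VFS_RENAME_SWAP).  A minimal userspace sketch, assuming the RENAME_SWAP
 * flag and declarations documented for renamex_np(2) (illustrative only,
 * not part of this file):
 *
 *     #include <stdio.h>      // renameatx_np(), RENAME_SWAP
 *     #include <fcntl.h>      // AT_FDCWD
 *
 *     // Atomically exchange two directory entries.
 *     if (renameatx_np(AT_FDCWD, "a.txt", AT_FDCWD, "b.txt", RENAME_SWAP) != 0) {
 *         perror("renameatx_np");
 *     }
 */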
8673 int
8674 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
8675 {
8676 return renameat_internal(
8677 vfs_context_current(),
8678 uap->fromfd, uap->from,
8679 uap->tofd, uap->to,
8680 UIO_USERSPACE, uap->flags);
8681 }
8682
8683 int
8684 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
8685 {
8686 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
8687 uap->tofd, uap->to, UIO_USERSPACE, 0);
8688 }
8689
8690 /*
8691 * Make a directory file.
8692 *
8693 * Returns: 0 Success
8694 * EEXIST
8695 * namei:???
8696 * vnode_authorize:???
8697 * vn_create:???
8698 */
8699 /* ARGSUSED */
8700 static int
8701 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
8702 enum uio_seg segflg)
8703 {
8704 vnode_t vp, dvp;
8705 int error;
8706 int update_flags = 0;
8707 int batched;
8708 struct nameidata nd;
8709
8710 AUDIT_ARG(mode, vap->va_mode);
8711 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
8712 path, ctx);
8713 nd.ni_cnd.cn_flags |= WILLBEDIR;
8714 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
8715
8716 continue_lookup:
8717 error = nameiat(&nd, fd);
8718 if (error) {
8719 return error;
8720 }
8721 dvp = nd.ni_dvp;
8722 vp = nd.ni_vp;
8723
8724 if (vp != NULL) {
8725 error = EEXIST;
8726 goto out;
8727 }
8728
8729 batched = vnode_compound_mkdir_available(dvp);
8730
8731 VATTR_SET(vap, va_type, VDIR);
8732
8733 /*
8734 * XXX
8735 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
8736 * only get EEXIST or EISDIR for existing path components, and not that it could see
8737 * EACCES/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
8738 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
8739 */
8740 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
8741 if (error == EACCES || error == EPERM) {
8742 int error2;
8743
8744 nameidone(&nd);
8745 vnode_put(dvp);
8746 dvp = NULLVP;
8747
8748 /*
8749 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
8750 * rather than EACCES if the target exists.
8751 */
8752 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
8753 path, ctx);
8754 error2 = nameiat(&nd, fd);
8755 if (error2) {
8756 goto out;
8757 } else {
8758 vp = nd.ni_vp;
8759 error = EEXIST;
8760 goto out;
8761 }
8762 }
8763
8764 goto out;
8765 }
8766
8767 /*
8768 * make the directory
8769 */
8770 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
8771 if (error == EKEEPLOOKING) {
8772 nd.ni_vp = vp;
8773 goto continue_lookup;
8774 }
8775
8776 goto out;
8777 }
8778
8779 // Make sure the name & parent pointers are hooked up
8780 if (vp->v_name == NULL) {
8781 update_flags |= VNODE_UPDATE_NAME;
8782 }
8783 if (vp->v_parent == NULLVP) {
8784 update_flags |= VNODE_UPDATE_PARENT;
8785 }
8786
8787 if (update_flags) {
8788 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
8789 }
8790
8791 #if CONFIG_FSE
8792 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
8793 #endif
8794
8795 out:
8796 /*
8797 * nameidone has to happen before we vnode_put(dvp)
8798 * since it may need to release the fs_nodelock on the dvp
8799 */
8800 nameidone(&nd);
8801
8802 if (vp) {
8803 vnode_put(vp);
8804 }
8805 if (dvp) {
8806 vnode_put(dvp);
8807 }
8808
8809 return error;
8810 }
8811
8812 /*
8813 * mkdir_extended: Create a directory; with extended security (ACL).
8814 *
8815 * Parameters: p Process requesting to create the directory
8816 * uap User argument descriptor (see below)
8817 * retval (ignored)
8818 *
8819 * Indirect: uap->path Path of directory to create
8820 * uap->mode Access permissions to set
8821 * uap->xsecurity ACL to set
8822 *
8823 * Returns: 0 Success
8824 * !0 Not success
8825 *
8826 */
8827 int
8828 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8829 {
8830 int ciferror;
8831 kauth_filesec_t xsecdst;
8832 struct vnode_attr va;
8833
8834 AUDIT_ARG(owner, uap->uid, uap->gid);
8835
8836 xsecdst = NULL;
8837 if ((uap->xsecurity != USER_ADDR_NULL) &&
8838 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
8839 return ciferror;
8840 }
8841
8842 VATTR_INIT(&va);
8843 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8844 if (xsecdst != NULL) {
8845 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8846 }
8847
8848 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8849 UIO_USERSPACE);
8850 if (xsecdst != NULL) {
8851 kauth_filesec_free(xsecdst);
8852 }
8853 return ciferror;
8854 }
8855
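/*
 * The mkdir() and mkdirat() wrappers below (and mkdir_extended() above)
 * compute the effective directory mode as (mode & ACCESSPERMS) & ~fd_cmask.
 * Worked example (illustrative): a request of 0777 in a process whose
 * umask (fd_cmask) is 022 yields 0777 & ~022 = 0755, which is what gets
 * recorded in va_mode.
 */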
8856 int
8857 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8858 {
8859 struct vnode_attr va;
8860
8861 VATTR_INIT(&va);
8862 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8863
8864 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8865 UIO_USERSPACE);
8866 }
8867
8868 int
8869 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8870 {
8871 struct vnode_attr va;
8872
8873 VATTR_INIT(&va);
8874 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8875
8876 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8877 UIO_USERSPACE);
8878 }
8879
8880 static int
8881 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
8882 enum uio_seg segflg, int unlink_flags)
8883 {
8884 vnode_t vp, dvp;
8885 int error;
8886 struct nameidata nd;
8887 char *path = NULL;
8888 char *no_firmlink_path = NULL;
8889 int len_path = 0;
8890 int len_no_firmlink_path = 0;
8891 int has_listeners = 0;
8892 int need_event = 0;
8893 int truncated_path = 0;
8894 int truncated_no_firmlink_path = 0;
8895 #if CONFIG_FSE
8896 struct vnode_attr va;
8897 #endif /* CONFIG_FSE */
8898 struct vnode_attr *vap = NULL;
8899 int restart_count = 0;
8900 int batched;
8901
8902 int restart_flag;
8903
8904 /*
8905 * This loop exists to restart rmdir in the unlikely case that two
8906 * processes are simultaneously trying to remove the same directory
8907 * containing orphaned AppleDouble files.
8908 */
8909 do {
8910 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
8911 segflg, dirpath, ctx);
8912 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
8913 continue_lookup:
8914 restart_flag = 0;
8915 vap = NULL;
8916
8917 error = nameiat(&nd, fd);
8918 if (error) {
8919 return error;
8920 }
8921
8922 dvp = nd.ni_dvp;
8923 vp = nd.ni_vp;
8924
8925 if (vp) {
8926 batched = vnode_compound_rmdir_available(vp);
8927
8928 if (vp->v_flag & VROOT) {
8929 /*
8930 * The root of a mounted filesystem cannot be deleted.
8931 */
8932 error = EBUSY;
8933 goto out;
8934 }
8935
8936 #if DEVELOPMENT || DEBUG
8937 /*
8938 * XXX VSWAP: Check for entitlements or special flag here
8939 * so we can restrict access appropriately.
8940 */
8941 #else /* DEVELOPMENT || DEBUG */
8942
8943 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
8944 error = EPERM;
8945 goto out;
8946 }
8947 #endif /* DEVELOPMENT || DEBUG */
8948
8949 /*
8950 * Removed a check here; we used to abort if vp's vid
8951 * was not the same as what we'd seen the last time around.
8952 * I do not think that check was valid, because if we retry
8953 * and all dirents are gone, the directory could legitimately
8954 * be recycled but still be present in a situation where we would
8955 * have had permission to delete. Therefore, we won't make
8956 * an effort to preserve that check now that we may not have a
8957 * vp here.
8958 */
8959
8960 if (!batched) {
8961 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
8962 if (error) {
8963 if (error == ENOENT) {
8964 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8965 restart_flag = 1;
8966 restart_count += 1;
8967 }
8968 }
8969 goto out;
8970 }
8971 }
8972 } else {
8973 batched = 1;
8974
8975 if (!vnode_compound_rmdir_available(dvp)) {
8976 panic("No error, but no compound rmdir?");
8977 }
8978 }
8979
8980 #if CONFIG_FSE
8981 fse_info finfo;
8982
8983 need_event = need_fsevent(FSE_DELETE, dvp);
8984 if (need_event) {
8985 if (!batched) {
8986 get_fse_info(vp, &finfo, ctx);
8987 } else {
8988 error = vfs_get_notify_attributes(&va);
8989 if (error) {
8990 goto out;
8991 }
8992
8993 vap = &va;
8994 }
8995 }
8996 #endif
8997 has_listeners = kauth_authorize_fileop_has_listeners();
8998 if (need_event || has_listeners) {
8999 if (path == NULL) {
9000 GET_PATH(path);
9001 if (path == NULL) {
9002 error = ENOMEM;
9003 goto out;
9004 }
9005 }
9006
9007 len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
9008
9009 if (no_firmlink_path == NULL) {
9010 GET_PATH(no_firmlink_path);
9011 if (no_firmlink_path == NULL) {
9012 error = ENOMEM;
9013 goto out;
9014 }
9015 }
9016
9017 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
9018 #if CONFIG_FSE
9019 if (truncated_no_firmlink_path) {
9020 finfo.mode |= FSE_TRUNCATED_PATH;
9021 }
9022 #endif
9023 }
9024
9025 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
9026 nd.ni_vp = vp;
9027 if (vp == NULLVP) {
9028 /* Couldn't find a vnode */
9029 goto out;
9030 }
9031
9032 if (error == EKEEPLOOKING) {
9033 goto continue_lookup;
9034 } else if (batched && error == ENOENT) {
9035 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9036 /*
9037 * For compound VNOPs, the authorization callback
9038 * may return ENOENT in case of racing hard link lookups;
9039 * redrive the lookup.
9040 */
9041 restart_flag = 1;
9042 restart_count += 1;
9043 goto out;
9044 }
9045 }
9046
9047 /*
9048 * XXX There's no provision for passing flags
9049 * to VNOP_RMDIR(). So, if vn_rmdir() fails
9050 * because it's not empty, then we try again
9051 * with VNOP_REMOVE(), passing in a special
9052 * flag that clever file systems will know
9053 * how to handle.
9054 */
9055 if (error == ENOTEMPTY &&
9056 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
9057 /*
9058 * If this fails, we want to keep the original
9059 * error.
9060 */
9061 if (vn_remove(dvp, &vp, &nd,
9062 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
9063 error = 0;
9064 }
9065 }
9066
9067 #if CONFIG_APPLEDOUBLE
9068 /*
9069 * Special case to remove orphaned AppleDouble
9070 * files. I don't like putting this in the kernel,
9071 * but carbon does not like putting this in carbon either,
9072 * so here we are.
9073 */
9074 if (error == ENOTEMPTY) {
9075 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
9076 if (ad_error == EBUSY) {
9077 error = ad_error;
9078 goto out;
9079 }
9080
9081
9082 /*
9083 * Assuming everything went well, we will try the RMDIR again
9084 */
9085 if (!ad_error) {
9086 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
9087 }
9088 }
9089 #endif /* CONFIG_APPLEDOUBLE */
9090 /*
9091 * Call out to allow 3rd party notification of delete.
9092 * Ignore result of kauth_authorize_fileop call.
9093 */
9094 if (!error) {
9095 if (has_listeners) {
9096 kauth_authorize_fileop(vfs_context_ucred(ctx),
9097 KAUTH_FILEOP_DELETE,
9098 (uintptr_t)vp,
9099 (uintptr_t)path);
9100 }
9101
9102 if (vp->v_flag & VISHARDLINK) {
9103 // see the comment in unlink1() about why we update
9104 // the parent of a hard link when it is removed
9105 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
9106 }
9107
9108 #if CONFIG_FSE
9109 if (need_event) {
9110 if (vap) {
9111 vnode_get_fse_info_from_vap(vp, &finfo, vap);
9112 }
9113 add_fsevent(FSE_DELETE, ctx,
9114 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
9115 FSE_ARG_FINFO, &finfo,
9116 FSE_ARG_DONE);
9117 }
9118 #endif
9119 }
9120
9121 out:
9122 if (path != NULL) {
9123 RELEASE_PATH(path);
9124 path = NULL;
9125 }
9126
9127 if (no_firmlink_path != NULL) {
9128 RELEASE_PATH(no_firmlink_path);
9129 no_firmlink_path = NULL;
9130 }
9131
9132 /*
9133 * nameidone has to happen before we vnode_put(dvp)
9134 * since it may need to release the fs_nodelock on the dvp
9135 */
9136 nameidone(&nd);
9137 vnode_put(dvp);
9138
9139 if (vp) {
9140 vnode_put(vp);
9141 }
9142
9143 if (restart_flag == 0) {
9144 wakeup_one((caddr_t)vp);
9145 return error;
9146 }
9147 tsleep(vp, PVFS, "rm AD", 1);
9148 } while (restart_flag != 0);
9149
9150 return error;
9151 }
9152
9153 /*
9154 * Remove a directory file.
9155 */
9156 /* ARGSUSED */
9157 int
9158 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9159 {
9160 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9161 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9162 }
9163
9164 /* Get direntry length padded to 8 byte alignment */
9165 #define DIRENT64_LEN(namlen) \
9166 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
9167
9168 /* Get dirent length padded to 4 byte alignment */
9169 #define DIRENT_LEN(namelen) \
9170 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
9171
9172 /* Get the end of this dirent */
9173 #define DIRENT_END(dep) \
9174 (((char *)(dep)) + (dep)->d_reclen - 1)
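/*
 * Worked example (illustrative): with the Darwin struct layouts these
 * macros reduce to "fixed header + namelen + 1 (for the NUL), rounded up
 * to the record alignment".  For a 3-character name, DIRENT64_LEN(3)
 * works out to 32 bytes and DIRENT_LEN(3) to 12 bytes -- the same
 * worst-case expansion figures used to size the conversion buffer in
 * vnode_readdir64() below.
 */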
9175
9176 errno_t
9177 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9178 int *numdirent, vfs_context_t ctxp)
9179 {
9180 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9181 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9182 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9183 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9184 } else {
9185 size_t bufsize;
9186 void * bufptr;
9187 uio_t auio;
9188 struct direntry *entry64;
9189 struct dirent *dep;
9190 int bytesread;
9191 int error;
9192
9193 /*
9194 * We're here because the underlying file system does not
9195 * support direntries, or we mounted denying support, so we must
9196 * fall back to dirents and convert them to direntries.
9197 *
9198 * Our kernel buffer needs to be smaller since re-packing will
9199 * expand each dirent. The worst case (when the name length
9200 * is 3 or less) corresponds to a struct direntry size of 32
9201 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9202 * (4-byte aligned). So having a buffer that is 3/8 the size
9203 * will prevent us from reading more than we can pack.
9204 *
9205 * Since this buffer is wired memory, we will limit the
9206 * buffer size to a maximum of 32K. We would really like to
9207 * use 32K in the MIN(), but we use magic number 87371 to
9208 * prevent uio_resid() * 3 / 8 from overflowing.
9209 */
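/*
 * Illustrative arithmetic: 12/32 reduces to 3/8, so a buffer sized at
 * 3/8 of the user's residual can always hold the repacked direntries.
 * The 87371 cap keeps the wired buffer just under 32K:
 * 87371 * 3 / 8 = 32764 bytes.
 */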
9210 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9211 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
9212 if (bufptr == NULL) {
9213 return ENOMEM;
9214 }
9215
9216 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9217 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9218 auio->uio_offset = uio->uio_offset;
9219
9220 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9221
9222 dep = (struct dirent *)bufptr;
9223 bytesread = bufsize - uio_resid(auio);
9224
9225 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
9226 M_TEMP, M_WAITOK);
9227 /*
9228 * Convert all the entries and copy them out to user's buffer.
9229 */
9230 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9231 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9232
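/*
 * Sanity-check each record before trusting d_reclen: reject an entry
 * whose declared end runs past the bytes actually read, or whose
 * d_reclen is too small to hold the name it claims to carry.
 */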
9233 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9234 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9235 printf("%s: %s: Bad dirent received from directory %s\n", __func__,
9236 vp->v_mount->mnt_vfsstat.f_mntonname,
9237 vp->v_name ? vp->v_name : "<unknown>");
9238 error = EIO;
9239 break;
9240 }
9241
9242 bzero(entry64, enbufsize);
9243 /* Convert a dirent to a dirent64. */
9244 entry64->d_ino = dep->d_ino;
9245 entry64->d_seekoff = 0;
9246 entry64->d_reclen = enbufsize;
9247 entry64->d_namlen = dep->d_namlen;
9248 entry64->d_type = dep->d_type;
9249 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9250
9251 /* Move to next entry. */
9252 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9253
9254 /* Copy entry64 to user's buffer. */
9255 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9256 }
9257
9258 /* Update the real offset using the offset we got from VNOP_READDIR. */
9259 if (error == 0) {
9260 uio->uio_offset = auio->uio_offset;
9261 }
9262 uio_free(auio);
9263 FREE(bufptr, M_TEMP);
9264 FREE(entry64, M_TEMP);
9265 return error;
9266 }
9267 }
9268
9269 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9270
9271 /*
9272 * Read a block of directory entries in a file system independent format.
9273 */
9274 static int
9275 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
9276 off_t *offset, int *eofflag, int flags)
9277 {
9278 vnode_t vp;
9279 struct vfs_context context = *vfs_context_current(); /* local copy */
9280 struct fileproc *fp;
9281 uio_t auio;
9282 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9283 off_t loff;
9284 int error, numdirent;
9285 char uio_buf[UIO_SIZEOF(1)];
9286
9287 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
9288 if (error) {
9289 return error;
9290 }
9291 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
9292 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9293 error = EBADF;
9294 goto out;
9295 }
9296
9297 if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
9298 bufsize = GETDIRENTRIES_MAXBUFSIZE;
9299 }
9300
9301 #if CONFIG_MACF
9302 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
9303 if (error) {
9304 goto out;
9305 }
9306 #endif
9307 if ((error = vnode_getwithref(vp))) {
9308 goto out;
9309 }
9310 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9311
9312 unionread:
9313 if (vp->v_type != VDIR) {
9314 (void)vnode_put(vp);
9315 error = EINVAL;
9316 goto out;
9317 }
9318
9319 #if CONFIG_MACF
9320 error = mac_vnode_check_readdir(&context, vp);
9321 if (error != 0) {
9322 (void)vnode_put(vp);
9323 goto out;
9324 }
9325 #endif /* MAC */
9326
9327 loff = fp->f_fglob->fg_offset;
9328 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9329 uio_addiov(auio, bufp, bufsize);
9330
9331 if (flags & VNODE_READDIR_EXTENDED) {
9332 error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
9333 fp->f_fglob->fg_offset = uio_offset(auio);
9334 } else {
9335 error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
9336 fp->f_fglob->fg_offset = uio_offset(auio);
9337 }
9338 if (error) {
9339 (void)vnode_put(vp);
9340 goto out;
9341 }
9342
9343 if ((user_ssize_t)bufsize == uio_resid(auio)) {
9344 if (union_dircheckp) {
9345 error = union_dircheckp(&vp, fp, &context);
9346 if (error == -1) {
9347 goto unionread;
9348 }
9349 if (error) {
9350 (void)vnode_put(vp);
9351 goto out;
9352 }
9353 }
9354
9355 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
9356 struct vnode *tvp = vp;
9357 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
9358 vnode_ref(vp);
9359 fp->f_fglob->fg_data = (caddr_t) vp;
9360 fp->f_fglob->fg_offset = 0;
9361 vnode_rele(tvp);
9362 vnode_put(tvp);
9363 goto unionread;
9364 }
9365 vp = tvp;
9366 }
9367 }
9368
9369 vnode_put(vp);
9370 if (offset) {
9371 *offset = loff;
9372 }
9373
9374 *bytesread = bufsize - uio_resid(auio);
9375 out:
9376 file_drop(fd);
9377 return error;
9378 }
9379
9380
9381 int
9382 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9383 {
9384 off_t offset;
9385 ssize_t bytesread;
9386 int error, eofflag;
9387
9388 AUDIT_ARG(fd, uap->fd);
9389 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9390 &bytesread, &offset, &eofflag, 0);
9391
9392 if (error == 0) {
9393 if (proc_is64bit(p)) {
9394 user64_long_t base = (user64_long_t)offset;
9395 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9396 } else {
9397 user32_long_t base = (user32_long_t)offset;
9398 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9399 }
9400 *retval = bytesread;
9401 }
9402 return error;
9403 }
9404
9405 int
9406 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9407 {
9408 off_t offset;
9409 ssize_t bytesread;
9410 int error, eofflag;
9411 user_size_t bufsize;
9412
9413 AUDIT_ARG(fd, uap->fd);
9414
9415 /*
9416 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9417 * then the kernel carves out the last 4 bytes to return extended
9418 * information to userspace (namely whether we reached EOF with this call).
9419 */
9420 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9421 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9422 } else {
9423 bufsize = uap->bufsize;
9424 }
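/*
 * Illustrative buffer layout when the caller opts in by passing at least
 * GETDIRENTRIES64_EXTENDED_BUFSIZE bytes (sketch, not from the original
 * source):
 *
 *     |<----------------- uap->bufsize ----------------->|
 *     | packed struct direntry records ...       | flags |
 *                                                  ^ final 4 bytes; receive
 *                                                    GETDIRENTRIES64_EOF when
 *                                                    the directory is exhausted
 */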
9425
9426 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9427 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9428
9429 if (error == 0) {
9430 *retval = bytesread;
9431 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9432
9433 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9434 getdirentries64_flags_t flags = 0;
9435 if (eofflag) {
9436 flags |= GETDIRENTRIES64_EOF;
9437 }
9438 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9439 sizeof(flags));
9440 }
9441 }
9442 return error;
9443 }
9444
9445
9446 /*
9447 * Set the mode mask for creation of filesystem nodes.
9448 * XXX implement xsecurity
9449 */
9450 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9451 static int
9452 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9453 {
9454 struct filedesc *fdp;
9455
9456 AUDIT_ARG(mask, newmask);
9457 proc_fdlock(p);
9458 fdp = p->p_fd;
9459 *retval = fdp->fd_cmask;
9460 fdp->fd_cmask = newmask & ALLPERMS;
9461 proc_fdunlock(p);
9462 return 0;
9463 }
9464
9465 /*
9466 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9467 *
9468 * Parameters: p Process requesting to set the umask
9469 * uap User argument descriptor (see below)
9470 * retval umask of the process (parameter p)
9471 *
9472 * Indirect: uap->newmask umask to set
9473 * uap->xsecurity ACL to set
9474 *
9475 * Returns: 0 Success
9476 * !0 Not success
9477 *
9478 */
9479 int
9480 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9481 {
9482 int ciferror;
9483 kauth_filesec_t xsecdst;
9484
9485 xsecdst = KAUTH_FILESEC_NONE;
9486 if (uap->xsecurity != USER_ADDR_NULL) {
9487 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
9488 return ciferror;
9489 }
9490 } else {
9491 xsecdst = KAUTH_FILESEC_NONE;
9492 }
9493
9494 ciferror = umask1(p, uap->newmask, xsecdst, retval);
9495
9496 if (xsecdst != KAUTH_FILESEC_NONE) {
9497 kauth_filesec_free(xsecdst);
9498 }
9499 return ciferror;
9500 }
9501
9502 int
9503 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9504 {
9505 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9506 }
9507
9508 /*
9509 * Void all references to file by ripping underlying filesystem
9510 * away from vnode.
9511 */
9512 /* ARGSUSED */
9513 int
9514 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
9515 {
9516 vnode_t vp;
9517 struct vnode_attr va;
9518 vfs_context_t ctx = vfs_context_current();
9519 int error;
9520 struct nameidata nd;
9521
9522 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
9523 uap->path, ctx);
9524 error = namei(&nd);
9525 if (error) {
9526 return error;
9527 }
9528 vp = nd.ni_vp;
9529
9530 nameidone(&nd);
9531
9532 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
9533 error = ENOTSUP;
9534 goto out;
9535 }
9536
9537 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
9538 error = EBUSY;
9539 goto out;
9540 }
9541
9542 #if CONFIG_MACF
9543 error = mac_vnode_check_revoke(ctx, vp);
9544 if (error) {
9545 goto out;
9546 }
9547 #endif
9548
9549 VATTR_INIT(&va);
9550 VATTR_WANTED(&va, va_uid);
9551 if ((error = vnode_getattr(vp, &va, ctx))) {
9552 goto out;
9553 }
9554 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
9555 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
9556 goto out;
9557 }
9558 if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
9559 VNOP_REVOKE(vp, REVOKEALL, ctx);
9560 }
9561 out:
9562 vnode_put(vp);
9563 return error;
9564 }
9565
9566
9567 /*
9568 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
9569 * The following system calls are designed to support features
9570 * which are specific to the HFS & HFS Plus volume formats
9571 */
9572
9573
9574 /*
9575 * Obtain attribute information on objects in a directory while enumerating
9576 * the directory.
9577 */
9578 /* ARGSUSED */
9579 int
9580 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
9581 {
9582 vnode_t vp;
9583 struct fileproc *fp;
9584 uio_t auio = NULL;
9585 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9586 uint32_t count = 0, savecount = 0;
9587 uint32_t newstate = 0;
9588 int error, eofflag;
9589 uint32_t loff = 0;
9590 struct attrlist attributelist;
9591 vfs_context_t ctx = vfs_context_current();
9592 int fd = uap->fd;
9593 char uio_buf[UIO_SIZEOF(1)];
9594 kauth_action_t action;
9595
9596 AUDIT_ARG(fd, fd);
9597
9598 /* Get the attributes into kernel space */
9599 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
9600 return error;
9601 }
9602 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
9603 return error;
9604 }
9605 savecount = count;
9606 if ((error = fp_getfvp(p, fd, &fp, &vp))) {
9607 return error;
9608 }
9609 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
9610 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9611 error = EBADF;
9612 goto out;
9613 }
9614
9615
9616 #if CONFIG_MACF
9617 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
9618 fp->f_fglob);
9619 if (error) {
9620 goto out;
9621 }
9622 #endif
9623
9624
9625 if ((error = vnode_getwithref(vp))) {
9626 goto out;
9627 }
9628
9629 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9630
9631 unionread:
9632 if (vp->v_type != VDIR) {
9633 (void)vnode_put(vp);
9634 error = EINVAL;
9635 goto out;
9636 }
9637
9638 #if CONFIG_MACF
9639 error = mac_vnode_check_readdir(ctx, vp);
9640 if (error != 0) {
9641 (void)vnode_put(vp);
9642 goto out;
9643 }
9644 #endif /* MAC */
9645
9646 /* set up the uio structure which will contain the users return buffer */
9647 loff = fp->f_fglob->fg_offset;
9648 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9649 uio_addiov(auio, uap->buffer, uap->buffersize);
9650
9651 /*
9652 * If the only item requested is file names, we can let that pass with
9653 * just LIST_DIRECTORY. If they want any other attributes, that means
9654 * they need SEARCH as well.
9655 */
9656 action = KAUTH_VNODE_LIST_DIRECTORY;
9657 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
9658 attributelist.fileattr || attributelist.dirattr) {
9659 action |= KAUTH_VNODE_SEARCH;
9660 }
9661
9662 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
9663 /* Believe it or not, uap->options only has 32-bits of valid
9664 * info, so truncate before extending again */
9665
9666 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
9667 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
9668 }
9669
9670 if (error) {
9671 (void) vnode_put(vp);
9672 goto out;
9673 }
9674
9675 /*
9676 * If we've got the last entry of a directory in a union mount
9677 * then reset the eofflag and pretend there's still more to come.
9678 * The next call will again set eofflag and the buffer will be empty,
9679 * so traverse to the underlying directory and do the directory
9680 * read there.
9681 */
9682 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
9683 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
9684 eofflag = 0;
9685 } else { // Empty buffer
9686 struct vnode *tvp = vp;
9687 if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
9688 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
9689 fp->f_fglob->fg_data = (caddr_t) vp;
9690 fp->f_fglob->fg_offset = 0; // reset index for new dir
9691 count = savecount;
9692 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
9693 vnode_put(tvp);
9694 goto unionread;
9695 }
9696 vp = tvp;
9697 }
9698 }
9699
9700 (void)vnode_put(vp);
9701
9702 if (error) {
9703 goto out;
9704 }
9705 fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
9706
9707 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
9708 goto out;
9709 }
9710 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
9711 goto out;
9712 }
9713 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
9714 goto out;
9715 }
9716
9717 *retval = eofflag; /* similar to getdirentries */
9718 error = 0;
9719 out:
9720 file_drop(fd);
9721 return error; /* error was set earlier; retval is 0 or 1 now */
9722 } /* end of getdirentriesattr system call */
9723
9724 /*
9725 * Exchange data between two files
9726 */
9727
9728 /* ARGSUSED */
9729 int
9730 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
9731 {
9732 struct nameidata fnd, snd;
9733 vfs_context_t ctx = vfs_context_current();
9734 vnode_t fvp;
9735 vnode_t svp;
9736 int error;
9737 u_int32_t nameiflags;
9738 char *fpath = NULL;
9739 char *spath = NULL;
9740 int flen = 0, slen = 0;
9741 int from_truncated = 0, to_truncated = 0;
9742 #if CONFIG_FSE
9743 fse_info f_finfo, s_finfo;
9744 #endif
9745
9746 nameiflags = 0;
9747 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
9748 nameiflags |= FOLLOW;
9749 }
9750
9751 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
9752 UIO_USERSPACE, uap->path1, ctx);
9753
9754 error = namei(&fnd);
9755 if (error) {
9756 goto out2;
9757 }
9758
9759 nameidone(&fnd);
9760 fvp = fnd.ni_vp;
9761
9762 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
9763 UIO_USERSPACE, uap->path2, ctx);
9764
9765 error = namei(&snd);
9766 if (error) {
9767 vnode_put(fvp);
9768 goto out2;
9769 }
9770 nameidone(&snd);
9771 svp = snd.ni_vp;
9772
9773 /*
9774 * if the files are the same, return EINVAL
9775 */
9776 if (svp == fvp) {
9777 error = EINVAL;
9778 goto out;
9779 }
9780
9781 /*
9782 * if the files are on different volumes, return an error
9783 */
9784 if (svp->v_mount != fvp->v_mount) {
9785 error = EXDEV;
9786 goto out;
9787 }
9788
9789 /* If they're not files, return an error */
9790 if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
9791 error = EINVAL;
9792 goto out;
9793 }
9794
9795 #if CONFIG_MACF
9796 error = mac_vnode_check_exchangedata(ctx,
9797 fvp, svp);
9798 if (error) {
9799 goto out;
9800 }
9801 #endif
9802 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
9803 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
9804 goto out;
9805 }
9806
9807 if (
9808 #if CONFIG_FSE
9809 need_fsevent(FSE_EXCHANGE, fvp) ||
9810 #endif
9811 kauth_authorize_fileop_has_listeners()) {
9812 GET_PATH(fpath);
9813 GET_PATH(spath);
9814 if (fpath == NULL || spath == NULL) {
9815 error = ENOMEM;
9816 goto out;
9817 }
9818
9819 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
9820 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
9821
9822 #if CONFIG_FSE
9823 get_fse_info(fvp, &f_finfo, ctx);
9824 get_fse_info(svp, &s_finfo, ctx);
9825 if (from_truncated || to_truncated) {
9826 // set it here since only the f_finfo gets reported up to user space
9827 f_finfo.mode |= FSE_TRUNCATED_PATH;
9828 }
9829 #endif
9830 }
9831 /* Ok, make the call */
9832 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
9833
9834 if (error == 0) {
9835 const char *tmpname;
9836
9837 if (fpath != NULL && spath != NULL) {
9838 /* call out to allow 3rd party notification of exchangedata.
9839 * Ignore result of kauth_authorize_fileop call.
9840 */
9841 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
9842 (uintptr_t)fpath, (uintptr_t)spath);
9843 }
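/*
 * The file contents have been exchanged on disk, so swap the cached
 * identities (v_name and, if they differ, v_parent) under the name
 * cache lock to keep the namecache consistent with the vnodes' new
 * contents.
 */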
9844 name_cache_lock();
9845
9846 tmpname = fvp->v_name;
9847 fvp->v_name = svp->v_name;
9848 svp->v_name = tmpname;
9849
9850 if (fvp->v_parent != svp->v_parent) {
9851 vnode_t tmp;
9852
9853 tmp = fvp->v_parent;
9854 fvp->v_parent = svp->v_parent;
9855 svp->v_parent = tmp;
9856 }
9857 name_cache_unlock();
9858
9859 #if CONFIG_FSE
9860 if (fpath != NULL && spath != NULL) {
9861 add_fsevent(FSE_EXCHANGE, ctx,
9862 FSE_ARG_STRING, flen, fpath,
9863 FSE_ARG_FINFO, &f_finfo,
9864 FSE_ARG_STRING, slen, spath,
9865 FSE_ARG_FINFO, &s_finfo,
9866 FSE_ARG_DONE);
9867 }
9868 #endif
9869 }
9870
9871 out:
9872 if (fpath != NULL) {
9873 RELEASE_PATH(fpath);
9874 }
9875 if (spath != NULL) {
9876 RELEASE_PATH(spath);
9877 }
9878 vnode_put(svp);
9879 vnode_put(fvp);
9880 out2:
9881 return error;
9882 }
9883
9884 /*
9885 * Return (in MB) the amount of freespace on the given vnode's volume.
9886 */
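/*
 * Worked example (illustrative): with f_bavail = 1,000,000 free blocks and
 * f_bsize = 4096 bytes, the product is 4,096,000,000 bytes; the right shift
 * by 20 divides by 1 MiB, yielding 3906 MB.
 */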
9887 uint32_t freespace_mb(vnode_t vp);
9888
9889 uint32_t
9890 freespace_mb(vnode_t vp)
9891 {
9892 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
9893 return ((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9894 vp->v_mount->mnt_vfsstat.f_bsize) >> 20;
9895 }
9896
9897 #if CONFIG_SEARCHFS
9898
9899 /* ARGSUSED */
9900
9901 int
9902 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
9903 {
9904 vnode_t vp, tvp;
9905 int i, error = 0;
9906 int fserror = 0;
9907 struct nameidata nd;
9908 struct user64_fssearchblock searchblock;
9909 struct searchstate *state;
9910 struct attrlist *returnattrs;
9911 struct timeval timelimit;
9912 void *searchparams1, *searchparams2;
9913 uio_t auio = NULL;
9914 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9915 uint32_t nummatches;
9916 int mallocsize;
9917 uint32_t nameiflags;
9918 vfs_context_t ctx = vfs_context_current();
9919 char uio_buf[UIO_SIZEOF(1)];
9920
9921 /* Start by copying in fsearchblock parameter list */
9922 if (IS_64BIT_PROCESS(p)) {
9923 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
9924 timelimit.tv_sec = searchblock.timelimit.tv_sec;
9925 timelimit.tv_usec = searchblock.timelimit.tv_usec;
9926 } else {
9927 struct user32_fssearchblock tmp_searchblock;
9928
9929 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
9930 // munge into 64-bit version
9931 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
9932 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
9933 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
9934 searchblock.maxmatches = tmp_searchblock.maxmatches;
9935 /*
9936 * These casts are safe. We will promote tv_sec from a 32 bit long to a 64 bit
9937 * long if necessary, and tv_usec is already a signed 32 bit int.
9938 */
9939 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
9940 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
9941 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
9942 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
9943 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
9944 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
9945 searchblock.searchattrs = tmp_searchblock.searchattrs;
9946 }
9947 if (error) {
9948 return error;
9949 }
9950
9951 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
9952 */
9953 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
9954 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
9955 return EINVAL;
9956 }
9957
9958 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
9959 /* It all has to go into local memory and it's not that big, so we might as well put it all together. */
9960 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
9961 /* block. */
9962 /* */
9963 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
9964 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
9965 /* assumes the size is still 556 bytes it will continue to work */
9966
9967 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
9968 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
9969
9970 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
9971
9972 /* Now set up the various pointers to the correct place in our newly allocated memory */
9973
9974 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
9975 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
9976 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
9977
9978 /* Now copy in the stuff given our local variables. */
9979
9980 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
9981 goto freeandexit;
9982 }
9983
9984 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
9985 goto freeandexit;
9986 }
9987
9988 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
9989 goto freeandexit;
9990 }
9991
9992 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
9993 goto freeandexit;
9994 }
9995
9996 /*
9997 * When searching a union mount, need to set the
9998 * start flag at the first call on each layer to
9999 * reset state for the new volume.
10000 */
10001 if (uap->options & SRCHFS_START) {
10002 state->ss_union_layer = 0;
10003 } else {
10004 uap->options |= state->ss_union_flags;
10005 }
10006 state->ss_union_flags = 0;
10007
10008 /*
10009 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
10010 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
10011 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
10012 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
10013 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
10014 */
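/*
 * Expected layout of searchparams1 in the ATTR_CMN_NAME case (sketch,
 * inferred from the validation below):
 *
 *     [u_int32_t buffer length]
 *     [attrreference_t { attr_dataoffset, attr_length }]
 *     [... name bytes located attr_dataoffset bytes from the attrreference_t ...]
 *
 * The checks reject negative offsets, names longer than MAXPATHLEN, and
 * offset/length pairs that land outside sizeofsearchparams1.
 */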
10015
10016 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
10017 attrreference_t* string_ref;
10018 u_int32_t* start_length;
10019 user64_size_t param_length;
10020
10021 /* validate searchparams1 */
10022 param_length = searchblock.sizeofsearchparams1;
10023 /* skip the word that specifies length of the buffer */
10024 start_length = (u_int32_t*) searchparams1;
10025 start_length = start_length + 1;
10026 string_ref = (attrreference_t*) start_length;
10027
10028 /* ensure no negative offsets or too big offsets */
10029 if (string_ref->attr_dataoffset < 0) {
10030 error = EINVAL;
10031 goto freeandexit;
10032 }
10033 if (string_ref->attr_length > MAXPATHLEN) {
10034 error = EINVAL;
10035 goto freeandexit;
10036 }
10037
10038 /* Check for pointer overflow in the string ref */
10039 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
10040 error = EINVAL;
10041 goto freeandexit;
10042 }
10043
10044 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
10045 error = EINVAL;
10046 goto freeandexit;
10047 }
10048 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
10049 error = EINVAL;
10050 goto freeandexit;
10051 }
10052 }
10053
10054 /* set up the uio structure which will contain the users return buffer */
10055 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10056 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
10057
10058 nameiflags = 0;
10059 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10060 nameiflags |= FOLLOW;
10061 }
10062 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
10063 UIO_USERSPACE, uap->path, ctx);
10064
10065 error = namei(&nd);
10066 if (error) {
10067 goto freeandexit;
10068 }
10069 vp = nd.ni_vp;
10070 nameidone(&nd);
10071
10072 /*
10073 * Switch to the root vnode for the volume
10074 */
10075 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
10076 vnode_put(vp);
10077 if (error) {
10078 goto freeandexit;
10079 }
10080 vp = tvp;
10081
10082 /*
10083 * If it's a union mount, the path lookup takes
10084 * us to the top layer. But we may need to descend
10085 * to a lower layer. For non-union mounts the layer
10086 * is always zero.
10087 */
10088 for (i = 0; i < (int) state->ss_union_layer; i++) {
10089 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
10090 break;
10091 }
10092 tvp = vp;
10093 vp = vp->v_mount->mnt_vnodecovered;
10094 if (vp == NULL) {
10095 vnode_put(tvp);
10096 error = ENOENT;
10097 goto freeandexit;
10098 }
10099 error = vnode_getwithref(vp);
10100 vnode_put(tvp);
10101 if (error) {
10102 goto freeandexit;
10103 }
10104 }
10105
10106 #if CONFIG_MACF
10107 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
10108 if (error) {
10109 vnode_put(vp);
10110 goto freeandexit;
10111 }
10112 #endif
10113
10114
10115 /*
10116 * If searchblock.maxmatches == 0, then skip the search. This has happened
10117 * before and sometimes the underlying code doesn't deal with it well.
10118 */
10119 if (searchblock.maxmatches == 0) {
10120 nummatches = 0;
10121 goto saveandexit;
10122 }
10123
10124 /*
10125 * All right, we have everything we need, so let's make that call.
10126 *
10127 * We keep special track of the return value from the file system:
10128 * EAGAIN is an acceptable error condition that shouldn't keep us
10129 * from copying out any results...
10130 */
10131
10132 fserror = VNOP_SEARCHFS(vp,
10133 searchparams1,
10134 searchparams2,
10135 &searchblock.searchattrs,
10136 (u_long)searchblock.maxmatches,
10137 &timelimit,
10138 returnattrs,
10139 &nummatches,
10140 (u_long)uap->scriptcode,
10141 (u_long)uap->options,
10142 auio,
10143 (struct searchstate *) &state->ss_fsstate,
10144 ctx);
10145
10146 /*
10147 * If it's a union mount we need to be called again
10148 * to search the mounted-on filesystem.
10149 */
10150 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
10151 state->ss_union_flags = SRCHFS_START;
10152 state->ss_union_layer++; // search next layer down
10153 fserror = EAGAIN;
10154 }
10155
10156 saveandexit:
10157
10158 vnode_put(vp);
10159
10160 /* Now copy out the stuff that needs copying out. That means the number of matches and the
10161 * search state. Everything else was already put into the return buffer by the VOP call. */
10162
10163 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
10164 goto freeandexit;
10165 }
10166
10167 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
10168 goto freeandexit;
10169 }
10170
10171 error = fserror;
10172
10173 freeandexit:
10174
10175 FREE(searchparams1, M_TEMP);
10176
10177 return error;
10178 } /* end of searchfs system call */
10179
10180 #else /* CONFIG_SEARCHFS */
10181
10182 int
10183 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
10184 {
10185 return ENOTSUP;
10186 }
10187
10188 #endif /* CONFIG_SEARCHFS */
10189
10190
10191 #if CONFIG_DATALESS_FILES
10192
10193 /*
10194 * === Namespace Resolver Up-call Mechanism ===
10195 *
10196 * When I/O is performed to a dataless file or directory (read, write,
10197 * lookup-in, etc.), the file system performs an upcall to the namespace
10198 * resolver (filecoordinationd) to materialize the object.
10199 *
10200 * We need multiple up-calls to be in flight at once, and we need these
10201 * up-calls to be interruptible, thus the following implementation:
10202 *
10203 * => The nspace_resolver_request represents the in-kernel request state.
10204 * It contains a request ID, storage space for the errno code returned
10205 * by filecoordinationd, and flags.
10206 *
10207 * => The request ID is simply a global monotonically incrementing 32-bit
10208 * number. Outstanding requests are stored in a hash table, and the
10209 * hash function is extremely simple.
10210 *
10211 * => When an upcall is to be made to filecoordinationd, a request structure
10212 * is allocated on the stack (it is small, and needs to live only during
10213 * the duration of the call to resolve_nspace_item_ext()). It is
10214 * initialized and inserted into the table. Some backpressure from
10215 * filecoordinationd is applied by limiting the number of entries that
10216 * can be inserted into the table (and thus limiting the number of
10217 * outstanding requests issued to filecoordinationd); waiting for an
10218 * available slot is interruptible.
10219 *
10220 * => Once the request has been inserted into the table, the up-call is made
10221 * to filecoordinationd via a MiG-generated stub. The up-call returns
10222 * immediately and filecoordinationd processes the request asynchronously.
10223 *
10224 * => The caller now waits for the request to complete. This is achieved by
10225 * sleeping on the address of the request structure and waiting for
10226 * filecoordinationd to mark the request structure as complete. This
10227 * is an interruptible sleep call; if interrupted, the request structure
10228 * is removed from the table and EINTR is returned to the caller. If
10229 * this occurs, an advisory up-call is made to filecoordinationd with
10230 * the request ID to indicate that the request can be aborted or
10231 * de-prioritized at the discretion of filecoordinationd.
10232 *
10233 * => When filecoordinationd has completed the request, it signals completion
10234 * by writing to the vfs.nspace.complete sysctl node. Only a process
10235 * decorated as a namespace resolver can write to this sysctl node. The
10236 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10237 * The request ID is looked up in the table, and if the request is found,
10238 * the error code is stored in the request structure and a wakeup()
10239 * issued on the address of the request structure. If the request is not
10240 * found, we simply drop the completion notification, assuming that the
10241 * caller was interrupted.
10242 *
10243 * => When the waiting thread wakes up, it extracts the error code from the
10244 * request structure, removes the request from the table, and returns the
10245 * error code to the calling function. Fini!
10246 */
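/*
 * A minimal sketch of the completion write described above, as the resolver
 * side might issue it (illustrative only; the exact call filecoordinationd
 * uses is outside this file):
 *
 *     uint32_t tuple[2] = { req_id, resolver_errno };
 *     sysctlbyname("vfs.nspace.complete", NULL, NULL, tuple, sizeof(tuple));
 */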
10247
10248 struct nspace_resolver_request {
10249 LIST_ENTRY(nspace_resolver_request) r_hashlink;
10250 uint32_t r_req_id;
10251 int r_resolver_error;
10252 int r_flags;
10253 };
10254
10255 #define RRF_COMPLETE 0x0001
10256
10257 static uint32_t
10258 next_nspace_req_id(void)
10259 {
10260 static uint32_t next_req_id;
10261
10262 return OSAddAtomic(1, &next_req_id);
10263 }
10264
10265 #define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
10266 #define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */
10267
10268 static LIST_HEAD(nspace_resolver_requesthead,
10269 nspace_resolver_request) * nspace_resolver_request_hashtbl;
10270 static u_long nspace_resolver_request_hashmask;
10271 static u_int nspace_resolver_request_count;
10272 static bool nspace_resolver_request_wait_slot;
10273 static lck_grp_t *nspace_resolver_request_lck_grp;
10274 static lck_mtx_t nspace_resolver_request_hash_mutex;
10275
10276 #define NSPACE_REQ_LOCK() \
10277 lck_mtx_lock(&nspace_resolver_request_hash_mutex)
10278 #define NSPACE_REQ_UNLOCK() \
10279 lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
10280
10281 #define NSPACE_RESOLVER_HASH(req_id) \
10282 (&nspace_resolver_request_hashtbl[(req_id) & \
10283 nspace_resolver_request_hashmask])
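/*
 * With a power-of-two table (NSPACE_RESOLVER_REQ_HASHSIZE is 32, so the
 * mask is presumably 31), this is simply req_id % 32; consecutive request
 * IDs therefore spread evenly across the buckets.
 */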
10284
10285 static struct nspace_resolver_request *
10286 nspace_resolver_req_lookup(uint32_t req_id)
10287 {
10288 struct nspace_resolver_requesthead *bucket;
10289 struct nspace_resolver_request *req;
10290
10291 bucket = NSPACE_RESOLVER_HASH(req_id);
10292 LIST_FOREACH(req, bucket, r_hashlink) {
10293 if (req->r_req_id == req_id) {
10294 return req;
10295 }
10296 }
10297
10298 return NULL;
10299 }
10300
10301 static int
10302 nspace_resolver_req_add(struct nspace_resolver_request *req)
10303 {
10304 struct nspace_resolver_requesthead *bucket;
10305 int error;
10306
10307 while (nspace_resolver_request_count >=
10308 NSPACE_RESOLVER_MAX_OUTSTANDING) {
10309 nspace_resolver_request_wait_slot = true;
10310 error = msleep(&nspace_resolver_request_count,
10311 &nspace_resolver_request_hash_mutex,
10312 PVFS | PCATCH, "nspacerq", NULL);
10313 if (error) {
10314 return error;
10315 }
10316 }
10317
10318 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10319 #if DIAGNOSTIC
10320 assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
10321 #endif /* DIAGNOSTIC */
10322 LIST_INSERT_HEAD(bucket, req, r_hashlink);
10323 nspace_resolver_request_count++;
10324
10325 return 0;
10326 }
10327
10328 static void
10329 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10330 {
10331 struct nspace_resolver_requesthead *bucket;
10332
10333 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10334 #if DIAGNOSTIC
10335 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10336 #endif /* DIAGNOSTIC */
10337 LIST_REMOVE(req, r_hashlink);
10338 nspace_resolver_request_count--;
10339
10340 if (nspace_resolver_request_wait_slot) {
10341 nspace_resolver_request_wait_slot = false;
10342 wakeup(&nspace_resolver_request_count);
10343 }
10344 }
10345
10346 static void
10347 nspace_resolver_req_cancel(uint32_t req_id)
10348 {
10349 kern_return_t kr;
10350 mach_port_t mp;
10351
10352 // Failures here aren't fatal -- the cancellation message
10353 // sent to the resolver is merely advisory.
10354
10355 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10356 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10357 return;
10358 }
10359
10360 kr = send_nspace_resolve_cancel(mp, req_id);
10361 if (kr != KERN_SUCCESS) {
10362 os_log_error(OS_LOG_DEFAULT,
10363 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10364 }
10365
10366 ipc_port_release_send(mp);
10367 }
10368
10369 static int
10370 nspace_resolver_req_wait(struct nspace_resolver_request *req)
10371 {
10372 bool send_cancel_message = false;
10373 int error;
10374
10375 NSPACE_REQ_LOCK();
10376
10377 while ((req->r_flags & RRF_COMPLETE) == 0) {
10378 error = msleep(req, &nspace_resolver_request_hash_mutex,
10379 PVFS | PCATCH, "nspace", NULL);
10380 if (error && error != ERESTART) {
10381 req->r_resolver_error = (error == EINTR) ? EINTR :
10382 ETIMEDOUT;
10383 send_cancel_message = true;
10384 break;
10385 }
10386 }
10387
10388 nspace_resolver_req_remove(req);
10389
10390 NSPACE_REQ_UNLOCK();
10391
10392 if (send_cancel_message) {
10393 nspace_resolver_req_cancel(req->r_req_id);
10394 }
10395
10396 return req->r_resolver_error;
10397 }
10398
10399 static void
10400 nspace_resolver_req_mark_complete(
10401 struct nspace_resolver_request *req,
10402 int resolver_error)
10403 {
10404 req->r_resolver_error = resolver_error;
10405 req->r_flags |= RRF_COMPLETE;
10406 wakeup(req);
10407 }
10408
10409 static void
10410 nspace_resolver_req_completed(uint32_t req_id, int resolver_error)
10411 {
10412 struct nspace_resolver_request *req;
10413
10414 NSPACE_REQ_LOCK();
10415
10416 // If we don't find the request corresponding to our req_id,
10417 // just drop the completion signal on the floor; it's likely
10418 // that the requester was interrupted by a signal.
10419
10420 req = nspace_resolver_req_lookup(req_id);
10421 if (req) {
10422 nspace_resolver_req_mark_complete(req, resolver_error);
10423 }
10424
10425 NSPACE_REQ_UNLOCK();
10426 }
10427
10428 static struct proc *nspace_resolver_proc;
10429
10430 static int
10431 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10432 {
10433 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10434 p == nspace_resolver_proc) ? 1 : 0;
10435 return 0;
10436 }
10437
10438 static int
10439 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
10440 {
10441 vfs_context_t ctx = vfs_context_current();
10442 int error = 0;
10443
10444 //
10445 // The system filecoordinationd runs as uid == 0. This also
10446 // has the nice side-effect of filtering out filecoordinationd
10447 // running in the simulator.
10448 //
10449 if (!vfs_context_issuser(ctx)) {
10450 return EPERM;
10451 }
10452
10453 error = priv_check_cred(vfs_context_ucred(ctx),
10454 PRIV_VFS_DATALESS_RESOLVER, 0);
10455 if (error) {
10456 return error;
10457 }
10458
10459 if (is_resolver) {
10460 NSPACE_REQ_LOCK();
10461
10462 if (nspace_resolver_proc == NULL) {
10463 proc_lock(p);
10464 p->p_lflag |= P_LNSPACE_RESOLVER;
10465 proc_unlock(p);
10466 nspace_resolver_proc = p;
10467 } else {
10468 error = EBUSY;
10469 }
10470
10471 NSPACE_REQ_UNLOCK();
10472 } else {
10473 // This is basically just like the exit case.
10474 // nspace_resolver_exited() will verify that the
10475 // process is the resolver, and will clear the
10476 // global.
10477 nspace_resolver_exited(p);
10478 }
10479
10480 return error;
10481 }
10482
10483 static int
10484 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10485 {
10486 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10487 (p->p_vfs_iopolicy &
10488 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10489 *is_prevented = 1;
10490 } else {
10491 *is_prevented = 0;
10492 }
10493 return 0;
10494 }
10495
10496 static int
10497 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10498 {
10499 if (p->p_lflag & P_LNSPACE_RESOLVER) {
10500 return is_prevented ? 0 : EBUSY;
10501 }
10502
10503 if (is_prevented) {
10504 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10505 } else {
10506 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10507 }
10508 return 0;
10509 }
10510
10511 static int
10512 nspace_materialization_get_thread_state(int *is_prevented)
10513 {
10514 uthread_t ut = get_bsdthread_info(current_thread());
10515
10516 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10517 return 0;
10518 }
10519
10520 static int
10521 nspace_materialization_set_thread_state(int is_prevented)
10522 {
10523 uthread_t ut = get_bsdthread_info(current_thread());
10524
10525 if (is_prevented) {
10526 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10527 } else {
10528 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10529 }
10530 return 0;
10531 }
10532
10533 static int
10534 nspace_materialization_is_prevented(void)
10535 {
10536 proc_t p = current_proc();
10537 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
10538 vfs_context_t ctx = vfs_context_current();
10539
10540 /*
10541 * Kernel context ==> return EDEADLK, as we would with any random
10542 * process decorated as no-materialize.
10543 */
10544 if (ctx == vfs_context_kernel()) {
10545 return EDEADLK;
10546 }
10547
10548 /*
10549 * If the process has the dataless-manipulation entitlement,
10550 * materialization is prevented, and depending on the kind
10551 * of file system operation, things get to proceed as if the
10552 * object is not dataless.
10553 */
10554 if (vfs_context_is_dataless_manipulator(ctx)) {
10555 return EJUSTRETURN;
10556 }
10557
10558 /*
10559 * Per-thread decorations override any process-wide decorations.
10560 * (Foundation uses this, and this overrides even the dataless-
10561 * manipulation entitlement so as to make API contracts consistent.)
10562 */
10563 if (ut != NULL) {
10564 if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
10565 return EDEADLK;
10566 }
10567 if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
10568 return 0;
10569 }
10570 }
10571
10572 /*
10573 * If the process's iopolicy specifies that dataless files
10574 * can be materialized, then we let it go ahead.
10575 */
10576 if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
10577 return 0;
10578 }
10579
10580 /*
10581 * The default behavior is to not materialize dataless files;
10582 * return to the caller that deadlock was detected.
10583 */
10584 return EDEADLK;
10585 }
10586
10587 /* the vfs.nspace branch */
10588 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10589
10590 static int
10591 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10592 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10593 {
10594 struct proc *p = req->p;
10595 int new_value, old_value, changed = 0;
10596 int error;
10597
10598 error = nspace_resolver_get_proc_state(p, &old_value);
10599 if (error) {
10600 return error;
10601 }
10602
10603 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10604 &changed);
10605 if (error == 0 && changed) {
10606 error = nspace_resolver_set_proc_state(p, new_value);
10607 }
10608 return error;
10609 }
10610
10611 /* decorate this process as the dataless file resolver */
10612 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10613 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10614 0, 0, sysctl_nspace_resolver, "I", "");
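/*
 * Illustrative user-space sketch (assuming sysctlbyname(3)) of how a
 * privileged, entitled daemon might decorate itself as the resolver;
 * error handling omitted:
 *
 *	int one = 1;
 *	(void)sysctlbyname("vfs.nspace.resolver", NULL, NULL, &one, sizeof(one));
 */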
10615
10616 static int
10617 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10618 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10619 {
10620 struct proc *p = req->p;
10621 int new_value, old_value, changed = 0;
10622 int error;
10623
10624 error = nspace_materialization_get_proc_state(p, &old_value);
10625 if (error) {
10626 return error;
10627 }
10628
10629 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10630 &changed);
10631 if (error == 0 && changed) {
10632 error = nspace_materialization_set_proc_state(p, new_value);
10633 }
10634 return error;
10635 }
10636
10637 /* decorate this process as not wanting to materialize dataless files */
10638 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10639 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10640 0, 0, sysctl_nspace_prevent_materialization, "I", "");
10641
10642 static int
10643 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10644 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10645 {
10646 int new_value, old_value, changed = 0;
10647 int error;
10648
10649 error = nspace_materialization_get_thread_state(&old_value);
10650 if (error) {
10651 return error;
10652 }
10653
10654 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10655 &changed);
10656 if (error == 0 && changed) {
10657 error = nspace_materialization_set_thread_state(new_value);
10658 }
10659 return error;
10660 }
10661
10662 /* decorate this thread as not wanting to materialize dataless files */
10663 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10664 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10665 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
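/*
 * Sketch of opting out of dataless materialization from user space
 * (assuming sysctlbyname(3)); the choice of sysctl sets the scope:
 *
 *	int prevent = 1;
 *	// whole process
 *	(void)sysctlbyname("vfs.nspace.prevent_materialization",
 *	    NULL, NULL, &prevent, sizeof(prevent));
 *	// calling thread only
 *	(void)sysctlbyname("vfs.nspace.thread_prevent_materialization",
 *	    NULL, NULL, &prevent, sizeof(prevent));
 */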
10666
10667 static int
10668 sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
10669 __unused int arg2, struct sysctl_req *req)
10670 {
10671 struct proc *p = req->p;
10672 uint32_t req_status[2] = { 0, 0 };
10673 int error, is_resolver, changed = 0;
10674
10675 error = nspace_resolver_get_proc_state(p, &is_resolver);
10676 if (error) {
10677 return error;
10678 }
10679
10680 if (!is_resolver) {
10681 return EPERM;
10682 }
10683
10684 error = sysctl_io_opaque(req, req_status, sizeof(req_status),
10685 &changed);
10686 if (error) {
10687 return error;
10688 }
10689
10690 /*
10691 * req_status[0] is the req_id
10692 *
10693 * req_status[1] is the errno
10694 */
10695 if (error == 0 && changed) {
10696 nspace_resolver_req_completed(req_status[0],
10697 (int)req_status[1]);
10698 }
10699 return error;
10700 }
10701
10702 /* Resolver reports completed reqs here. */
10703 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
10704 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10705 0, 0, sysctl_nspace_complete, "-", "");
10706
10707 #endif /* CONFIG_DATALESS_FILES */
10708
10709 #if CONFIG_DATALESS_FILES
10710 #define __no_dataless_unused /* nothing */
10711 #else
10712 #define __no_dataless_unused __unused
10713 #endif
10714
10715 void
10716 nspace_resolver_init(void)
10717 {
10718 #if CONFIG_DATALESS_FILES
10719 nspace_resolver_request_lck_grp =
10720 lck_grp_alloc_init("file namespace resolver", NULL);
10721
10722 lck_mtx_init(&nspace_resolver_request_hash_mutex,
10723 nspace_resolver_request_lck_grp, NULL);
10724
10725 nspace_resolver_request_hashtbl =
10726 hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
10727 M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
10728 #endif /* CONFIG_DATALESS_FILES */
10729 }
10730
10731 void
10732 nspace_resolver_exited(struct proc *p __no_dataless_unused)
10733 {
10734 #if CONFIG_DATALESS_FILES
10735 struct nspace_resolver_requesthead *bucket;
10736 struct nspace_resolver_request *req;
10737 u_long idx;
10738
10739 NSPACE_REQ_LOCK();
10740
10741 if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10742 p == nspace_resolver_proc) {
10743 for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
10744 bucket = &nspace_resolver_request_hashtbl[idx];
10745 LIST_FOREACH(req, bucket, r_hashlink) {
10746 nspace_resolver_req_mark_complete(req,
10747 ETIMEDOUT);
10748 }
10749 }
10750 nspace_resolver_proc = NULL;
10751 }
10752
10753 NSPACE_REQ_UNLOCK();
10754 #endif /* CONFIG_DATALESS_FILES */
10755 }
10756
10757 int
10758 resolve_nspace_item(struct vnode *vp, uint64_t op)
10759 {
10760 return resolve_nspace_item_ext(vp, op, NULL);
10761 }
10762
10763 #define DATALESS_RESOLVER_ENTITLEMENT \
10764 "com.apple.private.vfs.dataless-resolver"
10765 #define DATALESS_MANIPULATION_ENTITLEMENT \
10766 "com.apple.private.vfs.dataless-manipulation"
10767
10768 /*
10769 * Return TRUE if the vfs context is associated with a process entitled
10770 * for dataless manipulation.
10771 *
10772 * XXX Arguably belongs in vfs_subr.c, but is here because of the
10773 * complication around CONFIG_DATALESS_FILES.
10774 */
10775 boolean_t
10776 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
10777 {
10778 #if CONFIG_DATALESS_FILES
10779 assert(ctx->vc_thread == current_thread());
10780 task_t const task = current_task();
10781 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
10782 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
10783 #else
10784 return false;
10785 #endif /* CONFIG_DATALESS_FILES */
10786 }
10787
10788 int
10789 resolve_nspace_item_ext(
10790 struct vnode *vp __no_dataless_unused,
10791 uint64_t op __no_dataless_unused,
10792 void *arg __unused)
10793 {
10794 #if CONFIG_DATALESS_FILES
10795 int error;
10796 mach_port_t mp;
10797 char *path = NULL;
10798 int path_len;
10799 kern_return_t kr;
10800 struct nspace_resolver_request req;
10801
10802 // only allow namespace events on regular files, directories and symlinks.
10803 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
10804 return EFTYPE;
10805 }
10806
10807 //
10808 // if this is a snapshot event and the vnode is on a
10809 // disk image just pretend nothing happened since any
10810 // change to the disk image will cause the disk image
10811 // itself to get backed up and this avoids multi-way
10812 // deadlocks between the snapshot handler and the ever
10813 // popular diskimages-helper process. the variable
10814 // nspace_allow_virtual_devs allows this behavior to
10815 // be overridden (for use by the Mobile TimeMachine
10816 // testing infrastructure which uses disk images)
10817 //
10818 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
10819 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
10820 return ENOTSUP;
10821 }
10822
10823 error = nspace_materialization_is_prevented();
10824 if (error) {
10825 os_log_debug(OS_LOG_DEFAULT,
10826 "NSPACE process/thread is decorated as no-materialization");
10827 return error;
10828 }
10829
10830 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10831 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10832 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
10833 // Treat this like being unable to access the backing
10834 // store server.
10835 return ETIMEDOUT;
10836 }
10837
10838 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
10839 if (path == NULL) {
10840 error = ENOMEM;
10841 goto out_release_port;
10842 }
10843 path_len = MAXPATHLEN;
10844
10845 error = vn_getpath(vp, path, &path_len);
10846 if (error == 0) {
10847 int xxx_rdar44371223; /* XXX Mig bug */
10848 req.r_req_id = next_nspace_req_id();
10849 req.r_resolver_error = 0;
10850 req.r_flags = 0;
10851
10852 NSPACE_REQ_LOCK();
10853 error = nspace_resolver_req_add(&req);
10854 NSPACE_REQ_UNLOCK();
10855 if (error) {
10856 goto out_release_port;
10857 }
10858
10859 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
10860 kr = send_nspace_resolve_path(mp, req.r_req_id,
10861 current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
10862 path, &xxx_rdar44371223);
10863 if (kr != KERN_SUCCESS) {
10864 // Also treat this like being unable to access
10865 // the backing store server.
10866 os_log_error(OS_LOG_DEFAULT,
10867 "NSPACE resolve_path failure: %d", kr);
10868 error = ETIMEDOUT;
10869
10870 NSPACE_REQ_LOCK();
10871 nspace_resolver_req_remove(&req);
10872 NSPACE_REQ_UNLOCK();
10873 goto out_release_port;
10874 }
10875
10876 // Give back the memory we allocated earlier while
10877 // we wait; we no longer need it.
10878 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
10879 path = NULL;
10880
10881 // Request has been submitted to the resolver.
10882 // Now (interruptibly) wait for completion.
10883 // Upon return, the request will have been removed
10884 // from the lookup table.
10885 error = nspace_resolver_req_wait(&req);
10886 }
10887
10888 out_release_port:
10889 if (path != NULL) {
10890 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
10891 }
10892 ipc_port_release_send(mp);
10893
10894 return error;
10895 #else
10896 return ENOTSUP;
10897 #endif /* CONFIG_DATALESS_FILES */
10898 }
10899
10900 int
10901 nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
10902 __unused uint64_t op_type, __unused void *arg)
10903 {
10904 return 0;
10905 }
10906
10907 #if 0
10908 static int
10909 build_volfs_path(struct vnode *vp, char *path, int *len)
10910 {
10911 struct vnode_attr va;
10912 int ret;
10913
10914 VATTR_INIT(&va);
10915 VATTR_WANTED(&va, va_fsid);
10916 VATTR_WANTED(&va, va_fileid);
10917
10918 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
10919 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
10920 ret = -1;
10921 } else {
10922 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
10923 ret = 0;
10924 }
10925
10926 return ret;
10927 }
10928 #endif
10929
10930 static unsigned long
10931 fsctl_bogus_command_compat(unsigned long cmd)
10932 {
10933 switch (cmd) {
10934 case IOCBASECMD(FSIOC_SYNC_VOLUME):
10935 return FSIOC_SYNC_VOLUME;
10936 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10937 return FSIOC_ROUTEFS_SETROUTEID;
10938 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10939 return FSIOC_SET_PACKAGE_EXTS;
10940 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10941 return FSIOC_SET_FSTYPENAME_OVERRIDE;
10942 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10943 return DISK_CONDITIONER_IOC_GET;
10944 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10945 return DISK_CONDITIONER_IOC_SET;
10946 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10947 return FSIOC_FIOSEEKHOLE;
10948 case IOCBASECMD(FSIOC_FIOSEEKDATA):
10949 return FSIOC_FIOSEEKDATA;
10950 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10951 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
10952 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
10953 return SPOTLIGHT_IOC_GET_LAST_MTIME;
10954 }
10955
10956 return cmd;
10957 }
10958
10959 static int
10960 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
10961 {
10962 return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
10963 }
10964
10965 /*
10966 * Make a filesystem-specific control call:
10967 */
10968 /* ARGSUSED */
10969 static int
10970 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
10971 {
10972 int error = 0;
10973 boolean_t is64bit;
10974 u_int size;
10975 #define STK_PARAMS 128
10976 char stkbuf[STK_PARAMS] = {0};
10977 caddr_t data, memp;
10978 vnode_t vp = *arg_vp;
10979
10980 if (vp->v_type == VCHR || vp->v_type == VBLK) {
10981 return ENOTTY;
10982 }
10983
10984 cmd = fsctl_bogus_command_compat(cmd);
10985
10986 size = IOCPARM_LEN(cmd);
10987 if (size > IOCPARM_MAX) {
10988 return EINVAL;
10989 }
10990
10991 is64bit = proc_is64bit(p);
10992
10993 memp = NULL;
10994
10995 if (size > sizeof(stkbuf)) {
10996 if ((memp = (caddr_t)kalloc(size)) == 0) {
10997 return ENOMEM;
10998 }
10999 data = memp;
11000 } else {
11001 data = &stkbuf[0];
11002 };
11003
11004 if (cmd & IOC_IN) {
11005 if (size) {
11006 error = copyin(udata, data, size);
11007 if (error) {
11008 if (memp) {
11009 kfree(memp, size);
11010 }
11011 return error;
11012 }
11013 } else {
11014 if (is64bit) {
11015 *(user_addr_t *)data = udata;
11016 } else {
11017 *(uint32_t *)data = (uint32_t)udata;
11018 }
11019 };
11020 } else if ((cmd & IOC_OUT) && size) {
11021 /*
11022 * Zero the buffer so the user always
11023 * gets back something deterministic.
11024 */
11025 bzero(data, size);
11026 } else if (cmd & IOC_VOID) {
11027 if (is64bit) {
11028 *(user_addr_t *)data = udata;
11029 } else {
11030 *(uint32_t *)data = (uint32_t)udata;
11031 }
11032 }
11033
11034 /* Check to see if it's a generic command */
11035 switch (cmd) {
11036 case FSIOC_SYNC_VOLUME: {
11037 struct vfs_attr vfa;
11038 mount_t mp = vp->v_mount;
11039 unsigned arg;
11040
11041
11042 /* record vid of vp so we can drop it below. */
11043 uint32_t vvid = vp->v_id;
11044
11045 /*
11046 * Then grab mount_iterref so that we can release the vnode.
11047 * Without this, a thread may call vnode_iterate_prepare then
11048 * get into a deadlock because we've never released the root vp
11049 */
11050 error = mount_iterref(mp, 0);
11051 if (error) {
11052 break;
11053 }
11054 vnode_put(vp);
11055
11056 arg = MNT_NOWAIT;
11057 if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
11058 arg = MNT_WAIT;
11059 }
11060
11061 /*
11062 * If the filesystem supports multiple filesystems in a
11063 * partition (e.g. APFS volumes in a container), it knows
11064 * that the waitfor argument to VFS_SYNC is a set of flags.
11065 */
11066 VFSATTR_INIT(&vfa);
11067 VFSATTR_WANTED(&vfa, f_capabilities);
11068 if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
11069 VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
11070 ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
11071 ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
11072 arg |= MNT_VOLUME;
11073 }
11074
11075 /* issue the sync for this volume */
11076 (void)sync_callback(mp, &arg);
11077
11078 /*
11079 * Then release the mount_iterref once we're done syncing; it's not
11080 * needed for the VNOP_IOCTL below
11081 */
11082 mount_iterdrop(mp);
11083
11084 if (arg & FSCTL_SYNC_FULLSYNC) {
11085 /* re-obtain vnode iocount on the root vp, if possible */
11086 error = vnode_getwithvid(vp, vvid);
11087 if (error == 0) {
11088 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
11089 vnode_put(vp);
11090 }
11091 }
11092 /* mark the argument VP as having been released */
11093 *arg_vp = NULL;
11094 }
11095 break;
11096
11097 case FSIOC_ROUTEFS_SETROUTEID: {
11098 #if ROUTEFS
11099 char routepath[MAXPATHLEN];
11100 size_t len = 0;
11101
11102 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11103 break;
11104 }
11105 bzero(routepath, MAXPATHLEN);
11106 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11107 if (error) {
11108 break;
11109 }
11110 error = routefs_kernel_mount(routepath);
11111 if (error) {
11112 break;
11113 }
11114 #endif
11115 }
11116 break;
11117
11118 case FSIOC_SET_PACKAGE_EXTS: {
11119 user_addr_t ext_strings;
11120 uint32_t num_entries;
11121 uint32_t max_width;
11122
11123 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
11124 break;
11125 }
11126
11127 if ((is64bit && size != sizeof(user64_package_ext_info))
11128 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
11129 // either you're 64-bit and passed a 64-bit struct or
11130 // you're 32-bit and passed a 32-bit struct. otherwise
11131 // it's not ok.
11132 error = EINVAL;
11133 break;
11134 }
11135
11136 if (is64bit) {
11137 ext_strings = ((user64_package_ext_info *)data)->strings;
11138 num_entries = ((user64_package_ext_info *)data)->num_entries;
11139 max_width = ((user64_package_ext_info *)data)->max_width;
11140 } else {
11141 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
11142 num_entries = ((user32_package_ext_info *)data)->num_entries;
11143 max_width = ((user32_package_ext_info *)data)->max_width;
11144 }
11145 error = set_package_extensions_table(ext_strings, num_entries, max_width);
11146 }
11147 break;
11148
11149 case FSIOC_SET_FSTYPENAME_OVERRIDE:
11150 {
11151 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11152 break;
11153 }
11154 if (vp->v_mount) {
11155 mount_lock(vp->v_mount);
11156 if (data[0] != 0) {
11157 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
11158 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
11159 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11160 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
11161 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
11162 }
11163 } else {
11164 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11165 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
11166 }
11167 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
11168 vp->v_mount->fstypename_override[0] = '\0';
11169 }
11170 mount_unlock(vp->v_mount);
11171 }
11172 }
11173 break;
11174
11175 case DISK_CONDITIONER_IOC_GET: {
11176 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
11177 }
11178 break;
11179
11180 case DISK_CONDITIONER_IOC_SET: {
11181 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
11182 }
11183 break;
11184
11185 case FSIOC_CAS_BSDFLAGS: {
11186 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11187 struct vnode_attr va;
11188
11189 VATTR_INIT(&va);
11190 VATTR_SET(&va, va_flags, cas->new_flags);
11191
11192 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11193 }
11194 break;
11195
11196 case FSIOC_FD_ONLY_OPEN_ONCE: {
11197 if (vnode_usecount(vp) > 1) {
11198 error = EBUSY;
11199 } else {
11200 error = 0;
11201 }
11202 }
11203 break;
11204
11205 default: {
11206 /* other, known commands shouldn't be passed down here */
11207 switch (cmd) {
11208 case F_PUNCHHOLE:
11209 case F_TRIM_ACTIVE_FILE:
11210 case F_RDADVISE:
11211 case F_TRANSCODEKEY:
11212 case F_GETPROTECTIONLEVEL:
11213 case F_GETDEFAULTPROTLEVEL:
11214 case F_MAKECOMPRESSED:
11215 case F_SET_GREEDY_MODE:
11216 case F_SETSTATICCONTENT:
11217 case F_SETIOTYPE:
11218 case F_SETBACKINGSTORE:
11219 case F_GETPATH_MTMINFO:
11220 case APFSIOC_REVERT_TO_SNAPSHOT:
11221 case FSIOC_FIOSEEKHOLE:
11222 case FSIOC_FIOSEEKDATA:
11223 case HFS_GET_BOOT_INFO:
11224 case HFS_SET_BOOT_INFO:
11225 case FIOPINSWAP:
11226 case F_CHKCLEAN:
11227 case F_FULLFSYNC:
11228 case F_BARRIERFSYNC:
11229 case F_FREEZE_FS:
11230 case F_THAW_FS:
11231 error = EINVAL;
11232 goto outdrop;
11233 }
11234 /* Invoke the filesystem-specific code */
11235 error = VNOP_IOCTL(vp, cmd, data, options, ctx);
11236 }
11237 } /* end switch stmt */
11238
11239 /*
11240 * if no errors, copy any data to user. Size was
11241 * already set and checked above.
11242 */
11243 if (error == 0 && (cmd & IOC_OUT) && size) {
11244 error = copyout(data, udata, size);
11245 }
11246
11247 outdrop:
11248 if (memp) {
11249 kfree(memp, size);
11250 }
11251
11252 return error;
11253 }
11254
11255 /* ARGSUSED */
11256 int
11257 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
11258 {
11259 int error;
11260 struct nameidata nd;
11261 u_long nameiflags;
11262 vnode_t vp = NULL;
11263 vfs_context_t ctx = vfs_context_current();
11264
11265 AUDIT_ARG(cmd, uap->cmd);
11266 AUDIT_ARG(value32, uap->options);
11267 /* Get the vnode for the file we are getting info on: */
11268 nameiflags = 0;
11269 //
11270 // if we come through fsctl() then the file is by definition not open.
11271 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
11272 // lest the caller mistakenly think the only open is their own (but in
11273 // reality it's someone else's).
11274 //
11275 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
11276 return EINVAL;
11277 }
11278 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11279 nameiflags |= FOLLOW;
11280 }
11281 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
11282 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
11283 }
11284 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
11285 UIO_USERSPACE, uap->path, ctx);
11286 if ((error = namei(&nd))) {
11287 goto done;
11288 }
11289 vp = nd.ni_vp;
11290 nameidone(&nd);
11291
11292 #if CONFIG_MACF
11293 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
11294 if (error) {
11295 goto done;
11296 }
11297 #endif
11298
11299 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11300
11301 done:
11302 if (vp) {
11303 vnode_put(vp);
11304 }
11305 return error;
11306 }
11307 /* ARGSUSED */
11308 int
11309 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
11310 {
11311 int error;
11312 vnode_t vp = NULL;
11313 vfs_context_t ctx = vfs_context_current();
11314 int fd = -1;
11315
11316 AUDIT_ARG(fd, uap->fd);
11317 AUDIT_ARG(cmd, uap->cmd);
11318 AUDIT_ARG(value32, uap->options);
11319
11320 /* Get the vnode for the file we are getting info on: */
11321 if ((error = file_vnode(uap->fd, &vp))) {
11322 return error;
11323 }
11324 fd = uap->fd;
11325 if ((error = vnode_getwithref(vp))) {
11326 file_drop(fd);
11327 return error;
11328 }
11329
11330 #if CONFIG_MACF
11331 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
11332 file_drop(fd);
11333 vnode_put(vp);
11334 return error;
11335 }
11336 #endif
11337
11338 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11339
11340 file_drop(fd);
11341
11342 /* validate vp; fsctl_internal() can drop iocount and reset vp to NULL */
11343 if (vp) {
11344 vnode_put(vp);
11345 }
11346
11347 return error;
11348 }
11349 /* end of fsctl system call */
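/*
 * Illustrative user-space sketch of issuing one of the generic selectors
 * handled above, assuming the fsctl(2) wrapper declared in <sys/fsctl.h>;
 * the path is a placeholder:
 *
 *	uint32_t sync_flags = FSCTL_SYNC_WAIT;
 *	(void)fsctl("/Volumes/Data", FSIOC_SYNC_VOLUME, &sync_flags, 0);
 */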
11350
11351 /*
11352 * Retrieve the data of an extended attribute.
11353 */
11354 int
11355 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
11356 {
11357 vnode_t vp;
11358 struct nameidata nd;
11359 char attrname[XATTR_MAXNAMELEN + 1];
11360 vfs_context_t ctx = vfs_context_current();
11361 uio_t auio = NULL;
11362 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11363 size_t attrsize = 0;
11364 size_t namelen;
11365 u_int32_t nameiflags;
11366 int error;
11367 char uio_buf[UIO_SIZEOF(1)];
11368
11369 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11370 return EINVAL;
11371 }
11372
11373 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11374 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
11375 if ((error = namei(&nd))) {
11376 return error;
11377 }
11378 vp = nd.ni_vp;
11379 nameidone(&nd);
11380
11381 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11382 if (error != 0) {
11383 goto out;
11384 }
11385 if (xattr_protected(attrname)) {
11386 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
11387 error = EPERM;
11388 goto out;
11389 }
11390 }
11391 /*
11392 * the specific check for 0xffffffff is a hack to preserve
11393 * binary compatibility in K64 with applications that discovered
11394 * that passing in a buf pointer and a size of -1 resulted in
11395 * just the size of the indicated extended attribute being returned.
11396 * this isn't part of the documented behavior, but because of the
11397 * original implementation's check for "uap->size > 0", this behavior
11398 * was allowed. In K32 that check turned into a signed comparison
11399 * even though uap->size is unsigned... in K64, we blow by that
11400 * check because uap->size is unsigned and doesn't get sign smeared
11401 * in the munger for a 32 bit user app. we also need to add a
11402 * check to limit the maximum size of the buffer being passed in...
11403 * unfortunately, the underlying filesystems seem to just malloc
11404 * the requested size even if the actual extended attribute is tiny.
11405 * because that malloc is for kernel wired memory, we have to put a
11406 * sane limit on it.
11407 *
11408 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
11409 * U64 running on K64 will yield -1 (64 bits wide)
11410 * U32/U64 running on K32 will yield -1 (32 bits wide)
11411 */
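/*
 * For comparison, the documented user-space idiom for sizing a buffer is
 * to pass a NULL value pointer (sketch, error handling omitted):
 *
 *	ssize_t len = getxattr(path, name, NULL, 0, 0, XATTR_NOFOLLOW);
 *
 * The size == -1 case below only preserves the older, accidental behavior
 * described above.
 */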
11412 if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
11413 goto no_uio;
11414 }
11415
11416 if (uap->value) {
11417 if (uap->size > (size_t)XATTR_MAXSIZE) {
11418 uap->size = XATTR_MAXSIZE;
11419 }
11420
11421 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11422 &uio_buf[0], sizeof(uio_buf));
11423 uio_addiov(auio, uap->value, uap->size);
11424 }
11425 no_uio:
11426 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
11427 out:
11428 vnode_put(vp);
11429
11430 if (auio) {
11431 *retval = uap->size - uio_resid(auio);
11432 } else {
11433 *retval = (user_ssize_t)attrsize;
11434 }
11435
11436 return error;
11437 }
11438
11439 /*
11440 * Retrieve the data of an extended attribute.
11441 */
11442 int
11443 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
11444 {
11445 vnode_t vp;
11446 char attrname[XATTR_MAXNAMELEN + 1];
11447 uio_t auio = NULL;
11448 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11449 size_t attrsize = 0;
11450 size_t namelen;
11451 int error;
11452 char uio_buf[UIO_SIZEOF(1)];
11453
11454 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11455 return EINVAL;
11456 }
11457
11458 if ((error = file_vnode(uap->fd, &vp))) {
11459 return error;
11460 }
11461 if ((error = vnode_getwithref(vp))) {
11462 file_drop(uap->fd);
11463 return error;
11464 }
11465 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11466 if (error != 0) {
11467 goto out;
11468 }
11469 if (xattr_protected(attrname)) {
11470 error = EPERM;
11471 goto out;
11472 }
11473 if (uap->value && uap->size > 0) {
11474 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11475 &uio_buf[0], sizeof(uio_buf));
11476 uio_addiov(auio, uap->value, uap->size);
11477 }
11478
11479 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
11480 out:
11481 (void)vnode_put(vp);
11482 file_drop(uap->fd);
11483
11484 if (auio) {
11485 *retval = uap->size - uio_resid(auio);
11486 } else {
11487 *retval = (user_ssize_t)attrsize;
11488 }
11489 return error;
11490 }
11491
11492 /*
11493 * Set the data of an extended attribute.
11494 */
11495 int
11496 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
11497 {
11498 vnode_t vp;
11499 struct nameidata nd;
11500 char attrname[XATTR_MAXNAMELEN + 1];
11501 vfs_context_t ctx = vfs_context_current();
11502 uio_t auio = NULL;
11503 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11504 size_t namelen;
11505 u_int32_t nameiflags;
11506 int error;
11507 char uio_buf[UIO_SIZEOF(1)];
11508
11509 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11510 return EINVAL;
11511 }
11512
11513 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11514 if (error != 0) {
11515 if (error == EPERM) {
11516 /* if the string won't fit in attrname, copyinstr emits EPERM */
11517 return ENAMETOOLONG;
11518 }
11519 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11520 return error;
11521 }
11522 if (xattr_protected(attrname)) {
11523 return EPERM;
11524 }
11525 if (uap->size != 0 && uap->value == 0) {
11526 return EINVAL;
11527 }
11528
11529 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11530 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
11531 if ((error = namei(&nd))) {
11532 return error;
11533 }
11534 vp = nd.ni_vp;
11535 nameidone(&nd);
11536
11537 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11538 &uio_buf[0], sizeof(uio_buf));
11539 uio_addiov(auio, uap->value, uap->size);
11540
11541 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
11542 #if CONFIG_FSE
11543 if (error == 0) {
11544 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11545 FSE_ARG_VNODE, vp,
11546 FSE_ARG_DONE);
11547 }
11548 #endif
11549 vnode_put(vp);
11550 *retval = 0;
11551 return error;
11552 }
11553
11554 /*
11555 * Set the data of an extended attribute.
11556 */
11557 int
11558 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
11559 {
11560 vnode_t vp;
11561 char attrname[XATTR_MAXNAMELEN + 1];
11562 uio_t auio = NULL;
11563 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11564 size_t namelen;
11565 int error;
11566 char uio_buf[UIO_SIZEOF(1)];
11567 #if CONFIG_FSE
11568 vfs_context_t ctx = vfs_context_current();
11569 #endif
11570
11571 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11572 return EINVAL;
11573 }
11574
11575 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11576 if (error != 0) {
11577 if (error == EPERM) {
11578 /* if the string won't fit in attrname, copyinstr emits EPERM */
11579 return ENAMETOOLONG;
11580 }
11581 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11582 return error;
11583 }
11584 if (xattr_protected(attrname)) {
11585 return EPERM;
11586 }
11587 if (uap->size != 0 && uap->value == 0) {
11588 return EINVAL;
11589 }
11590 if ((error = file_vnode(uap->fd, &vp))) {
11591 return error;
11592 }
11593 if ((error = vnode_getwithref(vp))) {
11594 file_drop(uap->fd);
11595 return error;
11596 }
11597 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11598 &uio_buf[0], sizeof(uio_buf));
11599 uio_addiov(auio, uap->value, uap->size);
11600
11601 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
11602 #if CONFIG_FSE
11603 if (error == 0) {
11604 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11605 FSE_ARG_VNODE, vp,
11606 FSE_ARG_DONE);
11607 }
11608 #endif
11609 vnode_put(vp);
11610 file_drop(uap->fd);
11611 *retval = 0;
11612 return error;
11613 }
11614
11615 /*
11616 * Remove an extended attribute.
11617 * XXX Code duplication here.
11618 */
11619 int
11620 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
11621 {
11622 vnode_t vp;
11623 struct nameidata nd;
11624 char attrname[XATTR_MAXNAMELEN + 1];
11625 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11626 vfs_context_t ctx = vfs_context_current();
11627 size_t namelen;
11628 u_int32_t nameiflags;
11629 int error;
11630
11631 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11632 return EINVAL;
11633 }
11634
11635 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11636 if (error != 0) {
11637 return error;
11638 }
11639 if (xattr_protected(attrname)) {
11640 return EPERM;
11641 }
11642 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11643 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
11644 if ((error = namei(&nd))) {
11645 return error;
11646 }
11647 vp = nd.ni_vp;
11648 nameidone(&nd);
11649
11650 error = vn_removexattr(vp, attrname, uap->options, ctx);
11651 #if CONFIG_FSE
11652 if (error == 0) {
11653 add_fsevent(FSE_XATTR_REMOVED, ctx,
11654 FSE_ARG_VNODE, vp,
11655 FSE_ARG_DONE);
11656 }
11657 #endif
11658 vnode_put(vp);
11659 *retval = 0;
11660 return error;
11661 }
11662
11663 /*
11664 * Remove an extended attribute.
11665 * XXX Code duplication here.
11666 */
11667 int
11668 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
11669 {
11670 vnode_t vp;
11671 char attrname[XATTR_MAXNAMELEN + 1];
11672 size_t namelen;
11673 int error;
11674 #if CONFIG_FSE
11675 vfs_context_t ctx = vfs_context_current();
11676 #endif
11677
11678 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11679 return EINVAL;
11680 }
11681
11682 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11683 if (error != 0) {
11684 return error;
11685 }
11686 if (xattr_protected(attrname)) {
11687 return EPERM;
11688 }
11689 if ((error = file_vnode(uap->fd, &vp))) {
11690 return error;
11691 }
11692 if ((error = vnode_getwithref(vp))) {
11693 file_drop(uap->fd);
11694 return error;
11695 }
11696
11697 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
11698 #if CONFIG_FSE
11699 if (error == 0) {
11700 add_fsevent(FSE_XATTR_REMOVED, ctx,
11701 FSE_ARG_VNODE, vp,
11702 FSE_ARG_DONE);
11703 }
11704 #endif
11705 vnode_put(vp);
11706 file_drop(uap->fd);
11707 *retval = 0;
11708 return error;
11709 }
11710
11711 /*
11712 * Retrieve the list of extended attribute names.
11713 * XXX Code duplication here.
11714 */
11715 int
11716 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
11717 {
11718 vnode_t vp;
11719 struct nameidata nd;
11720 vfs_context_t ctx = vfs_context_current();
11721 uio_t auio = NULL;
11722 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11723 size_t attrsize = 0;
11724 u_int32_t nameiflags;
11725 int error;
11726 char uio_buf[UIO_SIZEOF(1)];
11727
11728 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11729 return EINVAL;
11730 }
11731
11732 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11733 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
11734 if ((error = namei(&nd))) {
11735 return error;
11736 }
11737 vp = nd.ni_vp;
11738 nameidone(&nd);
11739 if (uap->namebuf != 0 && uap->bufsize > 0) {
11740 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
11741 &uio_buf[0], sizeof(uio_buf));
11742 uio_addiov(auio, uap->namebuf, uap->bufsize);
11743 }
11744
11745 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
11746
11747 vnode_put(vp);
11748 if (auio) {
11749 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11750 } else {
11751 *retval = (user_ssize_t)attrsize;
11752 }
11753 return error;
11754 }
11755
11756 /*
11757 * Retrieve the list of extended attribute names.
11758 * XXX Code duplication here.
11759 */
11760 int
11761 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11762 {
11763 vnode_t vp;
11764 uio_t auio = NULL;
11765 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11766 size_t attrsize = 0;
11767 int error;
11768 char uio_buf[UIO_SIZEOF(1)];
11769
11770 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11771 return EINVAL;
11772 }
11773
11774 if ((error = file_vnode(uap->fd, &vp))) {
11775 return error;
11776 }
11777 if ((error = vnode_getwithref(vp))) {
11778 file_drop(uap->fd);
11779 return error;
11780 }
11781 if (uap->namebuf != 0 && uap->bufsize > 0) {
11782 auio = uio_createwithbuffer(1, 0, spacetype,
11783 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11784 uio_addiov(auio, uap->namebuf, uap->bufsize);
11785 }
11786
11787 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11788
11789 vnode_put(vp);
11790 file_drop(uap->fd);
11791 if (auio) {
11792 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11793 } else {
11794 *retval = (user_ssize_t)attrsize;
11795 }
11796 return error;
11797 }
11798
11799 static int
11800 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
11801 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
11802 {
11803 int error;
11804 struct mount *mp = NULL;
11805 vnode_t vp;
11806 int length;
11807 int bpflags;
11808 /* maximum number of times to retry build_path */
11809 unsigned int retries = 0x10;
11810
11811 if (bufsize > PAGE_SIZE) {
11812 return EINVAL;
11813 }
11814
11815 if (buf == NULL) {
11816 return ENOMEM;
11817 }
11818
11819 retry:
11820 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11821 error = ENOTSUP; /* unexpected failure */
11822 return ENOTSUP;
11823 }
11824
11825 unionget:
11826 if (objid == 2) {
11827 struct vfs_attr vfsattr;
11828 int use_vfs_root = TRUE;
11829
11830 VFSATTR_INIT(&vfsattr);
11831 VFSATTR_WANTED(&vfsattr, f_capabilities);
11832 if (!(options & FSOPT_ISREALFSID) &&
11833 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
11834 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
11835 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
11836 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
11837 use_vfs_root = FALSE;
11838 }
11839 }
11840
11841 if (use_vfs_root) {
11842 error = VFS_ROOT(mp, &vp, ctx);
11843 } else {
11844 error = VFS_VGET(mp, objid, &vp, ctx);
11845 }
11846 } else {
11847 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11848 }
11849
11850 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11851 /*
11852 * If the fileid isn't found and we're in a union
11853 * mount volume, then see if the fileid is in the
11854 * mounted-on volume.
11855 */
11856 struct mount *tmp = mp;
11857 mp = vnode_mount(tmp->mnt_vnodecovered);
11858 vfs_unbusy(tmp);
11859 if (vfs_busy(mp, LK_NOWAIT) == 0) {
11860 goto unionget;
11861 }
11862 } else {
11863 vfs_unbusy(mp);
11864 }
11865
11866 if (error) {
11867 return error;
11868 }
11869
11870 #if CONFIG_MACF
11871 error = mac_vnode_check_fsgetpath(ctx, vp);
11872 if (error) {
11873 vnode_put(vp);
11874 return error;
11875 }
11876 #endif
11877
11878 /* Obtain the absolute path to this vnode. */
11879 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11880 if (options & FSOPT_NOFIRMLINKPATH) {
11881 bpflags |= BUILDPATH_NO_FIRMLINK;
11882 }
11883 bpflags |= BUILDPATH_CHECK_MOVED;
11884 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11885 vnode_put(vp);
11886
11887 if (error) {
11888 /* there was a race building the path, try a few more times */
11889 if (error == EAGAIN) {
11890 --retries;
11891 if (retries > 0) {
11892 goto retry;
11893 }
11894
11895 error = ENOENT;
11896 }
11897 goto out;
11898 }
11899
11900 AUDIT_ARG(text, buf);
11901
11902 if (kdebug_enable) {
11903 long dbg_parms[NUMPARMS];
11904 int dbg_namelen;
11905
11906 dbg_namelen = (int)sizeof(dbg_parms);
11907
11908 if (length < dbg_namelen) {
11909 memcpy((char *)dbg_parms, buf, length);
11910 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11911
11912 dbg_namelen = length;
11913 } else {
11914 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11915 }
11916
11917 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11918 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11919 }
11920
11921 *pathlen = (user_ssize_t)length; /* may be superseded by error */
11922
11923 out:
11924 return error;
11925 }
11926
11927 /*
11928 * Obtain the full pathname of a file system object by id.
11929 */
11930 static int
11931 fsgetpath_extended(user_addr_t buf, int bufsize, user_addr_t user_fsid, uint64_t objid,
11932 uint32_t options, user_ssize_t *retval)
11933 {
11934 vfs_context_t ctx = vfs_context_current();
11935 fsid_t fsid;
11936 char *realpath;
11937 int length;
11938 int error;
11939
11940 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
11941 return EINVAL;
11942 }
11943
11944 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
11945 return error;
11946 }
11947 AUDIT_ARG(value32, fsid.val[0]);
11948 AUDIT_ARG(value64, objid);
11949 /* Restrict output buffer size for now. */
11950
11951 if (bufsize > PAGE_SIZE || bufsize <= 0) {
11952 return EINVAL;
11953 }
11954 MALLOC(realpath, char *, bufsize, M_TEMP, M_WAITOK | M_ZERO);
11955 if (realpath == NULL) {
11956 return ENOMEM;
11957 }
11958
11959 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
11960 options, &length);
11961
11962 if (error) {
11963 goto out;
11964 }
11965
11966 error = copyout((caddr_t)realpath, buf, length);
11967
11968 *retval = (user_ssize_t)length; /* may be superseded by error */
11969 out:
11970 if (realpath) {
11971 FREE(realpath, M_TEMP);
11972 }
11973 return error;
11974 }
11975
11976 int
11977 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
11978 {
11979 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
11980 0, retval);
11981 }
11982
11983 int
11984 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
11985 {
11986 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
11987 uap->options, retval);
11988 }
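/*
 * Illustrative user-space usage sketch (the path below is a placeholder);
 * the fsid typically comes from statfs(2) and the object id from stat(2):
 *
 *	struct statfs sfs;
 *	struct stat st;
 *	char path[MAXPATHLEN];
 *
 *	if (statfs("/tmp/example", &sfs) == 0 &&
 *	    stat("/tmp/example", &st) == 0) {
 *		ssize_t len = fsgetpath(path, sizeof(path), &sfs.f_fsid,
 *		    (uint64_t)st.st_ino);
 *	}
 */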
11989
11990 /*
11991 * Common routine to handle various flavors of statfs data heading out
11992 * to user space.
11993 *
11994 * Returns: 0 Success
11995 * EFAULT
11996 */
11997 static int
11998 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
11999 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
12000 boolean_t partial_copy)
12001 {
12002 int error;
12003 int my_size, copy_size;
12004
12005 if (is_64_bit) {
12006 struct user64_statfs sfs;
12007 my_size = copy_size = sizeof(sfs);
12008 bzero(&sfs, my_size);
12009 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12010 sfs.f_type = mp->mnt_vtable->vfc_typenum;
12011 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12012 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
12013 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
12014 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
12015 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
12016 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
12017 sfs.f_files = (user64_long_t)sfsp->f_files;
12018 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
12019 sfs.f_fsid = sfsp->f_fsid;
12020 sfs.f_owner = sfsp->f_owner;
12021 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12022 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12023 } else {
12024 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12025 }
12026 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12027 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12028
12029 if (partial_copy) {
12030 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12031 }
12032 error = copyout((caddr_t)&sfs, bufp, copy_size);
12033 } else {
12034 struct user32_statfs sfs;
12035
12036 my_size = copy_size = sizeof(sfs);
12037 bzero(&sfs, my_size);
12038
12039 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12040 sfs.f_type = mp->mnt_vtable->vfc_typenum;
12041 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12042
12043 /*
12044 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
12045 * have to fudge the numbers here in that case. We inflate the blocksize in order
12046 * to reflect the filesystem size as best we can.
12047 */
12048 if ((sfsp->f_blocks > INT_MAX)
12049 /* Hack for 4061702. I think the real fix is for Carbon to
12050 * look for some volume capability and not depend on hidden
12051 * semantics agreed between a FS and carbon.
12052 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
12053 * for Carbon to set bNoVolumeSizes volume attribute.
12054 * Without this the webdavfs files cannot be copied onto
12055 * disk as they look huge. This change should not affect
12056 * XSAN as it should not be setting these to -1.
12057 */
12058 && (sfsp->f_blocks != 0xffffffffffffffffULL)
12059 && (sfsp->f_bfree != 0xffffffffffffffffULL)
12060 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
12061 int shift;
12062
12063 /*
12064 * Work out how far we have to shift the block count down to make it fit.
12065 * Note that it's possible to have to shift so far that the resulting
12066 * blocksize would be unreportably large. At that point, we will clip
12067 * any values that don't fit.
12068 *
12069 * For safety's sake, we also ensure that f_iosize is never reported as
12070 * being smaller than f_bsize.
12071 */
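/*
 * Worked example (illustrative numbers): a 16 TiB volume with 512-byte
 * blocks has f_blocks = 2^35, so the loop settles on shift = 5; the
 * 32-bit caller then sees f_bsize = 16384 and f_blocks = 2^30, which
 * still multiply out to the true 16 TiB size.
 */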
12072 for (shift = 0; shift < 32; shift++) {
12073 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
12074 break;
12075 }
12076 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
12077 break;
12078 }
12079 }
12080 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
12081 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
12082 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
12083 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
12084 #undef __SHIFT_OR_CLIP
12085 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
12086 sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
12087 } else {
12088 /* filesystem is small enough to be reported honestly */
12089 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
12090 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
12091 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
12092 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
12093 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
12094 }
12095 sfs.f_files = (user32_long_t)sfsp->f_files;
12096 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
12097 sfs.f_fsid = sfsp->f_fsid;
12098 sfs.f_owner = sfsp->f_owner;
12099 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12100 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12101 } else {
12102 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12103 }
12104 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12105 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12106
12107 if (partial_copy) {
12108 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12109 }
12110 error = copyout((caddr_t)&sfs, bufp, copy_size);
12111 }
12112
12113 if (sizep != NULL) {
12114 *sizep = my_size;
12115 }
12116 return error;
12117 }
12118
12119 /*
12120 * copy stat structure into user_stat structure.
12121 */
12122 void
12123 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
12124 {
12125 bzero(usbp, sizeof(*usbp));
12126
12127 usbp->st_dev = sbp->st_dev;
12128 usbp->st_ino = sbp->st_ino;
12129 usbp->st_mode = sbp->st_mode;
12130 usbp->st_nlink = sbp->st_nlink;
12131 usbp->st_uid = sbp->st_uid;
12132 usbp->st_gid = sbp->st_gid;
12133 usbp->st_rdev = sbp->st_rdev;
12134 #ifndef _POSIX_C_SOURCE
12135 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12136 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12137 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12138 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12139 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12140 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12141 #else
12142 usbp->st_atime = sbp->st_atime;
12143 usbp->st_atimensec = sbp->st_atimensec;
12144 usbp->st_mtime = sbp->st_mtime;
12145 usbp->st_mtimensec = sbp->st_mtimensec;
12146 usbp->st_ctime = sbp->st_ctime;
12147 usbp->st_ctimensec = sbp->st_ctimensec;
12148 #endif
12149 usbp->st_size = sbp->st_size;
12150 usbp->st_blocks = sbp->st_blocks;
12151 usbp->st_blksize = sbp->st_blksize;
12152 usbp->st_flags = sbp->st_flags;
12153 usbp->st_gen = sbp->st_gen;
12154 usbp->st_lspare = sbp->st_lspare;
12155 usbp->st_qspare[0] = sbp->st_qspare[0];
12156 usbp->st_qspare[1] = sbp->st_qspare[1];
12157 }
12158
12159 void
12160 munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
12161 {
12162 bzero(usbp, sizeof(*usbp));
12163
12164 usbp->st_dev = sbp->st_dev;
12165 usbp->st_ino = sbp->st_ino;
12166 usbp->st_mode = sbp->st_mode;
12167 usbp->st_nlink = sbp->st_nlink;
12168 usbp->st_uid = sbp->st_uid;
12169 usbp->st_gid = sbp->st_gid;
12170 usbp->st_rdev = sbp->st_rdev;
12171 #ifndef _POSIX_C_SOURCE
12172 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12173 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12174 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12175 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12176 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12177 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12178 #else
12179 usbp->st_atime = sbp->st_atime;
12180 usbp->st_atimensec = sbp->st_atimensec;
12181 usbp->st_mtime = sbp->st_mtime;
12182 usbp->st_mtimensec = sbp->st_mtimensec;
12183 usbp->st_ctime = sbp->st_ctime;
12184 usbp->st_ctimensec = sbp->st_ctimensec;
12185 #endif
12186 usbp->st_size = sbp->st_size;
12187 usbp->st_blocks = sbp->st_blocks;
12188 usbp->st_blksize = sbp->st_blksize;
12189 usbp->st_flags = sbp->st_flags;
12190 usbp->st_gen = sbp->st_gen;
12191 usbp->st_lspare = sbp->st_lspare;
12192 usbp->st_qspare[0] = sbp->st_qspare[0];
12193 usbp->st_qspare[1] = sbp->st_qspare[1];
12194 }
12195
12196 /*
12197 * copy stat64 structure into user_stat64 structure.
12198 */
12199 void
12200 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
12201 {
12202 bzero(usbp, sizeof(*usbp));
12203
12204 usbp->st_dev = sbp->st_dev;
12205 usbp->st_ino = sbp->st_ino;
12206 usbp->st_mode = sbp->st_mode;
12207 usbp->st_nlink = sbp->st_nlink;
12208 usbp->st_uid = sbp->st_uid;
12209 usbp->st_gid = sbp->st_gid;
12210 usbp->st_rdev = sbp->st_rdev;
12211 #ifndef _POSIX_C_SOURCE
12212 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12213 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12214 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12215 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12216 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12217 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12218 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
12219 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
12220 #else
12221 usbp->st_atime = sbp->st_atime;
12222 usbp->st_atimensec = sbp->st_atimensec;
12223 usbp->st_mtime = sbp->st_mtime;
12224 usbp->st_mtimensec = sbp->st_mtimensec;
12225 usbp->st_ctime = sbp->st_ctime;
12226 usbp->st_ctimensec = sbp->st_ctimensec;
12227 usbp->st_birthtime = sbp->st_birthtime;
12228 usbp->st_birthtimensec = sbp->st_birthtimensec;
12229 #endif
12230 usbp->st_size = sbp->st_size;
12231 usbp->st_blocks = sbp->st_blocks;
12232 usbp->st_blksize = sbp->st_blksize;
12233 usbp->st_flags = sbp->st_flags;
12234 usbp->st_gen = sbp->st_gen;
12235 usbp->st_lspare = sbp->st_lspare;
12236 usbp->st_qspare[0] = sbp->st_qspare[0];
12237 usbp->st_qspare[1] = sbp->st_qspare[1];
12238 }
12239
12240 void
12241 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
12242 {
12243 bzero(usbp, sizeof(*usbp));
12244
12245 usbp->st_dev = sbp->st_dev;
12246 usbp->st_ino = sbp->st_ino;
12247 usbp->st_mode = sbp->st_mode;
12248 usbp->st_nlink = sbp->st_nlink;
12249 usbp->st_uid = sbp->st_uid;
12250 usbp->st_gid = sbp->st_gid;
12251 usbp->st_rdev = sbp->st_rdev;
12252 #ifndef _POSIX_C_SOURCE
12253 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12254 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12255 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12256 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12257 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12258 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12259 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
12260 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
12261 #else
12262 usbp->st_atime = sbp->st_atime;
12263 usbp->st_atimensec = sbp->st_atimensec;
12264 usbp->st_mtime = sbp->st_mtime;
12265 usbp->st_mtimensec = sbp->st_mtimensec;
12266 usbp->st_ctime = sbp->st_ctime;
12267 usbp->st_ctimensec = sbp->st_ctimensec;
12268 usbp->st_birthtime = sbp->st_birthtime;
12269 usbp->st_birthtimensec = sbp->st_birthtimensec;
12270 #endif
12271 usbp->st_size = sbp->st_size;
12272 usbp->st_blocks = sbp->st_blocks;
12273 usbp->st_blksize = sbp->st_blksize;
12274 usbp->st_flags = sbp->st_flags;
12275 usbp->st_gen = sbp->st_gen;
12276 usbp->st_lspare = sbp->st_lspare;
12277 usbp->st_qspare[0] = sbp->st_qspare[0];
12278 usbp->st_qspare[1] = sbp->st_qspare[1];
12279 }
12280
12281 /*
12282 * Purge the buffer cache to simulate a cold start.
12283 */
12284 static int
12285 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
12286 {
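/*
 * Push any cached pages for this vnode back to the backing store
 * (UBC_PUSHALL) and then invalidate them (UBC_INVALIDATE), so later
 * accesses must re-read from disk, as they would on a cold start.
 */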
12287 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
12288
12289 return VNODE_RETURNED;
12290 }
12291
12292 static int
12293 vfs_purge_callback(mount_t mp, __unused void * arg)
12294 {
12295 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
12296
12297 return VFS_RETURNED;
12298 }
12299
12300 int
12301 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
12302 {
12303 if (!kauth_cred_issuser(kauth_cred_get())) {
12304 return EPERM;
12305 }
12306
12307 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
12308
12309 return 0;
12310 }
12311
12312 /*
12313 * Gets the vnode associated with the (unnamed) snapshot directory
12314 * of a filesystem. The snapshot directory vnode is returned with
12315 * an iocount held on it.
12316 */
12317 int
12318 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
12319 {
12320 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
12321 }
12322
12323 /*
12324 * Get the snapshot vnode.
12325 *
12326 * If successful, the call returns with an iocount held on both *rvpp and
12327 * *sdvpp, and the caller must call nameidone() on ndp.
12328 *
12329 * If the snapshot vnode already exists, it is returned in ndp->ni_vp.
12330 *
12331 * If the call returns an error, *rvpp and *sdvpp are NULL and nameidone()
12332 * is not needed.
12333 */
12334 static int
12335 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
12336 user_addr_t name, struct nameidata *ndp, int32_t op,
12337 #if !CONFIG_TRIGGERS
12338 __unused
12339 #endif
12340 enum path_operation pathop,
12341 vfs_context_t ctx)
12342 {
12343 int error, i;
12344 caddr_t name_buf;
12345 size_t name_len;
12346 struct vfs_attr vfa;
12347
12348 *sdvpp = NULLVP;
12349 *rvpp = NULLVP;
12350
12351 error = vnode_getfromfd(ctx, dirfd, rvpp);
12352 if (error) {
12353 return error;
12354 }
12355
12356 if (!vnode_isvroot(*rvpp)) {
12357 error = EINVAL;
12358 goto out;
12359 }
12360
12361 /* Make sure the filesystem supports snapshots */
12362 VFSATTR_INIT(&vfa);
12363 VFSATTR_WANTED(&vfa, f_capabilities);
12364 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
12365 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
12366 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
12367 VOL_CAP_INT_SNAPSHOT)) ||
12368 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
12369 VOL_CAP_INT_SNAPSHOT))) {
12370 error = ENOTSUP;
12371 goto out;
12372 }
12373
12374 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
12375 if (error) {
12376 goto out;
12377 }
12378
12379 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12380 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12381 if (error) {
12382 goto out1;
12383 }
12384
12385 /*
12386 * Some sanity checks: the name can't be empty, ".", or "..", and can't contain slashes.
12387 * (The length returned by copyinstr includes the terminating NUL.)
12388 */
12389 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
12390 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
12391 error = EINVAL;
12392 goto out1;
12393 }
12394 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
12395 ;
12396 }
12397 if (i < (int)name_len) {
12398 error = EINVAL;
12399 goto out1;
12400 }
12401
12402 #if CONFIG_MACF
12403 if (op == CREATE) {
12404 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
12405 name_buf);
12406 } else if (op == DELETE) {
12407 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
12408 name_buf);
12409 }
12410 if (error) {
12411 goto out1;
12412 }
12413 #endif
12414
12415 /* Check if the snapshot already exists ... */
12416 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
12417 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
12418 ndp->ni_dvp = *sdvpp;
12419
12420 error = namei(ndp);
12421 out1:
12422 FREE(name_buf, M_TEMP);
12423 out:
12424 if (error) {
12425 if (*sdvpp) {
12426 vnode_put(*sdvpp);
12427 *sdvpp = NULLVP;
12428 }
12429 if (*rvpp) {
12430 vnode_put(*rvpp);
12431 *rvpp = NULLVP;
12432 }
12433 }
12434 return error;
12435 }
12436
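/*
 * Typical caller pattern for vnode_get_snapshot() (illustrative sketch only;
 * the real callers are the snapshot_* helpers below). On success the caller
 * owns iocounts on rvp and snapdvp, plus one on nd.ni_vp if the lookup found
 * the snapshot, and must call nameidone():
 *
 *	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &nd, LOOKUP,
 *	    OP_LOOKUP, ctx);
 *	if (error == 0) {
 *		if (nd.ni_vp) {
 *			// ... operate on the snapshot vnode ...
 *			vnode_put(nd.ni_vp);
 *		}
 *		nameidone(&nd);
 *		vnode_put(snapdvp);
 *		vnode_put(rvp);
 *	}
 */
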
12437 /*
12438 * Create a filesystem snapshot (for filesystems that support snapshots).
12439 *
12440 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL):
12441 * we get to the (unnamed) snapshot directory vnode and create the vnode
12442 * for the snapshot in it.
12443 *
12444 * Restrictions:
12445 *
12446 * a) The snapshot name passed in cannot contain slashes.
12447 * b) The name can't be "." or "..".
12448 *
12449 * Since this requires superuser privileges, vnode_authorize calls are not
12450 * made.
12451 */
12452 static int
12453 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
12454 vfs_context_t ctx)
12455 {
12456 vnode_t rvp, snapdvp;
12457 int error;
12458 struct nameidata namend;
12459
12460 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
12461 OP_LINK, ctx);
12462 if (error) {
12463 return error;
12464 }
12465
12466 if (namend.ni_vp) {
12467 vnode_put(namend.ni_vp);
12468 error = EEXIST;
12469 } else {
12470 struct vnode_attr va;
12471 vnode_t vp = NULLVP;
12472
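/*
 * The snapshot object is created as a plain file with mode 0;
 * VN_CREATE_NOAUTH skips vnode_authorize() since the caller's privilege
 * was already checked in fs_snapshot(), and VN_CREATE_NOINHERIT skips
 * inheriting security attributes from the parent directory.
 */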
12473 VATTR_INIT(&va);
12474 VATTR_SET(&va, va_type, VREG);
12475 VATTR_SET(&va, va_mode, 0);
12476
12477 error = vn_create(snapdvp, &vp, &namend, &va,
12478 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
12479 if (!error && vp) {
12480 vnode_put(vp);
12481 }
12482 }
12483
12484 nameidone(&namend);
12485 vnode_put(snapdvp);
12486 vnode_put(rvp);
12487 return error;
12488 }
12489
12490 /*
12491 * Delete a filesystem snapshot.
12492 *
12493 * Get the vnodes for the unnamed snapshot directory and the snapshot, then
12494 * delete the snapshot.
12495 */
12496 static int
12497 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
12498 vfs_context_t ctx)
12499 {
12500 vnode_t rvp, snapdvp;
12501 int error;
12502 struct nameidata namend;
12503
12504 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
12505 OP_UNLINK, ctx);
12506 if (error) {
12507 goto out;
12508 }
12509
12510 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
12511 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
12512
12513 vnode_put(namend.ni_vp);
12514 nameidone(&namend);
12515 vnode_put(snapdvp);
12516 vnode_put(rvp);
12517 out:
12518 return error;
12519 }
12520
12521 /*
12522 * Revert a filesystem to a snapshot
12523 *
12524 * Marks the filesystem to revert to the given snapshot on next mount.
12525 */
12526 static int
12527 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
12528 vfs_context_t ctx)
12529 {
12530 int error;
12531 vnode_t rvp;
12532 mount_t mp;
12533 struct fs_snapshot_revert_args revert_data;
12534 struct componentname cnp;
12535 caddr_t name_buf;
12536 size_t name_len;
12537
12538 error = vnode_getfromfd(ctx, dirfd, &rvp);
12539 if (error) {
12540 return error;
12541 }
12542 mp = vnode_mount(rvp);
12543
12544 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12545 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12546 if (error) {
12547 FREE(name_buf, M_TEMP);
12548 vnode_put(rvp);
12549 return error;
12550 }
12551
12552 #if CONFIG_MACF
12553 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
12554 if (error) {
12555 FREE(name_buf, M_TEMP);
12556 vnode_put(rvp);
12557 return error;
12558 }
12559 #endif
12560
12561 /*
12562 * Grab mount_iterref so that we can release the vnode,
12563 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
12564 */
12565 error = mount_iterref(mp, 0);
12566 vnode_put(rvp);
12567 if (error) {
12568 FREE(name_buf, M_TEMP);
12569 return error;
12570 }
12571
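/*
 * Hand the snapshot name to the filesystem as a componentname. HASBUF
 * marks cn_pnbuf as a valid, heap-allocated pathname buffer and ISLASTCN
 * marks this as the final (and only) component of the lookup.
 */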
12572 memset(&cnp, 0, sizeof(cnp));
12573 cnp.cn_pnbuf = (char *)name_buf;
12574 cnp.cn_nameiop = LOOKUP;
12575 cnp.cn_flags = ISLASTCN | HASBUF;
12576 cnp.cn_pnlen = MAXPATHLEN;
12577 cnp.cn_nameptr = cnp.cn_pnbuf;
12578 cnp.cn_namelen = (int)name_len;
12579 revert_data.sr_cnp = &cnp;
12580
12581 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
12582 mount_iterdrop(mp);
12583 FREE(name_buf, M_TEMP);
12584
12585 if (error) {
12586 /* If there was any error, try again using VNOP_IOCTL */
12587
12588 vnode_t snapdvp;
12589 struct nameidata namend;
12590
12591 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
12592 OP_LOOKUP, ctx);
12593 if (error) {
12594 return error;
12595 }
12596
12598 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
12599 0, ctx);
12600
12601 vnode_put(namend.ni_vp);
12602 nameidone(&namend);
12603 vnode_put(snapdvp);
12604 vnode_put(rvp);
12605 }
12606
12607 return error;
12608 }
12609
12610 /*
12611 * Rename a filesystem snapshot.
12612 *
12613 * Get the vnodes for the unnamed snapshot directory and the snapshot, then
12614 * rename the snapshot. This is a very specialized (and simple) case of
12615 * rename(2), which has to deal with far more complications. It differs
12616 * from rename(2) in that EEXIST is returned if the new name already exists.
12617 */
12618 static int
12619 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
12620 __unused uint32_t flags, vfs_context_t ctx)
12621 {
12622 vnode_t rvp, snapdvp;
12623 int error, i;
12624 caddr_t newname_buf;
12625 size_t name_len;
12626 vnode_t fvp;
12627 struct nameidata *fromnd, *tond;
12628 /* Carve out a heap chunk for structs that are too big to be on the stack. */
12629 struct {
12630 struct nameidata from_node;
12631 struct nameidata to_node;
12632 } * __rename_data;
12633
12634 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
12635 fromnd = &__rename_data->from_node;
12636 tond = &__rename_data->to_node;
12637
12638 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
12639 OP_UNLINK, ctx);
12640 if (error) {
12641 goto out;
12642 }
12643 fvp = fromnd->ni_vp;
12644
12645 MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12646 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
12647 if (error) {
12648 goto out1;
12649 }
12650
12651 /*
12652 * Some sanity checks: the new name can't be empty, ".", or "..", and
12653 * can't contain slashes.
12654 * (The length returned by copyinstr includes the terminating NUL.)
12655 *
12656 * The FS rename VNOP is supposed to handle this, but we reject it
12657 * here as well.
12658 */
12659 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
12660 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
12661 error = EINVAL;
12662 goto out1;
12663 }
12664 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
12665 ;
12666 }
12667 if (i < (int)name_len) {
12668 error = EINVAL;
12669 goto out1;
12670 }
12671
12672 #if CONFIG_MACF
12673 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
12674 newname_buf);
12675 if (error) {
12676 goto out1;
12677 }
12678 #endif
12679
12680 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
12681 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
12682 tond->ni_dvp = snapdvp;
12683
12684 error = namei(tond);
12685 if (error) {
12686 goto out2;
12687 } else if (tond->ni_vp) {
12688 /*
12689 * snapshot rename behaves differently than rename(2) - if the
12690 * new name exists, EEXIST is returned.
12691 */
12692 vnode_put(tond->ni_vp);
12693 error = EEXIST;
12694 goto out2;
12695 }
12696
12697 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
12698 &tond->ni_cnd, ctx);
12699
12700 out2:
12701 nameidone(tond);
12702 out1:
12703 FREE(newname_buf, M_TEMP);
12704 vnode_put(fvp);
12705 vnode_put(snapdvp);
12706 vnode_put(rvp);
12707 nameidone(fromnd);
12708 out:
12709 FREE(__rename_data, M_TEMP);
12710 return error;
12711 }
12712
12713 /*
12714 * Mount a filesystem snapshot.
12715 *
12716 * Get the vnodes for the unnamed snapshot directory and the snapshot, then
12717 * mount the snapshot.
12718 */
12719 static int
12720 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
12721 __unused user_addr_t mnt_data, uint32_t flags, vfs_context_t ctx)
12722 {
12723 vnode_t rvp, snapdvp, snapvp, vp, pvp;
12724 int error;
12725 struct nameidata *snapndp, *dirndp;
12726 /* Carve out a heap chunk for structs that are too big to be on the stack. */
12727 struct {
12728 struct nameidata snapnd;
12729 struct nameidata dirnd;
12730 } * __snapshot_mount_data;
12731
12732 MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
12733 M_TEMP, M_WAITOK);
12734 snapndp = &__snapshot_mount_data->snapnd;
12735 dirndp = &__snapshot_mount_data->dirnd;
12736
12737 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
12738 OP_LOOKUP, ctx);
12739 if (error) {
12740 goto out;
12741 }
12742
12743 snapvp = snapndp->ni_vp;
12744 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
12745 error = EIO;
12746 goto out1;
12747 }
12748
12749 /* Get the vnode to be covered */
12750 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
12751 UIO_USERSPACE, directory, ctx);
12752 error = namei(dirndp);
12753 if (error) {
12754 goto out1;
12755 }
12756
12757 vp = dirndp->ni_vp;
12758 pvp = dirndp->ni_dvp;
12759
12760 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
12761 error = EINVAL;
12762 } else {
12763 mount_t mp = vnode_mount(rvp);
12764 struct fs_snapshot_mount_args smnt_data;
12765
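/*
 * Pass the snapshot's originating mount and componentname down through
 * mount_common(); KERNEL_MOUNT_SNAPSHOT tells the filesystem to mount
 * the named snapshot over the covered directory rather than the live
 * volume, and MNT_DONTBROWSE is honored if the caller requested it.
 */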
12766 smnt_data.sm_mp = mp;
12767 smnt_data.sm_cnp = &snapndp->ni_cnd;
12768 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
12769 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
12770 KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
12771 }
12772
12773 vnode_put(vp);
12774 vnode_put(pvp);
12775 nameidone(dirndp);
12776 out1:
12777 vnode_put(snapvp);
12778 vnode_put(snapdvp);
12779 vnode_put(rvp);
12780 nameidone(snapndp);
12781 out:
12782 FREE(__snapshot_mount_data, M_TEMP);
12783 return error;
12784 }
12785
12786 /*
12787 * Root from a snapshot of the filesystem
12788 *
12789 * Marks the filesystem to root from the given snapshot on next boot.
12790 */
12791 static int
12792 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
12793 vfs_context_t ctx)
12794 {
12795 int error;
12796 vnode_t rvp;
12797 mount_t mp;
12798 struct fs_snapshot_root_args root_data;
12799 struct componentname cnp;
12800 caddr_t name_buf;
12801 size_t name_len;
12802
12803 error = vnode_getfromfd(ctx, dirfd, &rvp);
12804 if (error) {
12805 return error;
12806 }
12807 mp = vnode_mount(rvp);
12808
12809 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12810 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12811 if (error) {
12812 FREE(name_buf, M_TEMP);
12813 vnode_put(rvp);
12814 return error;
12815 }
12816
12817 // XXX MAC checks ?
12818
12819 /*
12820 * Grab mount_iterref so that we can release the vnode,
12821 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
12822 */
12823 error = mount_iterref(mp, 0);
12824 vnode_put(rvp);
12825 if (error) {
12826 FREE(name_buf, M_TEMP);
12827 return error;
12828 }
12829
12830 memset(&cnp, 0, sizeof(cnp));
12831 cnp.cn_pnbuf = (char *)name_buf;
12832 cnp.cn_nameiop = LOOKUP;
12833 cnp.cn_flags = ISLASTCN | HASBUF;
12834 cnp.cn_pnlen = MAXPATHLEN;
12835 cnp.cn_nameptr = cnp.cn_pnbuf;
12836 cnp.cn_namelen = (int)name_len;
12837 root_data.sr_cnp = &cnp;
12838
12839 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
12840
12841 mount_iterdrop(mp);
12842 FREE(name_buf, M_TEMP);
12843
12844 return error;
12845 }
12846
12847 /*
12848 * FS snapshot operations dispatcher
12849 */
12850 int
12851 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
12852 __unused int32_t *retval)
12853 {
12854 int error;
12855 vfs_context_t ctx = vfs_context_current();
12856
12857 AUDIT_ARG(fd, uap->dirfd);
12858 AUDIT_ARG(value32, uap->op);
12859
12860 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
12861 if (error) {
12862 return error;
12863 }
12864
12865 /*
12866 * Enforce user authorization for snapshot modification operations: a non-root caller must have write access to the filesystem's backing device vnode.
12867 */
12868 if ((uap->op != SNAPSHOT_OP_MOUNT) &&
12869 (uap->op != SNAPSHOT_OP_ROOT)) {
12870 vnode_t dvp = NULLVP;
12871 vnode_t devvp = NULLVP;
12872 mount_t mp;
12873
12874 error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
12875 if (error) {
12876 return error;
12877 }
12878 mp = vnode_mount(dvp);
12879 devvp = mp->mnt_devvp;
12880
12881 /* get an iocount on devvp */
12882 if (devvp == NULLVP) {
12883 error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
12884 /* for mounts which aren't backed by block devices */
12885 if (error == ENOENT) {
12886 error = ENXIO;
12887 }
12888 } else {
12889 error = vnode_getwithref(devvp);
12890 }
12891
12892 if (error) {
12893 vnode_put(dvp);
12894 return error;
12895 }
12896
12897 if ((vfs_context_issuser(ctx) == 0) &&
12898 (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
12899 error = EPERM;
12900 }
12901 vnode_put(dvp);
12902 vnode_put(devvp);
12903
12904 if (error) {
12905 return error;
12906 }
12907 }
12908
12909 switch (uap->op) {
12910 case SNAPSHOT_OP_CREATE:
12911 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
12912 break;
12913 case SNAPSHOT_OP_DELETE:
12914 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
12915 break;
12916 case SNAPSHOT_OP_RENAME:
12917 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
12918 uap->flags, ctx);
12919 break;
12920 case SNAPSHOT_OP_MOUNT:
12921 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
12922 uap->data, uap->flags, ctx);
12923 break;
12924 case SNAPSHOT_OP_REVERT:
12925 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
12926 break;
12927 #if CONFIG_MNT_ROOTSNAP
12928 case SNAPSHOT_OP_ROOT:
12929 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
12930 break;
12931 #endif /* CONFIG_MNT_ROOTSNAP */
12932 default:
12933 error = ENOSYS;
12934 }
12935
12936 return error;
12937 }
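
/*
 * User-space view (illustrative sketch, not part of this file): recent macOS
 * SDKs expose these operations through wrappers declared in <sys/snapshot.h>
 * (fs_snapshot_create(), fs_snapshot_delete(), fs_snapshot_rename(), ...),
 * which funnel into this dispatcher. The wrapper names, and the requirement
 * for a snapshot entitlement alongside the privilege checks above, are
 * assumptions about userland, not something this file guarantees.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/snapshot.h>
 *
 *	int dirfd = open("/Volumes/Data", O_RDONLY);    // root of the volume
 *	if (dirfd >= 0) {
 *		if (fs_snapshot_create(dirfd, "my-backup", 0) != 0)
 *			perror("fs_snapshot_create");
 *		close(dirfd);
 *	}
 */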