apple/xnu.git: bsd/vfs/vfs_syscalls.c (blob 838ad8c12170b4c506e444675b85292670a831a5)
1 /*
2 * Copyright (c) 1995-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <pexpert/pexpert.h>
127 #include <IOKit/IOBSD.h>
128
129 // deps for MIG call
130 #include <kern/host.h>
131 #include <kern/ipc_misc.h>
132 #include <mach/host_priv.h>
133 #include <mach/vfs_nspace.h>
134 #include <os/log.h>
135
136 #if ROUTEFS
137 #include <miscfs/routefs/routefs.h>
138 #endif /* ROUTEFS */
139
140 #if CONFIG_MACF
141 #include <security/mac.h>
142 #include <security/mac_framework.h>
143 #endif
144
145 #if CONFIG_FSE
146 #define GET_PATH(x) \
147 (x) = get_pathbuff();
148 #define RELEASE_PATH(x) \
149 release_pathbuff(x);
150 #else
151 #define GET_PATH(x) \
152 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
153 #define RELEASE_PATH(x) \
154 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
155 #endif /* CONFIG_FSE */
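/*
 * A minimal sketch of how these macros are meant to be paired (schematic,
 * assuming a caller inside this file):
 *
 *	char *target_path = NULL;
 *
 *	GET_PATH(target_path);
 *	... use target_path as a MAXPATHLEN-sized scratch buffer ...
 *	RELEASE_PATH(target_path);
 */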
156
157 #ifndef HFS_GET_BOOT_INFO
158 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
159 #endif
160
161 #ifndef HFS_SET_BOOT_INFO
162 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
163 #endif
164
165 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
166 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
167 #endif
168
169 extern void disk_conditioner_unmount(mount_t mp);
170
171 /* struct for checkdirs iteration */
172 struct cdirargs {
173 vnode_t olddp;
174 vnode_t newdp;
175 };
176 /* callback for checkdirs iteration */
177 static int checkdirs_callback(proc_t p, void * arg);
178
179 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
180 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
181 void enablequotas(struct mount *mp, vfs_context_t ctx);
182 static int getfsstat_callback(mount_t mp, void * arg);
183 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
184 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
185 static int sync_callback(mount_t, void *);
186 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
187 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
188 boolean_t partial_copy);
189 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
190 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
191 struct componentname *cnp, user_addr_t fsmountargs,
192 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
193 vfs_context_t ctx);
194 void vfs_notify_mount(vnode_t pdvp);
195
196 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
197
198 struct fd_vn_data * fg_vn_data_alloc(void);
199
200 /*
201 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}.
202 * Concurrent lookups (or lookups by id) on hard links can cause vn_getpath
203 * (which, unlike vn_getpath_fsenter, does not re-enter the filesystem) to
204 * return ENOENT, since the path cannot be reconstructed from the name cache
205 * alone. We have no option but to retry and hope to complete one
206 * namei->reverse-path generation without an intervening lookup or lookup-by-id
207 * on the hard-linked item. This is only an issue for MAC hooks which cannot
208 * re-enter the filesystem, currently the MAC hooks for rename, unlink and rmdir.
209 */
210 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
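/*
 * A rough sketch of the retry pattern described above (schematic; the real
 * callers live further down in this file, and vn_authorize_unlink() is used
 * here only as a representative non-re-entering MAC authorization hook):
 *
 *	int retry_count = 0;
 * retry:
 *	error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
 *	if (error == ENOENT && retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
 *		retry_count++;
 *		goto retry;
 *	}
 */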
211
212 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
213 int unlink_flags);
214
215 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
216
217 #ifdef CONFIG_IMGSRC_ACCESS
218 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
219 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
220 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
221 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
222 static void mount_end_update(mount_t mp);
223 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
224 #endif /* CONFIG_IMGSRC_ACCESS */
225
226 #if CONFIG_LOCKERBOOT
227 int mount_locker_protoboot(const char *fsname, const char *mntpoint,
228 const char *pbdevpath);
229 #endif
230
231 //snapshot functions
232 #if CONFIG_MNT_ROOTSNAP
233 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
234 #else
235 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
236 #endif
237
238 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
239
240 __private_extern__
241 int sync_internal(void);
242
243 __private_extern__
244 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
245
246 extern lck_grp_t *fd_vn_lck_grp;
247 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
248 extern lck_attr_t *fd_vn_lck_attr;
249
250 /*
251 * Incremented each time a mount or unmount operation occurs;
252 * used to invalidate the cached value of the rootvp in the
253 * mount structure utilized by cache_lookup_path.
254 */
255 uint32_t mount_generation = 0;
256
257 /* counts number of mount and unmount operations */
258 unsigned int vfs_nummntops = 0;
259
260 extern const struct fileops vnops;
261 #if CONFIG_APPLEDOUBLE
262 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
263 #endif /* CONFIG_APPLEDOUBLE */
264
265 /*
266 * Virtual File System System Calls
267 */
268
269 #if NFSCLIENT || DEVFS || ROUTEFS
270 /*
271 * Private in-kernel mounting SPI (kernel-internal callers only, not exported)
272 */
273 __private_extern__
274 boolean_t
275 vfs_iskernelmount(mount_t mp)
276 {
277 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
278 }
279
280 __private_extern__
281 int
282 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
283 void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
284 {
285 struct nameidata nd;
286 boolean_t did_namei;
287 int error;
288
289 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
290 UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
291
292 /*
293 * Get the vnode to be covered if it's not supplied
294 */
295 if (vp == NULLVP) {
296 error = namei(&nd);
297 if (error) {
298 if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_DATAVOL)) {
299 printf("failed to locate mount-on path: %s ", path);
300 }
301 return error;
302 }
303 vp = nd.ni_vp;
304 pvp = nd.ni_dvp;
305 did_namei = TRUE;
306 } else {
307 char *pnbuf = CAST_DOWN(char *, path);
308
309 nd.ni_cnd.cn_pnbuf = pnbuf;
310 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
311 did_namei = FALSE;
312 }
313
314 error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
315 syscall_flags, kern_flags, NULL, TRUE, ctx);
316
317 if (did_namei) {
318 vnode_put(vp);
319 vnode_put(pvp);
320 nameidone(&nd);
321 }
322
323 return error;
324 }
325 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
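/*
 * A minimal sketch of an in-kernel caller of this SPI (schematic; it assumes
 * a filesystem that accepts NULL mount arguments, whereas real callers pass
 * their own filesystem-specific data):
 *
 *	error = kernel_mount("devfs", NULLVP, NULLVP, "/dev", NULL, 0,
 *	    MNT_DONTBROWSE, KERNEL_MOUNT_NOAUTH, vfs_context_kernel());
 */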
326
327 /*
328 * Mount a file system.
329 */
330 /* ARGSUSED */
331 int
332 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
333 {
334 struct __mac_mount_args muap;
335
336 muap.type = uap->type;
337 muap.path = uap->path;
338 muap.flags = uap->flags;
339 muap.data = uap->data;
340 muap.mac_p = USER_ADDR_NULL;
341 return __mac_mount(p, &muap, retval);
342 }
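/*
 * From user space this is reached via mount(2); a minimal sketch, assuming a
 * filesystem type that accepts NULL mount data:
 *
 *	#include <sys/mount.h>
 *
 *	if (mount("devfs", "/dev", MNT_DONTBROWSE, NULL) == -1)
 *		perror("mount");
 */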
343
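/*
 * fmount() is the descriptor-based variant of mount(2): instead of resolving
 * a path, it takes an iocount on the vnode behind the file descriptor,
 * obtains that vnode's parent and path (to build the component name), and
 * then funnels into mount_common() just as mount()/__mac_mount() do.
 */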
344 int
345 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
346 {
347 struct componentname cn;
348 vfs_context_t ctx = vfs_context_current();
349 size_t dummy = 0;
350 int error;
351 int flags = uap->flags;
352 char fstypename[MFSNAMELEN];
353 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
354 vnode_t pvp;
355 vnode_t vp;
356
357 AUDIT_ARG(fd, uap->fd);
358 AUDIT_ARG(fflags, flags);
359 /* fstypename will get audited by mount_common */
360
361 /* Sanity check the flags */
362 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
363 return ENOTSUP;
364 }
365
366 if (flags & MNT_UNION) {
367 return EPERM;
368 }
369
370 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
371 if (error) {
372 return error;
373 }
374
375 if ((error = file_vnode(uap->fd, &vp)) != 0) {
376 return error;
377 }
378
379 if ((error = vnode_getwithref(vp)) != 0) {
380 file_drop(uap->fd);
381 return error;
382 }
383
384 pvp = vnode_getparent(vp);
385 if (pvp == NULL) {
386 vnode_put(vp);
387 file_drop(uap->fd);
388 return EINVAL;
389 }
390
391 memset(&cn, 0, sizeof(struct componentname));
392 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
393 cn.cn_pnlen = MAXPATHLEN;
394
395 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
396 FREE(cn.cn_pnbuf, M_TEMP);
397 vnode_put(pvp);
398 vnode_put(vp);
399 file_drop(uap->fd);
400 return error;
401 }
402
403 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
404
405 FREE(cn.cn_pnbuf, M_TEMP);
406 vnode_put(pvp);
407 vnode_put(vp);
408 file_drop(uap->fd);
409
410 return error;
411 }
412
413 void
414 vfs_notify_mount(vnode_t pdvp)
415 {
416 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
417 lock_vnode_and_post(pdvp, NOTE_WRITE);
418 }
419
420 /*
421 * __mac_mount:
422 * Mount a file system taking into account MAC label behavior.
423 * See mount(2) man page for more information
424 *
425 * Parameters: p Process requesting the mount
426 * uap User argument descriptor (see below)
427 * retval (ignored)
428 *
429 * Indirect: uap->type Filesystem type
430 * uap->path Path to mount
431 * uap->data Mount arguments
432 * uap->mac_p MAC info
433 * uap->flags Mount flags
434 *
435 *
436 * Returns: 0 Success
437 * !0 Not success
438 */
439 boolean_t root_fs_upgrade_try = FALSE;
440
441 int
442 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
443 {
444 vnode_t pvp = NULL;
445 vnode_t vp = NULL;
446 int need_nameidone = 0;
447 vfs_context_t ctx = vfs_context_current();
448 char fstypename[MFSNAMELEN];
449 struct nameidata nd;
450 size_t dummy = 0;
451 char *labelstr = NULL;
452 int flags = uap->flags;
453 int error;
454 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
455 boolean_t is_64bit = IS_64BIT_PROCESS(p);
456 #else
457 #pragma unused(p)
458 #endif
459 /*
460 * Get the fs type name from user space
461 */
462 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
463 if (error) {
464 return error;
465 }
466
467 /*
468 * Get the vnode to be covered
469 */
470 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
471 UIO_USERSPACE, uap->path, ctx);
472 error = namei(&nd);
473 if (error) {
474 goto out;
475 }
476 need_nameidone = 1;
477 vp = nd.ni_vp;
478 pvp = nd.ni_dvp;
479
480 #ifdef CONFIG_IMGSRC_ACCESS
481 /* Mounting image source cannot be batched with other operations */
482 if (flags == MNT_IMGSRC_BY_INDEX) {
483 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
484 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
485 goto out;
486 }
487 #endif /* CONFIG_IMGSRC_ACCESS */
488
489 #if CONFIG_MACF
490 /*
491 * Get the label string (if any) from user space
492 */
493 if (uap->mac_p != USER_ADDR_NULL) {
494 struct user_mac mac;
495 size_t ulen = 0;
496
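/*
 * The label argument is a (buffer length, string pointer) pair whose
 * layout differs between 32-bit and 64-bit callers, so copy in the
 * matching userNN_mac shape and normalize it into 'mac' below.
 */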
497 if (is_64bit) {
498 struct user64_mac mac64;
499 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
500 mac.m_buflen = mac64.m_buflen;
501 mac.m_string = mac64.m_string;
502 } else {
503 struct user32_mac mac32;
504 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
505 mac.m_buflen = mac32.m_buflen;
506 mac.m_string = mac32.m_string;
507 }
508 if (error) {
509 goto out;
510 }
511 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
512 (mac.m_buflen < 2)) {
513 error = EINVAL;
514 goto out;
515 }
516 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
517 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
518 if (error) {
519 goto out;
520 }
521 AUDIT_ARG(mac_string, labelstr);
522 }
523 #endif /* CONFIG_MACF */
524
525 AUDIT_ARG(fflags, flags);
526
527 #if SECURE_KERNEL
528 if (flags & MNT_UNION) {
529 /* No union mounts on release kernels */
530 error = EPERM;
531 goto out;
532 }
533 #endif
534
535 if ((vp->v_flag & VROOT) &&
536 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
537 if (!(flags & MNT_UNION)) {
538 flags |= MNT_UPDATE;
539 } else {
540 /*
541 * For a union mount on '/', treat it as a fresh
542 * mount instead of an update.
543 * Otherwise, a union mount on '/' used to panic the
544 * system, since mnt_vnodecovered was found to be
545 * NULL for '/', which unionlookup requires after it
546 * gets ENOENT on the union mount.
547 */
548 flags = (flags & ~(MNT_UPDATE));
549 }
550
551 #if SECURE_KERNEL
552 if ((flags & MNT_RDONLY) == 0) {
553 /* Release kernels are not allowed to mount "/" as rw */
554 error = EPERM;
555 goto out;
556 }
557 #endif
558 /*
559 * See 7392553 for more details on why this check exists.
560 * Suffice it to say: if this check is ON and something tries
561 * to mount the root FS read/write, we'll turn off the codesign
562 * bitmap optimization.
563 */
564 #if CHECK_CS_VALIDATION_BITMAP
565 if ((flags & MNT_RDONLY) == 0) {
566 root_fs_upgrade_try = TRUE;
567 }
568 #endif
569 }
570
571 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
572 labelstr, FALSE, ctx);
573
574 out:
575
576 #if CONFIG_MACF
577 if (labelstr) {
578 FREE(labelstr, M_MACTEMP);
579 }
580 #endif /* CONFIG_MACF */
581
582 if (vp) {
583 vnode_put(vp);
584 }
585 if (pvp) {
586 vnode_put(pvp);
587 }
588 if (need_nameidone) {
589 nameidone(&nd);
590 }
591
592 return error;
593 }
594
595 /*
596 * common mount implementation (final stage of mounting)
597 *
598 * Arguments:
599 * fstypename file system type (i.e., its VFS name)
600 * pvp parent of covered vnode
601 * vp covered vnode
602 * cnp component name (i.e., path) of covered vnode
603 * flags generic mount flags
604 * fsmountargs file system specific data
605 * labelstr optional MAC label
606 * kernelmount TRUE for mounts initiated from inside the kernel
607 * ctx caller's context
608 */
609 static int
610 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
611 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
612 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
613 {
614 #if !CONFIG_MACF
615 #pragma unused(labelstr)
616 #endif
617 struct vnode *devvp = NULLVP;
618 struct vnode *device_vnode = NULLVP;
619 #if CONFIG_MACF
620 struct vnode *rvp;
621 #endif
622 struct mount *mp;
623 struct vfstable *vfsp = (struct vfstable *)0;
624 struct proc *p = vfs_context_proc(ctx);
625 int error, flag = 0;
626 user_addr_t devpath = USER_ADDR_NULL;
627 int ronly = 0;
628 int mntalloc = 0;
629 boolean_t vfsp_ref = FALSE;
630 boolean_t is_rwlock_locked = FALSE;
631 boolean_t did_rele = FALSE;
632 boolean_t have_usecount = FALSE;
633
634 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM
635 /* Check for mutually-exclusive flag bits */
636 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL));
637 int bitcount = 0;
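/* Kernighan's trick: 'checkflags &= (checkflags - 1)' clears the lowest set bit, so the loop below counts the bits set in checkflags. */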
638 while (checkflags != 0) {
639 checkflags &= (checkflags - 1);
640 bitcount++;
641 }
642
643 if (bitcount > 1) {
644 //not allowed to request multiple mount-by-role flags
645 error = EINVAL;
646 goto out1;
647 }
648 #endif
649
650 /*
651 * Process an update for an existing mount
652 */
653 if (flags & MNT_UPDATE) {
654 if ((vp->v_flag & VROOT) == 0) {
655 error = EINVAL;
656 goto out1;
657 }
658 mp = vp->v_mount;
659
660 /* if an unmount is in progress, return an error */
661 mount_lock_spin(mp);
662 if (mp->mnt_lflag & MNT_LUNMOUNT) {
663 mount_unlock(mp);
664 error = EBUSY;
665 goto out1;
666 }
667 mount_unlock(mp);
668 lck_rw_lock_exclusive(&mp->mnt_rwlock);
669 is_rwlock_locked = TRUE;
670 /*
671 * We only allow the filesystem to be reloaded if it
672 * is currently mounted read-only.
673 */
674 if ((flags & MNT_RELOAD) &&
675 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
676 error = ENOTSUP;
677 goto out1;
678 }
679
680 /*
681 * If content protection is enabled, update mounts are not
682 * allowed to turn it off.
683 */
684 if ((mp->mnt_flag & MNT_CPROTECT) &&
685 ((flags & MNT_CPROTECT) == 0)) {
686 error = EINVAL;
687 goto out1;
688 }
689
690 /*
691 * MNT_REMOVABLE can't be turned off either, but returning an error
692 * for that would be an unexpected failure, so we just silently
693 * add it back if it is not passed in.
694 */
695 if ((mp->mnt_flag & MNT_REMOVABLE) &&
696 ((flags & MNT_REMOVABLE) == 0)) {
697 flags |= MNT_REMOVABLE;
698 }
699
700 #ifdef CONFIG_IMGSRC_ACCESS
701 /* Can't downgrade the backer of the root FS */
702 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
703 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
704 error = ENOTSUP;
705 goto out1;
706 }
707 #endif /* CONFIG_IMGSRC_ACCESS */
708
709 /*
710 * Only root, or the user that did the original mount is
711 * permitted to update it.
712 */
713 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
714 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
715 goto out1;
716 }
717 #if CONFIG_MACF
718 error = mac_mount_check_remount(ctx, mp);
719 if (error != 0) {
720 goto out1;
721 }
722 #endif
723 /*
724 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
725 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
726 */
727 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
728 flags |= MNT_NOSUID | MNT_NODEV;
729 if (mp->mnt_flag & MNT_NOEXEC) {
730 flags |= MNT_NOEXEC;
731 }
732 }
733 flag = mp->mnt_flag;
734
735
736
737 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
738
739 vfsp = mp->mnt_vtable;
740 goto update;
741 } // MNT_UPDATE
742
743 /*
744 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
745 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
746 */
747 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
748 flags |= MNT_NOSUID | MNT_NODEV;
749 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
750 flags |= MNT_NOEXEC;
751 }
752 }
753
754 /* XXXAUDIT: Should we capture the type on the error path as well? */
755 AUDIT_ARG(text, fstypename);
756 mount_list_lock();
757 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
758 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
759 vfsp->vfc_refcount++;
760 vfsp_ref = TRUE;
761 break;
762 }
763 }
764 mount_list_unlock();
765 if (vfsp == NULL) {
766 error = ENODEV;
767 goto out1;
768 }
769
770 /*
771 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
772 * except in ROSV configs.
773 */
774 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
775 ((internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)) == 0)) {
776 error = EINVAL; /* unsupported request */
777 goto out1;
778 }
779
780 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
781 if (error != 0) {
782 goto out1;
783 }
784
785 /*
786 * Allocate and initialize the filesystem (mount_t)
787 */
788 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
789 M_MOUNT, M_WAITOK);
790 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
791 mntalloc = 1;
792
793 /* Initialize the default IO constraints */
794 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
795 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
796 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
797 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
798 mp->mnt_devblocksize = DEV_BSIZE;
799 mp->mnt_alignmentmask = PAGE_MASK;
800 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
801 mp->mnt_ioscale = 1;
802 mp->mnt_ioflags = 0;
803 mp->mnt_realrootvp = NULLVP;
804 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
805
806 TAILQ_INIT(&mp->mnt_vnodelist);
807 TAILQ_INIT(&mp->mnt_workerqueue);
808 TAILQ_INIT(&mp->mnt_newvnodes);
809 mount_lock_init(mp);
810 lck_rw_lock_exclusive(&mp->mnt_rwlock);
811 is_rwlock_locked = TRUE;
812 mp->mnt_op = vfsp->vfc_vfsops;
813 mp->mnt_vtable = vfsp;
814 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
815 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
816 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
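/* Record the mount-on path; if it cannot be obtained from the covered vnode, fall back to the caller-supplied pathname buffer. */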
817 do {
818 int pathlen = MAXPATHLEN;
819
820 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
821 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
822 }
823 } while (0);
824 mp->mnt_vnodecovered = vp;
825 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
826 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
827 mp->mnt_devbsdunit = 0;
828
829 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
830 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
831
832 #if NFSCLIENT || DEVFS || ROUTEFS
833 if (kernelmount) {
834 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
835 }
836 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
837 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
838 }
839 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
840
841 update:
842
843 /*
844 * Set the mount level flags.
845 */
846 if (flags & MNT_RDONLY) {
847 mp->mnt_flag |= MNT_RDONLY;
848 } else if (mp->mnt_flag & MNT_RDONLY) {
849 // disallow read/write upgrades of file systems that
850 // had the TYPENAME_OVERRIDE feature set.
851 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
852 error = EPERM;
853 goto out1;
854 }
855 mp->mnt_kern_flag |= MNTK_WANTRDWR;
856 }
857 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
858 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
859 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
860 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
861 MNT_QUARANTINE | MNT_CPROTECT);
862
863 #if SECURE_KERNEL
864 #if !CONFIG_MNT_SUID
865 /*
866 * On release builds of iOS-based platforms, always enforce NOSUID on
867 * all mounts. We do this here because we can catch update mounts as well as
868 * non-update mounts in this case.
869 */
870 mp->mnt_flag |= (MNT_NOSUID);
871 #endif
872 #endif
873
874 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
875 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
876 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
877 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
878 MNT_QUARANTINE | MNT_CPROTECT);
879
880 #if CONFIG_MACF
881 if (flags & MNT_MULTILABEL) {
882 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
883 error = EINVAL;
884 goto out1;
885 }
886 mp->mnt_flag |= MNT_MULTILABEL;
887 }
888 #endif
889 /*
890 * Process device path for local file systems if requested
891 */
892 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
893 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL))) {
894 //snapshot, vm, datavolume mounts are special
895 if (vfs_context_is64bit(ctx)) {
896 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
897 goto out1;
898 }
899 fsmountargs += sizeof(devpath);
900 } else {
901 user32_addr_t tmp;
902 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
903 goto out1;
904 }
905 /* munge into LP64 addr */
906 devpath = CAST_USER_ADDR_T(tmp);
907 fsmountargs += sizeof(tmp);
908 }
909
910 /* Lookup device and authorize access to it */
911 if ((devpath)) {
912 struct nameidata nd;
913
914 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
915 if ((error = namei(&nd))) {
916 goto out1;
917 }
918
919 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
920 devvp = nd.ni_vp;
921
922 nameidone(&nd);
923
924 if (devvp->v_type != VBLK) {
925 error = ENOTBLK;
926 goto out2;
927 }
928 if (major(devvp->v_rdev) >= nblkdev) {
929 error = ENXIO;
930 goto out2;
931 }
932 /*
933 * If mounting by non-root, then verify that the user has the necessary
934 * permissions on the device.
935 */
936 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
937 mode_t accessmode = KAUTH_VNODE_READ_DATA;
938
939 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
940 accessmode |= KAUTH_VNODE_WRITE_DATA;
941 }
942 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
943 goto out2;
944 }
945 }
946 }
947 /* On first mount, preflight and open device */
948 if (devpath && ((flags & MNT_UPDATE) == 0)) {
949 if ((error = vnode_ref(devvp))) {
950 goto out2;
951 }
952 /*
953 * Disallow multiple mounts of the same device.
954 * Disallow mounting of a device that is currently in use
955 * (except for root, which might share swap device for miniroot).
956 * Flush out any old buffers remaining from a previous use.
957 */
958 if ((error = vfs_mountedon(devvp))) {
959 goto out3;
960 }
961
962 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
963 error = EBUSY;
964 goto out3;
965 }
966 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
967 error = ENOTBLK;
968 goto out3;
969 }
970 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
971 goto out3;
972 }
973
974 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
975 #if CONFIG_MACF
976 error = mac_vnode_check_open(ctx,
977 devvp,
978 ronly ? FREAD : FREAD | FWRITE);
979 if (error) {
980 goto out3;
981 }
982 #endif /* MAC */
983 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
984 goto out3;
985 }
986
987 mp->mnt_devvp = devvp;
988 device_vnode = devvp;
989 } else if ((mp->mnt_flag & MNT_RDONLY) &&
990 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
991 (device_vnode = mp->mnt_devvp)) {
992 dev_t dev;
993 int maj;
994 /*
995 * If upgrading to read-write by non-root, then verify
996 * that the user has the necessary permissions on the device.
997 */
998 vnode_getalways(device_vnode);
999
1000 if (suser(vfs_context_ucred(ctx), NULL) &&
1001 (error = vnode_authorize(device_vnode, NULL,
1002 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1003 ctx)) != 0) {
1004 vnode_put(device_vnode);
1005 goto out2;
1006 }
1007
1008 /* Tell the device that we're upgrading */
1009 dev = (dev_t)device_vnode->v_rdev;
1010 maj = major(dev);
1011
1012 if ((u_int)maj >= (u_int)nblkdev) {
1013 panic("Volume mounted on a device with invalid major number.");
1014 }
1015
1016 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1017 vnode_put(device_vnode);
1018 device_vnode = NULLVP;
1019 if (error != 0) {
1020 goto out2;
1021 }
1022 }
1023 } // localargs && !(snapshot | data | vm)
1024
1025 #if CONFIG_MACF
1026 if ((flags & MNT_UPDATE) == 0) {
1027 mac_mount_label_init(mp);
1028 mac_mount_label_associate(ctx, mp);
1029 }
1030 if (labelstr) {
1031 if ((flags & MNT_UPDATE) != 0) {
1032 error = mac_mount_check_label_update(ctx, mp);
1033 if (error != 0) {
1034 goto out3;
1035 }
1036 }
1037 }
1038 #endif
1039 /*
1040 * Mount the filesystem. We already asserted that internal_flags
1041 * cannot have more than one mount-by-role bit set.
1042 */
1043 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1044 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1045 (caddr_t)fsmountargs, 0, ctx);
1046 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1047 #if CONFIG_ROSV_STARTUP
1048 struct mount *origin_mp = (struct mount*)fsmountargs;
1049 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1050 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1051 if (error) {
1052 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1053 } else {
1054 /* Mark volume associated with system volume */
1055 mp->mnt_kern_flag |= MNTK_SYSTEM;
1056
1057 /* Attempt to acquire the mnt_devvp and set it up */
1058 struct vnode *mp_devvp = NULL;
1059 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1060 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1061 0, &mp_devvp, vfs_context_kernel());
1062 if (!lerr) {
1063 mp->mnt_devvp = mp_devvp;
1064 //vnode_lookup took an iocount, need to drop it.
1065 vnode_put(mp_devvp);
1066 // now set `device_vnode` to the devvp that was acquired.
1067 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1068 // note that though the iocount above was dropped, the mount acquires
1069 // an implicit reference against the device.
1070 device_vnode = mp_devvp;
1071 }
1072 }
1073 }
1074 #else
1075 error = EINVAL;
1076 #endif
1077 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1078 #if CONFIG_MOUNT_VM
1079 struct mount *origin_mp = (struct mount*)fsmountargs;
1080 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1081 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1082 if (error) {
1083 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1084 } else {
1085 /* Mark volume associated with system volume and a swap mount */
1086 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1087 /* Attempt to acquire the mnt_devvp and set it up */
1088 struct vnode *mp_devvp = NULL;
1089 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1090 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1091 0, &mp_devvp, vfs_context_kernel());
1092 if (!lerr) {
1093 mp->mnt_devvp = mp_devvp;
1094 //vnode_lookup took an iocount, need to drop it.
1095 vnode_put(mp_devvp);
1096
1097 // now set `device_vnode` to the devvp that was acquired.
1098 // note that though the iocount above was dropped, the mount acquires
1099 // an implicit reference against the device.
1100 device_vnode = mp_devvp;
1101 }
1102 }
1103 }
1104 #else
1105 error = EINVAL;
1106 #endif
1107 } else {
1108 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1109 }
1110
1111 if (flags & MNT_UPDATE) {
1112 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1113 mp->mnt_flag &= ~MNT_RDONLY;
1114 }
1115 mp->mnt_flag &= ~
1116 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1117 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1118 if (error) {
1119 mp->mnt_flag = flag; /* restore flag value */
1120 }
1121 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1122 lck_rw_done(&mp->mnt_rwlock);
1123 is_rwlock_locked = FALSE;
1124 if (!error) {
1125 enablequotas(mp, ctx);
1126 }
1127 goto exit;
1128 }
1129
1130 /*
1131 * Put the new filesystem on the mount list after root.
1132 */
1133 if (error == 0) {
1134 struct vfs_attr vfsattr;
1135 #if CONFIG_MACF
1136 error = mac_mount_check_mount_late(ctx, mp);
1137 if (error != 0) {
1138 goto out3;
1139 }
1140
1141 if (vfs_flags(mp) & MNT_MULTILABEL) {
1142 error = VFS_ROOT(mp, &rvp, ctx);
1143 if (error) {
1144 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1145 goto out3;
1146 }
1147 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1148 /*
1149 * drop reference provided by VFS_ROOT
1150 */
1151 vnode_put(rvp);
1152
1153 if (error) {
1154 goto out3;
1155 }
1156 }
1157 #endif /* MAC */
1158
1159 vnode_lock_spin(vp);
1160 CLR(vp->v_flag, VMOUNT);
1161 vp->v_mountedhere = mp;
1162 vnode_unlock(vp);
1163
1164 /*
1165 * Taking the name_cache_lock exclusively will ensure that everyone
1166 * is out of the fast path who might be trying to use a now-stale
1167 * copy of vp->v_mountedhere->mnt_realrootvp.
1168 *
1169 * Bumping mount_generation causes the cached values to be
1170 * invalidated.
1171 */
1172 name_cache_lock();
1173 mount_generation++;
1174 name_cache_unlock();
1175
1176 error = vnode_ref(vp);
1177 if (error != 0) {
1178 goto out4;
1179 }
1180
1181 have_usecount = TRUE;
1182
1183 error = checkdirs(vp, ctx);
1184 if (error != 0) {
1185 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1186 goto out4;
1187 }
1188 /*
1189 * There is no cleanup code here, so the return value is cast to
1190 * void; we need to revisit this.
1191 */
1192 (void)VFS_START(mp, 0, ctx);
1193
1194 if (mount_list_add(mp) != 0) {
1195 /*
1196 * The system is shutting down trying to umount
1197 * everything, so fail with a plausible errno.
1198 */
1199 error = EBUSY;
1200 goto out4;
1201 }
1202 lck_rw_done(&mp->mnt_rwlock);
1203 is_rwlock_locked = FALSE;
1204
1205 /* Check if this mounted file system supports EAs or named streams. */
1206 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1207 VFSATTR_INIT(&vfsattr);
1208 VFSATTR_WANTED(&vfsattr, f_capabilities);
1209 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1210 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1211 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1212 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1213 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1214 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1215 }
1216 #if NAMEDSTREAMS
1217 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1218 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1219 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1220 }
1221 #endif
1222 /* Check if this file system supports path from id lookups. */
1223 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1224 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1225 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1226 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1227 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1228 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1229 }
1230
1231 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1232 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1233 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1234 }
1235 }
1236 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1237 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1238 }
1239 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1240 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1241 }
1242 /* increment the operations count */
1243 OSAddAtomic(1, &vfs_nummntops);
1244 enablequotas(mp, ctx);
1245
1246 if (device_vnode) {
1247 device_vnode->v_specflags |= SI_MOUNTEDON;
1248
1249 /*
1250 * cache the IO attributes for the underlying physical media...
1251 * an error return indicates the underlying driver doesn't
1252 * support all the queries necessary... however, reasonable
1253 * defaults will have been set, so no reason to bail or care
1254 */
1255 vfs_init_io_attributes(device_vnode, mp);
1256 }
1257
1258 /* Now that mount is setup, notify the listeners */
1259 vfs_notify_mount(pvp);
1260 IOBSDMountChange(mp, kIOMountChangeMount);
1261 } else {
1262 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1263 if (mp->mnt_vnodelist.tqh_first != NULL) {
1264 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1265 mp->mnt_vtable->vfc_name, error);
1266 }
1267
1268 vnode_lock_spin(vp);
1269 CLR(vp->v_flag, VMOUNT);
1270 vnode_unlock(vp);
1271 mount_list_lock();
1272 mp->mnt_vtable->vfc_refcount--;
1273 mount_list_unlock();
1274
1275 if (device_vnode) {
1276 vnode_rele(device_vnode);
1277 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1278 }
1279 lck_rw_done(&mp->mnt_rwlock);
1280 is_rwlock_locked = FALSE;
1281
1282 /*
1283 * If we get here, we have a mount structure that needs to be freed.
1284 * Since the coveredvp hasn't yet been updated to point at it, there
1285 * is no need to worry about other threads holding a crossref on this
1286 * mp, so it's OK to just free it.
1287 */
1288 mount_lock_destroy(mp);
1289 #if CONFIG_MACF
1290 mac_mount_label_destroy(mp);
1291 #endif
1292 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1293 }
1294 exit:
1295 /*
1296 * drop I/O count on the device vp if there was one
1297 */
1298 if (devpath && devvp) {
1299 vnode_put(devvp);
1300 }
1301
1302 return error;
1303
1304 /* Error condition exits */
1305 out4:
1306 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1307
1308 /*
1309 * If the mount has been placed on the covered vp,
1310 * it may have been discovered by now, so we have
1311 * to treat this just like an unmount
1312 */
1313 mount_lock_spin(mp);
1314 mp->mnt_lflag |= MNT_LDEAD;
1315 mount_unlock(mp);
1316
1317 if (device_vnode != NULLVP) {
1318 vnode_rele(device_vnode);
1319 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1320 ctx);
1321 did_rele = TRUE;
1322 }
1323
1324 vnode_lock_spin(vp);
1325
1326 mp->mnt_crossref++;
1327 vp->v_mountedhere = (mount_t) 0;
1328
1329 vnode_unlock(vp);
1330
1331 if (have_usecount) {
1332 vnode_rele(vp);
1333 }
1334 out3:
1335 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1336 vnode_rele(devvp);
1337 }
1338 out2:
1339 if (devpath && devvp) {
1340 vnode_put(devvp);
1341 }
1342 out1:
1343 /* Release mnt_rwlock only when it was taken */
1344 if (is_rwlock_locked == TRUE) {
1345 lck_rw_done(&mp->mnt_rwlock);
1346 }
1347
1348 if (mntalloc) {
1349 if (mp->mnt_crossref) {
1350 mount_dropcrossref(mp, vp, 0);
1351 } else {
1352 mount_lock_destroy(mp);
1353 #if CONFIG_MACF
1354 mac_mount_label_destroy(mp);
1355 #endif
1356 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1357 }
1358 }
1359 if (vfsp_ref) {
1360 mount_list_lock();
1361 vfsp->vfc_refcount--;
1362 mount_list_unlock();
1363 }
1364
1365 return error;
1366 }
1367
1368 /*
1369 * Flush in-core data, check for competing mount attempts,
1370 * and set VMOUNT
1371 */
1372 int
1373 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1374 {
1375 #if !CONFIG_MACF
1376 #pragma unused(cnp,fsname)
1377 #endif
1378 struct vnode_attr va;
1379 int error;
1380
1381 if (!skip_auth) {
1382 /*
1383 * If the user is not root, ensure that they own the directory
1384 * onto which we are attempting to mount.
1385 */
1386 VATTR_INIT(&va);
1387 VATTR_WANTED(&va, va_uid);
1388 if ((error = vnode_getattr(vp, &va, ctx)) ||
1389 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1390 (!vfs_context_issuser(ctx)))) {
1391 error = EPERM;
1392 goto out;
1393 }
1394 }
1395
1396 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1397 goto out;
1398 }
1399
1400 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1401 goto out;
1402 }
1403
1404 if (vp->v_type != VDIR) {
1405 error = ENOTDIR;
1406 goto out;
1407 }
1408
1409 if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1410 error = EBUSY;
1411 goto out;
1412 }
1413
1414 #if CONFIG_MACF
1415 error = mac_mount_check_mount(ctx, vp,
1416 cnp, fsname);
1417 if (error != 0) {
1418 goto out;
1419 }
1420 #endif
1421
1422 vnode_lock_spin(vp);
1423 SET(vp->v_flag, VMOUNT);
1424 vnode_unlock(vp);
1425
1426 out:
1427 return error;
1428 }
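/*
 * Note for callers: on success the covered vnode is left with VMOUNT set,
 * so every error path taken after a successful prepare_coveredvp() must
 * clear it again (as mount_common() and relocate_imageboot_source() do
 * with CLR(vp->v_flag, VMOUNT)).
 */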
1429
1430 #if CONFIG_IMGSRC_ACCESS
1431
1432 #define DEBUG_IMGSRC 0
1433
1434 #if DEBUG_IMGSRC
1435 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1436 #else
1437 #define IMGSRC_DEBUG(args...) do { } while(0)
1438 #endif
1439
1440 static int
1441 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1442 {
1443 struct nameidata nd;
1444 vnode_t vp, realdevvp;
1445 mode_t accessmode;
1446 int error;
1447 enum uio_seg uio = UIO_USERSPACE;
1448
1449 if (ctx == vfs_context_kernel()) {
1450 uio = UIO_SYSSPACE;
1451 }
1452
1453 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1454 if ((error = namei(&nd))) {
1455 IMGSRC_DEBUG("namei() failed with %d\n", error);
1456 return error;
1457 }
1458
1459 vp = nd.ni_vp;
1460
1461 if (!vnode_isblk(vp)) {
1462 IMGSRC_DEBUG("Not block device.\n");
1463 error = ENOTBLK;
1464 goto out;
1465 }
1466
1467 realdevvp = mp->mnt_devvp;
1468 if (realdevvp == NULLVP) {
1469 IMGSRC_DEBUG("No device backs the mount.\n");
1470 error = ENXIO;
1471 goto out;
1472 }
1473
1474 error = vnode_getwithref(realdevvp);
1475 if (error != 0) {
1476 IMGSRC_DEBUG("Couldn't get iocount on device.\n");
1477 goto out;
1478 }
1479
1480 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1481 IMGSRC_DEBUG("Wrong dev_t.\n");
1482 error = ENXIO;
1483 goto out1;
1484 }
1485
1486 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1487
1488 /*
1489 * If mounting by non-root, then verify that the user has the necessary
1490 * permissions on the device.
1491 */
1492 if (!vfs_context_issuser(ctx)) {
1493 accessmode = KAUTH_VNODE_READ_DATA;
1494 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1495 accessmode |= KAUTH_VNODE_WRITE_DATA;
1496 }
1497 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1498 IMGSRC_DEBUG("Access denied.\n");
1499 goto out1;
1500 }
1501 }
1502
1503 *devvpp = vp;
1504
1505 out1:
1506 vnode_put(realdevvp);
1507
1508 out:
1509 nameidone(&nd);
1510
1511 if (error) {
1512 vnode_put(vp);
1513 }
1514
1515 return error;
1516 }
1517
1518 /*
1519 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1520 * and call checkdirs()
1521 */
1522 static int
1523 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1524 {
1525 int error;
1526
1527 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1528
1529 IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
1530 mp->mnt_vtable->vfc_name, vnode_getname(vp));
1531
1532 vnode_lock_spin(vp);
1533 CLR(vp->v_flag, VMOUNT);
1534 vp->v_mountedhere = mp;
1535 vnode_unlock(vp);
1536
1537 /*
1538 * Taking the name_cache_lock exclusively will ensure that everyone
1539 * is out of the fast path who might be trying to use a now-stale
1540 * copy of vp->v_mountedhere->mnt_realrootvp.
1541 *
1542 * Bumping mount_generation causes the cached values to be
1543 * invalidated.
1544 */
1545 name_cache_lock();
1546 mount_generation++;
1547 name_cache_unlock();
1548
1549 error = vnode_ref(vp);
1550 if (error != 0) {
1551 goto out;
1552 }
1553
1554 error = checkdirs(vp, ctx);
1555 if (error != 0) {
1556 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1557 vnode_rele(vp);
1558 goto out;
1559 }
1560
1561 out:
1562 if (error != 0) {
1563 mp->mnt_vnodecovered = NULLVP;
1564 }
1565 return error;
1566 }
1567
1568 static void
1569 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1570 {
1571 vnode_rele(vp);
1572 vnode_lock_spin(vp);
1573 vp->v_mountedhere = (mount_t)NULL;
1574 vnode_unlock(vp);
1575
1576 mp->mnt_vnodecovered = NULLVP;
1577 }
1578
1579 static int
1580 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1581 {
1582 int error;
1583
1584 /* if an unmount is in progress, return an error */
1585 mount_lock_spin(mp);
1586 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1587 mount_unlock(mp);
1588 return EBUSY;
1589 }
1590 mount_unlock(mp);
1591 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1592
1593 /*
1594 * We only allow the filesystem to be reloaded if it
1595 * is currently mounted read-only.
1596 */
1597 if ((flags & MNT_RELOAD) &&
1598 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1599 error = ENOTSUP;
1600 goto out;
1601 }
1602
1603 /*
1604 * Only root, or the user that did the original mount is
1605 * permitted to update it.
1606 */
1607 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1608 (!vfs_context_issuser(ctx))) {
1609 error = EPERM;
1610 goto out;
1611 }
1612 #if CONFIG_MACF
1613 error = mac_mount_check_remount(ctx, mp);
1614 if (error != 0) {
1615 goto out;
1616 }
1617 #endif
1618
1619 out:
1620 if (error) {
1621 lck_rw_done(&mp->mnt_rwlock);
1622 }
1623
1624 return error;
1625 }
1626
1627 static void
1628 mount_end_update(mount_t mp)
1629 {
1630 lck_rw_done(&mp->mnt_rwlock);
1631 }
1632
1633 static int
1634 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1635 {
1636 vnode_t vp;
1637
1638 if (height >= MAX_IMAGEBOOT_NESTING) {
1639 return EINVAL;
1640 }
1641
1642 vp = imgsrc_rootvnodes[height];
1643 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1644 *rvpp = vp;
1645 return 0;
1646 } else {
1647 return ENOENT;
1648 }
1649 }
1650
1651 static int
1652 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1653 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1654 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1655 {
1656 int error;
1657 mount_t mp;
1658 boolean_t placed = FALSE;
1659 struct vfstable *vfsp;
1660 user_addr_t devpath;
1661 char *old_mntonname;
1662 vnode_t rvp;
1663 vnode_t devvp;
1664 uint32_t height;
1665 uint32_t flags;
1666
1667 /* If we didn't imageboot, nothing to move */
1668 if (imgsrc_rootvnodes[0] == NULLVP) {
1669 return EINVAL;
1670 }
1671
1672 /* Only root can do this */
1673 if (!vfs_context_issuser(ctx)) {
1674 return EPERM;
1675 }
1676
1677 IMGSRC_DEBUG("looking for root vnode.\n");
1678
1679 /*
1680 * Get root vnode of filesystem we're moving.
1681 */
1682 if (by_index) {
1683 if (is64bit) {
1684 struct user64_mnt_imgsrc_args mia64;
1685 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1686 if (error != 0) {
1687 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1688 return error;
1689 }
1690
1691 height = mia64.mi_height;
1692 flags = mia64.mi_flags;
1693 devpath = mia64.mi_devpath;
1694 } else {
1695 struct user32_mnt_imgsrc_args mia32;
1696 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1697 if (error != 0) {
1698 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1699 return error;
1700 }
1701
1702 height = mia32.mi_height;
1703 flags = mia32.mi_flags;
1704 devpath = mia32.mi_devpath;
1705 }
1706 } else {
1707 /*
1708 * For binary compatibility--assumes one level of nesting.
1709 */
1710 if (is64bit) {
1711 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1712 return error;
1713 }
1714 } else {
1715 user32_addr_t tmp;
1716 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1717 return error;
1718 }
1719
1720 /* munge into LP64 addr */
1721 devpath = CAST_USER_ADDR_T(tmp);
1722 }
1723
1724 height = 0;
1725 flags = 0;
1726 }
1727
1728 if (flags != 0) {
1729 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1730 return EINVAL;
1731 }
1732
1733 error = get_imgsrc_rootvnode(height, &rvp);
1734 if (error != 0) {
1735 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1736 return error;
1737 }
1738
1739 IMGSRC_DEBUG("got old root vnode\n");
1740
1741 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1742
1743 /* Can only move once */
1744 mp = vnode_mount(rvp);
1745 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1746 IMGSRC_DEBUG("Already moved.\n");
1747 error = EBUSY;
1748 goto out0;
1749 }
1750
1751 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1752 IMGSRC_DEBUG("Starting update.\n");
1753
1754 /* Get exclusive rwlock on mount, authorize update on mp */
1755 error = mount_begin_update(mp, ctx, 0);
1756 if (error != 0) {
1757 IMGSRC_DEBUG("Starting update failed with %d\n", error);
1758 goto out0;
1759 }
1760
1761 /*
1762 * It can only be moved once. Flag is set under the rwlock,
1763 * so we're now safe to proceed.
1764 */
1765 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1766 IMGSRC_DEBUG("Already moved [2]\n");
1767 goto out1;
1768 }
1769
1770 IMGSRC_DEBUG("Preparing coveredvp.\n");
1771
1772 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1773 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1774 if (error != 0) {
1775 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1776 goto out1;
1777 }
1778
1779 IMGSRC_DEBUG("Covered vp OK.\n");
1780
1781 /* Sanity-check the name the caller has provided */
1782 vfsp = mp->mnt_vtable;
1783 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1784 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1785 vfsp->vfc_name, fsname);
1786 error = EINVAL;
1787 goto out2;
1788 }
1789
1790 /* Check the device vnode and update mount-from name, for local filesystems */
1791 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1792 IMGSRC_DEBUG("Local, doing device validation.\n");
1793
1794 if (devpath != USER_ADDR_NULL) {
1795 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1796 if (error) {
1797 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1798 goto out2;
1799 }
1800
1801 vnode_put(devvp);
1802 }
1803 }
1804
1805 /*
1806 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1807 * and increment the name cache's mount generation
1808 */
1809
1810 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1811 error = place_mount_and_checkdirs(mp, vp, ctx);
1812 if (error != 0) {
1813 goto out2;
1814 }
1815
1816 placed = TRUE;
1817
1818 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1819 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1820
1821 /* Forbid future moves */
1822 mount_lock(mp);
1823 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1824 mount_unlock(mp);
1825
1826 /* Finally, add to mount list, completely ready to go */
1827 if (mount_list_add(mp) != 0) {
1828 /*
1829 * The system is shutting down trying to umount
1830 * everything, so fail with a plausible errno.
1831 */
1832 error = EBUSY;
1833 goto out3;
1834 }
1835
1836 mount_end_update(mp);
1837 vnode_put(rvp);
1838 FREE(old_mntonname, M_TEMP);
1839
1840 vfs_notify_mount(pvp);
1841
1842 return 0;
1843 out3:
1844 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1845
1846 mount_lock(mp);
1847 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1848 mount_unlock(mp);
1849
1850 out2:
1851 /*
1852 * Placing the mp on the vnode clears VMOUNT,
1853 * so cleanup is different after that point
1854 */
1855 if (placed) {
1856 /* Rele the vp, clear VMOUNT and v_mountedhere */
1857 undo_place_on_covered_vp(mp, vp);
1858 } else {
1859 vnode_lock_spin(vp);
1860 CLR(vp->v_flag, VMOUNT);
1861 vnode_unlock(vp);
1862 }
1863 out1:
1864 mount_end_update(mp);
1865
1866 out0:
1867 vnode_put(rvp);
1868 FREE(old_mntonname, M_TEMP);
1869 return error;
1870 }
1871
1872 #if CONFIG_LOCKERBOOT
1873 __private_extern__
1874 int
1875 mount_locker_protoboot(const char *fsname, const char *mntpoint,
1876 const char *pbdevpath)
1877 {
1878 int error = -1;
1879 struct nameidata nd;
1880 boolean_t cleanup_nd = FALSE;
1881 vfs_context_t ctx = vfs_context_kernel();
1882 boolean_t is64 = TRUE;
1883 boolean_t by_index = TRUE;
1884 struct user64_mnt_imgsrc_args mia64 = {
1885 .mi_height = 0,
1886 .mi_flags = 0,
1887 .mi_devpath = CAST_USER_ADDR_T(pbdevpath),
1888 };
1889 user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);
1890
1891 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
1892 UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
1893 error = namei(&nd);
1894 if (error) {
1895 IMGSRC_DEBUG("namei: %d\n", error);
1896 goto out;
1897 }
1898
1899 cleanup_nd = TRUE;
1900 error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
1901 &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);
1902
1903 out:
1904 if (cleanup_nd) {
1905 int stashed = error;
1906
1907 error = vnode_put(nd.ni_vp);
1908 if (error) {
1909 panic("vnode_put() returned non-zero: %d", error);
1910 }
1911
1912 if (nd.ni_dvp) {
1913 error = vnode_put(nd.ni_dvp);
1914 if (error) {
1915 panic("vnode_put() returned non-zero: %d", error);
1916 }
1917 }
1918 nameidone(&nd);
1919
1920 error = stashed;
1921 }
1922 return error;
1923 }
1924 #endif /* CONFIG_LOCKERBOOT */
1925 #endif /* CONFIG_IMGSRC_ACCESS */
1926
1927 void
1928 enablequotas(struct mount *mp, vfs_context_t ctx)
1929 {
1930 struct nameidata qnd;
1931 int type;
1932 char qfpath[MAXPATHLEN];
1933 const char *qfname = QUOTAFILENAME;
1934 const char *qfopsname = QUOTAOPSNAME;
1935 const char *qfextension[] = INITQFNAMES;
1936
1937 /* XXX Should be an MNTK_ flag, instead of strncmp()'s */
1938 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
1939 return;
1940 }
1941 /*
1942 * Enable filesystem disk quotas if necessary.
1943 * We ignore errors, as this should not interfere with the final mount.
1944 */
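/*
 * For example, for the "user" quota type this looks for a trigger file
 * "<mountpoint>/.quota.ops.user" and, if it exists, turns quotas on
 * against "<mountpoint>/.quota.user" (assuming the usual QUOTAOPSNAME,
 * QUOTAFILENAME and INITQFNAMES definitions).
 */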
1945 for (type = 0; type < MAXQUOTAS; type++) {
1946 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1947 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1948 CAST_USER_ADDR_T(qfpath), ctx);
1949 if (namei(&qnd) != 0) {
1950 continue; /* option file to trigger quotas is not present */
1951 }
1952 vnode_put(qnd.ni_vp);
1953 nameidone(&qnd);
1954 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1955
1956 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1957 }
1958 return;
1959 }
1960
1961
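/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the old covered vnode (olddp), swap in the root of the
 * newly mounted filesystem (newdp), taking a new reference on it and
 * dropping the old one.
 */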
1962 static int
1963 checkdirs_callback(proc_t p, void * arg)
1964 {
1965 struct cdirargs * cdrp = (struct cdirargs *)arg;
1966 vnode_t olddp = cdrp->olddp;
1967 vnode_t newdp = cdrp->newdp;
1968 struct filedesc *fdp;
1969 vnode_t tvp;
1970 vnode_t fdp_cvp;
1971 vnode_t fdp_rvp;
1972 int cdir_changed = 0;
1973 int rdir_changed = 0;
1974
1975 /*
1976 * XXX Also needs to iterate each thread in the process to see if it
1977 * XXX is using a per-thread current working directory, and, if so,
1978 * XXX update that as well.
1979 */
1980
1981 proc_fdlock(p);
1982 fdp = p->p_fd;
1983 if (fdp == (struct filedesc *)0) {
1984 proc_fdunlock(p);
1985 return PROC_RETURNED;
1986 }
1987 fdp_cvp = fdp->fd_cdir;
1988 fdp_rvp = fdp->fd_rdir;
1989 proc_fdunlock(p);
1990
1991 if (fdp_cvp == olddp) {
1992 vnode_ref(newdp);
1993 tvp = fdp->fd_cdir;
1994 fdp_cvp = newdp;
1995 cdir_changed = 1;
1996 vnode_rele(tvp);
1997 }
1998 if (fdp_rvp == olddp) {
1999 vnode_ref(newdp);
2000 tvp = fdp->fd_rdir;
2001 fdp_rvp = newdp;
2002 rdir_changed = 1;
2003 vnode_rele(tvp);
2004 }
2005 if (cdir_changed || rdir_changed) {
2006 proc_fdlock(p);
2007 fdp->fd_cdir = fdp_cvp;
2008 fdp->fd_rdir = fdp_rvp;
2009 proc_fdunlock(p);
2010 }
2011 return PROC_RETURNED;
2012 }
2013
2014
2015
2016 /*
2017 * Scan all active processes to see if any of them have a current
2018 * or root directory onto which the new filesystem has just been
2019 * mounted. If so, replace them with the new mount point.
2020 */
2021 static int
2022 checkdirs(vnode_t olddp, vfs_context_t ctx)
2023 {
2024 vnode_t newdp;
2025 vnode_t tvp;
2026 int err;
2027 struct cdirargs cdr;
2028
2029 if (olddp->v_usecount == 1) {
2030 return 0;
2031 }
2032 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
2033
2034 if (err != 0) {
2035 #if DIAGNOSTIC
2036 panic("mount: lost mount: error %d", err);
2037 #endif
2038 return err;
2039 }
2040
2041 cdr.olddp = olddp;
2042 cdr.newdp = newdp;
2043 /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
2044 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
2045
2046 if (rootvnode == olddp) {
2047 vnode_ref(newdp);
2048 tvp = rootvnode;
2049 rootvnode = newdp;
2050 vnode_rele(tvp);
2051 }
2052
2053 vnode_put(newdp);
2054 return 0;
2055 }
2056
2057 /*
2058 * Unmount a file system.
2059 *
2060 * Note: unmount takes a path to the vnode mounted on as argument,
2061 * not the special file (as before).
2062 */
2063 /* ARGSUSED */
2064 int
2065 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2066 {
2067 vnode_t vp;
2068 struct mount *mp;
2069 int error;
2070 struct nameidata nd;
2071 vfs_context_t ctx = vfs_context_current();
2072
2073 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2074 UIO_USERSPACE, uap->path, ctx);
2075 error = namei(&nd);
2076 if (error) {
2077 return error;
2078 }
2079 vp = nd.ni_vp;
2080 mp = vp->v_mount;
2081 nameidone(&nd);
2082
2083 #if CONFIG_MACF
2084 error = mac_mount_check_umount(ctx, mp);
2085 if (error != 0) {
2086 vnode_put(vp);
2087 return error;
2088 }
2089 #endif
2090 /*
2091 * Must be the root of the filesystem
2092 */
2093 if ((vp->v_flag & VROOT) == 0) {
2094 vnode_put(vp);
2095 return EINVAL;
2096 }
2097 mount_ref(mp, 0);
2098 vnode_put(vp);
2099 /* safedounmount consumes the mount ref */
2100 return safedounmount(mp, uap->flags, ctx);
2101 }
2102
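/*
 * Unmount the filesystem identified by fsid.  The mount ref taken here is
 * consumed by safedounmount().
 */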
2103 int
2104 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2105 {
2106 mount_t mp;
2107
2108 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2109 if (mp == (mount_t)0) {
2110 return ENOENT;
2111 }
2112 mount_ref(mp, 0);
2113 mount_iterdrop(mp);
2114 /* safedounmount consumes the mount ref */
2115 return safedounmount(mp, flags, ctx);
2116 }
2117
2118
2119 /*
2120 * The mount struct comes with a mount ref which will be consumed.
2121 * Do the actual file system unmount, preventing some common foot-shooting.
2122 */
2123 int
2124 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2125 {
2126 int error;
2127 proc_t p = vfs_context_proc(ctx);
2128
2129 /*
2130 * If the file system is not responding and MNT_NOBLOCK
2131 * is set and not a forced unmount then return EBUSY.
2132 */
2133 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2134 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2135 error = EBUSY;
2136 goto out;
2137 }
2138
2139 /*
2140 * Skip authorization if the mount is tagged as permissive and
2141 * this is not a forced-unmount attempt.
2142 */
2143 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
2144 /*
2145 * Only root, or the user that did the original mount is
2146 * permitted to unmount this filesystem.
2147 */
2148 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2149 (error = suser(kauth_cred_get(), &p->p_acflag))) {
2150 goto out;
2151 }
2152 }
2153 /*
2154 * Don't allow unmounting the root file system (or the associated VM or DATA mounts).
2155 */
2156 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2157 error = EBUSY; /* the root (or associated volumes) is always busy */
2158 goto out;
2159 }
2160
2161 #ifdef CONFIG_IMGSRC_ACCESS
2162 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2163 error = EBUSY;
2164 goto out;
2165 }
2166 #endif /* CONFIG_IMGSRC_ACCESS */
2167
2168 return dounmount(mp, flags, 1, ctx);
2169
2170 out:
2171 mount_drop(mp, 0);
2172 return error;
2173 }
2174
2175 /*
2176 * Do the actual file system unmount.
2177 */
2178 int
2179 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2180 {
2181 vnode_t coveredvp = (vnode_t)0;
2182 int error;
2183 int needwakeup = 0;
2184 int forcedunmount = 0;
2185 int lflags = 0;
2186 struct vnode *devvp = NULLVP;
2187 #if CONFIG_TRIGGERS
2188 proc_t p = vfs_context_proc(ctx);
2189 int did_vflush = 0;
2190 int pflags_save = 0;
2191 #endif /* CONFIG_TRIGGERS */
2192
2193 #if CONFIG_FSE
2194 if (!(flags & MNT_FORCE)) {
2195 fsevent_unmount(mp, ctx); /* has to come first! */
2196 }
2197 #endif
2198
2199 mount_lock(mp);
2200
2201 /*
2202 * If an unmount is already in progress, just return EBUSY.
2203 * Even a forced unmount cannot override it.
2204 */
2205 if (mp->mnt_lflag & MNT_LUNMOUNT) {
2206 if (withref != 0) {
2207 mount_drop(mp, 1);
2208 }
2209 mount_unlock(mp);
2210 return EBUSY;
2211 }
2212
2213 if (flags & MNT_FORCE) {
2214 forcedunmount = 1;
2215 mp->mnt_lflag |= MNT_LFORCE;
2216 }
2217
2218 #if CONFIG_TRIGGERS
2219 if (flags & MNT_NOBLOCK && p != kernproc) {
2220 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2221 }
2222 #endif
2223
2224 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2225 mp->mnt_lflag |= MNT_LUNMOUNT;
2226 mp->mnt_flag &= ~MNT_ASYNC;
2227 /*
2228 * Anyone currently in the fast path who
2229 * trips over the cached rootvp will be
2230 * dumped out and forced into the slow path
2231 * to regenerate a new cached value.
2232 */
2233 mp->mnt_realrootvp = NULLVP;
2234 mount_unlock(mp);
2235
2236 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2237 /*
2238 * Force unmount any mounts in this filesystem.
2239 * If any unmounts fail - just leave them dangling.
2240 * Avoids recursion.
2241 */
2242 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2243 }
2244
2245 /*
2246 * Taking the name_cache_lock exclusively will
2247 * ensure that everyone who might be trying to use a
2248 * now-stale copy of vp->v_mountedhere->mnt_realrootvp
2249 * is out of the fast path.  Bumping mount_generation
2250 * causes the cached values to be invalidated.
2252 */
2253 name_cache_lock();
2254 mount_generation++;
2255 name_cache_unlock();
2256
2257
2258 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2259 if (withref != 0) {
2260 mount_drop(mp, 0);
2261 }
2262 error = 0;
2263 if (forcedunmount == 0) {
2264 ubc_umount(mp); /* release cached vnodes */
2265 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2266 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2267 if (error) {
2268 mount_lock(mp);
2269 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2270 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2271 mp->mnt_lflag &= ~MNT_LFORCE;
2272 goto out;
2273 }
2274 }
2275 }
2276
2277 IOBSDMountChange(mp, kIOMountChangeUnmount);
2278
2279 #if CONFIG_TRIGGERS
2280 vfs_nested_trigger_unmounts(mp, flags, ctx);
2281 did_vflush = 1;
2282 #endif
2283 if (forcedunmount) {
2284 lflags |= FORCECLOSE;
2285 }
2286 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2287 if ((forcedunmount == 0) && error) {
2288 mount_lock(mp);
2289 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2290 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2291 mp->mnt_lflag &= ~MNT_LFORCE;
2292 goto out;
2293 }
2294
2295 /* make sure no one is in the mount iterations or lookup */
2296 mount_iterdrain(mp);
2297
2298 error = VFS_UNMOUNT(mp, flags, ctx);
2299 if (error) {
2300 mount_iterreset(mp);
2301 mount_lock(mp);
2302 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2303 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2304 mp->mnt_lflag &= ~MNT_LFORCE;
2305 goto out;
2306 }
2307
2308 /* increment the operations count */
2309 if (!error) {
2310 OSAddAtomic(1, &vfs_nummntops);
2311 }
2312
2313 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2314 /* hold an io reference and drop the usecount before close */
2315 devvp = mp->mnt_devvp;
2316 vnode_getalways(devvp);
2317 vnode_rele(devvp);
2318 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2319 ctx);
2320 vnode_clearmountedon(devvp);
2321 vnode_put(devvp);
2322 }
2323 lck_rw_done(&mp->mnt_rwlock);
2324 mount_list_remove(mp);
2325 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2326
2327 /* mark the mount point hook in the vp but do not drop the ref yet */
2328 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2329 /*
2330 * The covered vnode needs special handling. Trying to get an
2331 * iocount must not block here as this may lead to deadlocks
2332 * if the Filesystem to which the covered vnode belongs is
2333 * undergoing forced unmounts. Since we hold a usecount, the
2334 * vnode cannot be reused (it can, however, still be terminated)
2335 */
2336 vnode_getalways(coveredvp);
2337 vnode_lock_spin(coveredvp);
2338
2339 mp->mnt_crossref++;
2340 coveredvp->v_mountedhere = (struct mount *)0;
2341 CLR(coveredvp->v_flag, VMOUNT);
2342
2343 vnode_unlock(coveredvp);
2344 vnode_put(coveredvp);
2345 }
2346
2347 mount_list_lock();
2348 mp->mnt_vtable->vfc_refcount--;
2349 mount_list_unlock();
2350
2351 cache_purgevfs(mp); /* remove cache entries for this file sys */
2352 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2353 mount_lock(mp);
2354 mp->mnt_lflag |= MNT_LDEAD;
2355
2356 if (mp->mnt_lflag & MNT_LWAIT) {
2357 /*
2358 * do the wakeup here
2359 * in case we block in mount_refdrain
2360 * which will drop the mount lock
2361 * and allow anyone blocked in vfs_busy
2362 * to wake up and see the LDEAD state
2363 */
2364 mp->mnt_lflag &= ~MNT_LWAIT;
2365 wakeup((caddr_t)mp);
2366 }
2367 mount_refdrain(mp);
2368
2369 /* free disk_conditioner_info structure for this mount */
2370 disk_conditioner_unmount(mp);
2371
2372 out:
2373 if (mp->mnt_lflag & MNT_LWAIT) {
2374 mp->mnt_lflag &= ~MNT_LWAIT;
2375 needwakeup = 1;
2376 }
2377
2378 #if CONFIG_TRIGGERS
2379 if (flags & MNT_NOBLOCK && p != kernproc) {
2380 // Restore P_NOREMOTEHANG bit to its previous value
2381 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2382 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2383 }
2384 }
2385
2386 /*
2387 * Callback and context are set together under the mount lock, and
2388 * never cleared, so we're safe to examine them here, drop the lock,
2389 * and call out.
2390 */
2391 if (mp->mnt_triggercallback != NULL) {
2392 mount_unlock(mp);
2393 if (error == 0) {
2394 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2395 } else if (did_vflush) {
2396 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2397 }
2398 } else {
2399 mount_unlock(mp);
2400 }
2401 #else
2402 mount_unlock(mp);
2403 #endif /* CONFIG_TRIGGERS */
2404
2405 lck_rw_done(&mp->mnt_rwlock);
2406
2407 if (needwakeup) {
2408 wakeup((caddr_t)mp);
2409 }
2410
2411 if (!error) {
2412 if ((coveredvp != NULLVP)) {
2413 vnode_t pvp = NULLVP;
2414
2415 /*
2416 * The covered vnode needs special handling. Trying to
2417 * get an iocount must not block here as this may lead
2418 * to deadlocks if the Filesystem to which the covered
2419 * vnode belongs is undergoing forced unmounts. Since we
2420 * hold a usecount, the vnode cannot be reused
2421 * (it can, however, still be terminated).
2422 */
2423 vnode_getalways(coveredvp);
2424
2425 mount_dropcrossref(mp, coveredvp, 0);
2426 /*
2427 * We'll _try_ to detect if this really needs to be
2428 * done. The coveredvp can only be in termination (or
2429 * terminated) if the coveredvp's mount point is in a
2430 * forced unmount (or has been) since we still hold the
2431 * ref.
2432 */
2433 if (!vnode_isrecycled(coveredvp)) {
2434 pvp = vnode_getparent(coveredvp);
2435 #if CONFIG_TRIGGERS
2436 if (coveredvp->v_resolve) {
2437 vnode_trigger_rearm(coveredvp, ctx);
2438 }
2439 #endif
2440 }
2441
2442 vnode_rele(coveredvp);
2443 vnode_put(coveredvp);
2444 coveredvp = NULLVP;
2445
2446 if (pvp) {
2447 lock_vnode_and_post(pvp, NOTE_WRITE);
2448 vnode_put(pvp);
2449 }
2450 } else if (mp->mnt_flag & MNT_ROOTFS) {
2451 mount_lock_destroy(mp);
2452 #if CONFIG_MACF
2453 mac_mount_label_destroy(mp);
2454 #endif
2455 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2456 } else {
2457 panic("dounmount: no coveredvp");
2458 }
2459 }
2460 return error;
2461 }
2462
2463 /*
2464 * Unmount any mounts in this filesystem.
2465 */
2466 void
2467 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2468 {
2469 mount_t smp;
2470 fsid_t *fsids, fsid;
2471 int fsids_sz;
2472 int count = 0, i, m = 0;
2473 vnode_t vp;
2474
2475 mount_list_lock();
2476
2477 // Get an array to hold the submounts' fsids.
2478 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2479 count++;
2480 fsids_sz = count * sizeof(fsid_t);
2481 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2482 if (fsids == NULL) {
2483 mount_list_unlock();
2484 goto out;
2485 }
2486 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2487
2488 /*
2489 * Fill the array with submount fsids.
2490 * Since mounts are always added to the tail of the mount list, the
2491 * list is always in mount order.
2492 * For each mount check if the mounted-on vnode belongs to a
2493 * mount that's already added to our array of mounts to be unmounted.
2494 */
2495 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2496 vp = smp->mnt_vnodecovered;
2497 if (vp == NULL) {
2498 continue;
2499 }
2500 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2501 for (i = 0; i <= m; i++) {
2502 if (fsids[i].val[0] == fsid.val[0] &&
2503 fsids[i].val[1] == fsid.val[1]) {
2504 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2505 break;
2506 }
2507 }
2508 }
2509 mount_list_unlock();
2510
2511 // Unmount the submounts in reverse order. Ignore errors.
2512 for (i = m; i > 0; i--) {
2513 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2514 if (smp) {
2515 mount_ref(smp, 0);
2516 mount_iterdrop(smp);
2517 (void) dounmount(smp, flags, 1, ctx);
2518 }
2519 }
2520 out:
2521 if (fsids) {
2522 FREE(fsids, M_TEMP);
2523 }
2524 }
2525
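/*
 * Drop one cross reference held on mp by its covered vnode dp.  If this was
 * the last cross reference and dp no longer has mp mounted on it, the mount
 * structure itself is destroyed and freed.
 */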
2526 void
2527 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2528 {
2529 vnode_lock(dp);
2530 mp->mnt_crossref--;
2531
2532 if (mp->mnt_crossref < 0) {
2533 panic("mount cross refs -ve");
2534 }
2535
2536 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2537 if (need_put) {
2538 vnode_put_locked(dp);
2539 }
2540 vnode_unlock(dp);
2541
2542 mount_lock_destroy(mp);
2543 #if CONFIG_MACF
2544 mac_mount_label_destroy(mp);
2545 #endif
2546 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2547 return;
2548 }
2549 if (need_put) {
2550 vnode_put_locked(dp);
2551 }
2552 vnode_unlock(dp);
2553 }
2554
2555
2556 /*
2557 * Sync each mounted filesystem.
2558 */
2559 #if DIAGNOSTIC
2560 int syncprt = 0;
2561 #endif
2562
2563 int print_vmpage_stat = 0;
2564
2565 /*
2566 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2567 * mounted read-write with the passed waitfor value.
2568 *
2569 * Parameters: mp mount-point descriptor per mounted file-system instance.
2570 * arg user argument (please see below)
2571 *
2572 * The user argument is a pointer to a 32-bit unsigned integer which describes the
2573 * type of waitfor value to set for calling VFS_SYNC(). If the user argument is
2574 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2575 * waitfor value.
2576 *
2577 * Returns: VFS_RETURNED
2578 */
2579 static int
2580 sync_callback(mount_t mp, void *arg)
2581 {
2582 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2583 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2584 unsigned waitfor = MNT_NOWAIT;
2585
2586 if (arg) {
2587 waitfor = *(uint32_t*)arg;
2588 }
2589
2590 /* Sanity check for flags - these are the only valid combinations for the flag bits */
2591 if (waitfor != MNT_WAIT &&
2592 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2593 waitfor != MNT_NOWAIT &&
2594 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2595 waitfor != MNT_DWAIT &&
2596 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2597 panic("Passed inappropriate waitfor %u to "
2598 "sync_callback()", waitfor);
2599 }
2600
2601 mp->mnt_flag &= ~MNT_ASYNC;
2602 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2603 if (asyncflag) {
2604 mp->mnt_flag |= MNT_ASYNC;
2605 }
2606 }
2607
2608 return VFS_RETURNED;
2609 }
2610
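/*
 * sync() system call: kick off a MNT_NOWAIT sync of every mounted,
 * writable filesystem.
 */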
2611 /* ARGSUSED */
2612 int
2613 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2614 {
2615 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2616
2617 if (print_vmpage_stat) {
2618 vm_countdirtypages();
2619 }
2620
2621 #if DIAGNOSTIC
2622 if (syncprt) {
2623 vfs_bufstats();
2624 }
2625 #endif /* DIAGNOSTIC */
2626 return 0;
2627 }
2628
2629 typedef enum {
2630 SYNC_ALL = 0,
2631 SYNC_ONLY_RELIABLE_MEDIA = 1,
2632 SYNC_ONLY_UNRELIABLE_MEDIA = 2
2633 } sync_type_t;
2634
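/*
 * vfs_iterate() callback used by sync_thread(): optionally filter mounts by
 * media reliability (local, non-virtual devices count as "reliable") before
 * handing off to sync_callback() with its default MNT_NOWAIT behaviour.
 */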
2635 static int
2636 sync_internal_callback(mount_t mp, void *arg)
2637 {
2638 if (arg) {
2639 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2640 (mp->mnt_flag & MNT_LOCAL);
2641 sync_type_t sync_type = *((sync_type_t *)arg);
2642
2643 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2644 return VFS_RETURNED;
2645 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2646 return VFS_RETURNED;
2647 }
2648 }
2649
2650 (void)sync_callback(mp, NULL);
2651
2652 return VFS_RETURNED;
2653 }
2654
2655 int sync_thread_state = 0;
2656 int sync_timeout_seconds = 5;
2657
2658 #define SYNC_THREAD_RUN 0x0001
2659 #define SYNC_THREAD_RUNNING 0x0002
2660
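/*
 * Worker thread started by sync_internal(): while new run requests keep
 * arriving, sync all writable filesystems -- reliable (local, non-virtual)
 * media first, unreliable media second -- then wake any waiters and exit.
 */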
2661 static void
2662 sync_thread(__unused void *arg, __unused wait_result_t wr)
2663 {
2664 sync_type_t sync_type;
2665
2666 lck_mtx_lock(sync_mtx_lck);
2667 while (sync_thread_state & SYNC_THREAD_RUN) {
2668 sync_thread_state &= ~SYNC_THREAD_RUN;
2669 lck_mtx_unlock(sync_mtx_lck);
2670
2671 sync_type = SYNC_ONLY_RELIABLE_MEDIA;
2672 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2673 sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
2674 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2675
2676 lck_mtx_lock(sync_mtx_lck);
2677 }
2678 /*
2679 * This wakeup _has_ to be issued before the lock is released, otherwise
2680 * we may end up waking up a thread in sync_internal which is
2681 * expecting a wakeup from a thread it just created and not from this
2682 * thread, which is about to exit.
2683 */
2684 wakeup(&sync_thread_state);
2685 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2686 lck_mtx_unlock(sync_mtx_lck);
2687
2688 if (print_vmpage_stat) {
2689 vm_countdirtypages();
2690 }
2691
2692 #if DIAGNOSTIC
2693 if (syncprt) {
2694 vfs_bufstats();
2695 }
2696 #endif /* DIAGNOSTIC */
2697 }
2698
2699 struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2700
2701 /*
2702 * An in-kernel sync for power management to call.
2703 * This function always returns within sync_timeout seconds.
2704 */
2705 __private_extern__ int
2706 sync_internal(void)
2707 {
2708 thread_t thd;
2709 int error;
2710 int thread_created = FALSE;
2711 struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
2712
2713 lck_mtx_lock(sync_mtx_lck);
2714 sync_thread_state |= SYNC_THREAD_RUN;
2715 if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
2716 int kr;
2717
2718 sync_thread_state |= SYNC_THREAD_RUNNING;
2719 kr = kernel_thread_start(sync_thread, NULL, &thd);
2720 if (kr != KERN_SUCCESS) {
2721 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2722 lck_mtx_unlock(sync_mtx_lck);
2723 printf("sync_thread failed\n");
2724 return 0;
2725 }
2726 thread_created = TRUE;
2727 }
2728
2729 error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
2730 (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2731 if (error) {
2732 struct timeval now;
2733
2734 microtime(&now);
2735 if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
2736 printf("sync timed out: %d sec\n", sync_timeout_seconds);
2737 sync_timeout_last_print.tv_sec = now.tv_sec;
2738 }
2739 }
2740
2741 if (thread_created) {
2742 thread_deallocate(thd);
2743 }
2744
2745 return 0;
2746 } /* end of sync_internal call */
2747
2748 /*
2749 * Change filesystem quotas.
2750 */
2751 #if QUOTA
2752 int
2753 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2754 {
2755 struct mount *mp;
2756 int error, quota_cmd, quota_status = 0;
2757 caddr_t datap;
2758 size_t fnamelen;
2759 struct nameidata nd;
2760 vfs_context_t ctx = vfs_context_current();
2761 struct dqblk my_dqblk = {};
2762
2763 AUDIT_ARG(uid, uap->uid);
2764 AUDIT_ARG(cmd, uap->cmd);
2765 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2766 uap->path, ctx);
2767 error = namei(&nd);
2768 if (error) {
2769 return error;
2770 }
2771 mp = nd.ni_vp->v_mount;
2772 vnode_put(nd.ni_vp);
2773 nameidone(&nd);
2774
2775 /* copyin any data we will need for downstream code */
2776 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2777
2778 switch (quota_cmd) {
2779 case Q_QUOTAON:
2780 /* uap->arg specifies a file from which to take the quotas */
2781 fnamelen = MAXPATHLEN;
2782 datap = kalloc(MAXPATHLEN);
2783 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2784 break;
2785 case Q_GETQUOTA:
2786 /* uap->arg is a pointer to a dqblk structure. */
2787 datap = (caddr_t) &my_dqblk;
2788 break;
2789 case Q_SETQUOTA:
2790 case Q_SETUSE:
2791 /* uap->arg is a pointer to a dqblk structure. */
2792 datap = (caddr_t) &my_dqblk;
2793 if (proc_is64bit(p)) {
2794 struct user_dqblk my_dqblk64;
2795 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
2796 if (error == 0) {
2797 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2798 }
2799 } else {
2800 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
2801 }
2802 break;
2803 case Q_QUOTASTAT:
2804 /* uap->arg is a pointer to an integer */
2805 datap = (caddr_t) &quota_status;
2806 break;
2807 default:
2808 datap = NULL;
2809 break;
2810 } /* switch */
2811
2812 if (error == 0) {
2813 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2814 }
2815
2816 switch (quota_cmd) {
2817 case Q_QUOTAON:
2818 if (datap != NULL) {
2819 kfree(datap, MAXPATHLEN);
2820 }
2821 break;
2822 case Q_GETQUOTA:
2823 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2824 if (error == 0) {
2825 if (proc_is64bit(p)) {
2826 struct user_dqblk my_dqblk64;
2827
2828 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
2829 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2830 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
2831 } else {
2832 error = copyout(datap, uap->arg, sizeof(struct dqblk));
2833 }
2834 }
2835 break;
2836 case Q_QUOTASTAT:
2837 /* uap->arg is a pointer to an integer */
2838 if (error == 0) {
2839 error = copyout(datap, uap->arg, sizeof(quota_status));
2840 }
2841 break;
2842 default:
2843 break;
2844 } /* switch */
2845
2846 return error;
2847 }
2848 #else
2849 int
2850 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2851 {
2852 return EOPNOTSUPP;
2853 }
2854 #endif /* QUOTA */
2855
2856 /*
2857 * Get filesystem statistics.
2858 *
2859 * Returns: 0 Success
2860 * namei:???
2861 * vfs_update_vfsstat:???
2862 * munge_statfs:EFAULT
2863 */
2864 /* ARGSUSED */
2865 int
2866 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2867 {
2868 struct mount *mp;
2869 struct vfsstatfs *sp;
2870 int error;
2871 struct nameidata nd;
2872 vfs_context_t ctx = vfs_context_current();
2873 vnode_t vp;
2874
2875 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2876 UIO_USERSPACE, uap->path, ctx);
2877 error = namei(&nd);
2878 if (error != 0) {
2879 return error;
2880 }
2881 vp = nd.ni_vp;
2882 mp = vp->v_mount;
2883 sp = &mp->mnt_vfsstat;
2884 nameidone(&nd);
2885
2886 #if CONFIG_MACF
2887 error = mac_mount_check_stat(ctx, mp);
2888 if (error != 0) {
2889 vnode_put(vp);
2890 return error;
2891 }
2892 #endif
2893
2894 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2895 if (error != 0) {
2896 vnode_put(vp);
2897 return error;
2898 }
2899
2900 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2901 vnode_put(vp);
2902 return error;
2903 }
2904
2905 /*
2906 * Get filesystem statistics.
2907 */
2908 /* ARGSUSED */
2909 int
2910 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2911 {
2912 vnode_t vp;
2913 struct mount *mp;
2914 struct vfsstatfs *sp;
2915 int error;
2916
2917 AUDIT_ARG(fd, uap->fd);
2918
2919 if ((error = file_vnode(uap->fd, &vp))) {
2920 return error;
2921 }
2922
2923 error = vnode_getwithref(vp);
2924 if (error) {
2925 file_drop(uap->fd);
2926 return error;
2927 }
2928
2929 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2930
2931 mp = vp->v_mount;
2932 if (!mp) {
2933 error = EBADF;
2934 goto out;
2935 }
2936
2937 #if CONFIG_MACF
2938 error = mac_mount_check_stat(vfs_context_current(), mp);
2939 if (error != 0) {
2940 goto out;
2941 }
2942 #endif
2943
2944 sp = &mp->mnt_vfsstat;
2945 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2946 goto out;
2947 }
2948
2949 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2950
2951 out:
2952 file_drop(uap->fd);
2953 vnode_put(vp);
2954
2955 return error;
2956 }
2957
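/*
 * Fill in a struct statfs64 from the mount's cached vfsstat, honoring any
 * filesystem type name override and flagging the root data volume in
 * f_flags_ext where applicable.
 */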
2958 void
2959 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
2960 {
2961 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
2962
2963 bzero(sfs, sizeof(*sfs));
2964
2965 sfs->f_bsize = vsfs->f_bsize;
2966 sfs->f_iosize = (int32_t)vsfs->f_iosize;
2967 sfs->f_blocks = vsfs->f_blocks;
2968 sfs->f_bfree = vsfs->f_bfree;
2969 sfs->f_bavail = vsfs->f_bavail;
2970 sfs->f_files = vsfs->f_files;
2971 sfs->f_ffree = vsfs->f_ffree;
2972 sfs->f_fsid = vsfs->f_fsid;
2973 sfs->f_owner = vsfs->f_owner;
2974 sfs->f_type = mp->mnt_vtable->vfc_typenum;
2975 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2976 sfs->f_fssubtype = vsfs->f_fssubtype;
2977 sfs->f_flags_ext = ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS)) ? MNT_EXT_ROOT_DATA_VOL : 0;
2978 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2979 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2980 } else {
2981 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
2982 }
2983 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
2984 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
2985 }
2986
2987 /*
2988 * Get file system statistics in 64-bit mode
2989 */
2990 int
2991 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2992 {
2993 struct mount *mp;
2994 int error;
2995 struct nameidata nd;
2996 struct statfs64 sfs;
2997 vfs_context_t ctxp = vfs_context_current();
2998 vnode_t vp;
2999
3000 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3001 UIO_USERSPACE, uap->path, ctxp);
3002 error = namei(&nd);
3003 if (error != 0) {
3004 return error;
3005 }
3006 vp = nd.ni_vp;
3007 mp = vp->v_mount;
3008 nameidone(&nd);
3009
3010 #if CONFIG_MACF
3011 error = mac_mount_check_stat(ctxp, mp);
3012 if (error != 0) {
3013 vnode_put(vp);
3014 return error;
3015 }
3016 #endif
3017
3018 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3019 if (error != 0) {
3020 vnode_put(vp);
3021 return error;
3022 }
3023
3024 vfs_get_statfs64(mp, &sfs);
3025 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3026 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3027 /* This process does not want to see a separate data volume mountpoint */
3028 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3029 }
3030 error = copyout(&sfs, uap->buf, sizeof(sfs));
3031 vnode_put(vp);
3032
3033 return error;
3034 }
3035
3036 /*
3037 * Get file system statistics in 64-bit mode
3038 */
3039 int
3040 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3041 {
3042 struct vnode *vp;
3043 struct mount *mp;
3044 struct statfs64 sfs;
3045 int error;
3046
3047 AUDIT_ARG(fd, uap->fd);
3048
3049 if ((error = file_vnode(uap->fd, &vp))) {
3050 return error;
3051 }
3052
3053 error = vnode_getwithref(vp);
3054 if (error) {
3055 file_drop(uap->fd);
3056 return error;
3057 }
3058
3059 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3060
3061 mp = vp->v_mount;
3062 if (!mp) {
3063 error = EBADF;
3064 goto out;
3065 }
3066
3067 #if CONFIG_MACF
3068 error = mac_mount_check_stat(vfs_context_current(), mp);
3069 if (error != 0) {
3070 goto out;
3071 }
3072 #endif
3073
3074 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3075 goto out;
3076 }
3077
3078 vfs_get_statfs64(mp, &sfs);
3079 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3080 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3081 /* This process does not want to see a separate data volume mountpoint */
3082 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3083 }
3084 error = copyout(&sfs, uap->buf, sizeof(sfs));
3085
3086 out:
3087 file_drop(uap->fd);
3088 vnode_put(vp);
3089
3090 return error;
3091 }
3092
3093 struct getfsstat_struct {
3094 user_addr_t sfsp;
3095 user_addr_t *mp;
3096 int count;
3097 int maxcount;
3098 int flags;
3099 int error;
3100 };
3101
3102
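/*
 * vfs_iterate() callback for __mac_getfsstat(): for each mount that fits in
 * the user buffer, optionally refresh the cached vfsstat and copy out one
 * statfs record in the caller's 32- or 64-bit layout (via munge_statfs());
 * the total mount count is accumulated regardless.
 */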
3103 static int
3104 getfsstat_callback(mount_t mp, void * arg)
3105 {
3106 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3107 struct vfsstatfs *sp;
3108 int error, my_size;
3109 vfs_context_t ctx = vfs_context_current();
3110
3111 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3112 #if CONFIG_MACF
3113 error = mac_mount_check_stat(ctx, mp);
3114 if (error != 0) {
3115 fstp->error = error;
3116 return VFS_RETURNED_DONE;
3117 }
3118 #endif
3119 sp = &mp->mnt_vfsstat;
3120 /*
3121 * If MNT_NOWAIT is specified, do not refresh the
3122 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3123 */
3124 if ((mp->mnt_lflag & MNT_LDEAD) ||
3125 (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3126 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3127 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3128 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3129 return VFS_RETURNED;
3130 }
3131
3132 /*
3133 * Need to handle LP64 version of struct statfs
3134 */
3135 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3136 if (error) {
3137 fstp->error = error;
3138 return VFS_RETURNED_DONE;
3139 }
3140 fstp->sfsp += my_size;
3141
3142 if (fstp->mp) {
3143 #if CONFIG_MACF
3144 error = mac_mount_label_get(mp, *fstp->mp);
3145 if (error) {
3146 fstp->error = error;
3147 return VFS_RETURNED_DONE;
3148 }
3149 #endif
3150 fstp->mp++;
3151 }
3152 }
3153 fstp->count++;
3154 return VFS_RETURNED;
3155 }
3156
3157 /*
3158 * Get statistics on all filesystems.
3159 */
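/*
 * Illustrative userspace usage (not part of this file): callers typically
 * probe the required buffer size first, for example
 *
 *     int n = getfsstat(NULL, 0, MNT_NOWAIT);           // number of mounts
 *     struct statfs *buf = malloc(n * sizeof(*buf));
 *     n = getfsstat(buf, n * sizeof(*buf), MNT_NOWAIT); // fill the buffer
 *
 * Both calls end up in __mac_getfsstat() below.
 */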
3160 int
3161 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3162 {
3163 struct __mac_getfsstat_args muap;
3164
3165 muap.buf = uap->buf;
3166 muap.bufsize = uap->bufsize;
3167 muap.mac = USER_ADDR_NULL;
3168 muap.macsize = 0;
3169 muap.flags = uap->flags;
3170
3171 return __mac_getfsstat(p, &muap, retval);
3172 }
3173
3174 /*
3175 * __mac_getfsstat: Get MAC-related file system statistics
3176 *
3177 * Parameters: p (ignored)
3178 * uap User argument descriptor (see below)
3179 * retval Count of file system statistics (N stats)
3180 *
3181 * Indirect: uap->bufsize Buffer size
3182 * uap->macsize MAC info size
3183 * uap->buf Buffer where information will be returned
3184 * uap->mac MAC info
3185 * uap->flags File system flags
3186 *
3187 *
3188 * Returns: 0 Success
3189 * !0 Not success
3190 *
3191 */
3192 int
3193 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3194 {
3195 user_addr_t sfsp;
3196 user_addr_t *mp;
3197 size_t count, maxcount, bufsize, macsize;
3198 struct getfsstat_struct fst;
3199
3200 if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3201 return EINVAL;
3202 }
3203
3204 bufsize = (size_t) uap->bufsize;
3205 macsize = (size_t) uap->macsize;
3206
3207 if (IS_64BIT_PROCESS(p)) {
3208 maxcount = bufsize / sizeof(struct user64_statfs);
3209 } else {
3210 maxcount = bufsize / sizeof(struct user32_statfs);
3211 }
3212 sfsp = uap->buf;
3213 count = 0;
3214
3215 mp = NULL;
3216
3217 #if CONFIG_MACF
3218 if (uap->mac != USER_ADDR_NULL) {
3219 u_int32_t *mp0;
3220 int error;
3221 unsigned int i;
3222
3223 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3224 if (count != maxcount) {
3225 return EINVAL;
3226 }
3227
3228 /* Copy in the array */
3229 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
3230 if (mp0 == NULL) {
3231 return ENOMEM;
3232 }
3233
3234 error = copyin(uap->mac, mp0, macsize);
3235 if (error) {
3236 FREE(mp0, M_MACTEMP);
3237 return error;
3238 }
3239
3240 /* Normalize to an array of user_addr_t */
3241 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
3242 if (mp == NULL) {
3243 FREE(mp0, M_MACTEMP);
3244 return ENOMEM;
3245 }
3246
3247 for (i = 0; i < count; i++) {
3248 if (IS_64BIT_PROCESS(p)) {
3249 mp[i] = ((user_addr_t *)mp0)[i];
3250 } else {
3251 mp[i] = (user_addr_t)mp0[i];
3252 }
3253 }
3254 FREE(mp0, M_MACTEMP);
3255 }
3256 #endif
3257
3258
3259 fst.sfsp = sfsp;
3260 fst.mp = mp;
3261 fst.flags = uap->flags;
3262 fst.count = 0;
3263 fst.error = 0;
3264 fst.maxcount = maxcount;
3265
3266
3267 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3268
3269 if (mp) {
3270 FREE(mp, M_MACTEMP);
3271 }
3272
3273 if (fst.error) {
3274 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3275 return fst.error;
3276 }
3277
3278 if (fst.sfsp && fst.count > fst.maxcount) {
3279 *retval = fst.maxcount;
3280 } else {
3281 *retval = fst.count;
3282 }
3283 return 0;
3284 }
3285
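/*
 * vfs_iterate() callback for getfsstat64(): same idea as getfsstat_callback(),
 * but copies out the fixed-size struct statfs64 built by vfs_get_statfs64().
 */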
3286 static int
3287 getfsstat64_callback(mount_t mp, void * arg)
3288 {
3289 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3290 struct vfsstatfs *sp;
3291 struct statfs64 sfs;
3292 int error;
3293
3294 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3295 #if CONFIG_MACF
3296 error = mac_mount_check_stat(vfs_context_current(), mp);
3297 if (error != 0) {
3298 fstp->error = error;
3299 return VFS_RETURNED_DONE;
3300 }
3301 #endif
3302 sp = &mp->mnt_vfsstat;
3303 /*
3304 * If MNT_NOWAIT is specified, do not refresh the fsstat
3305 * cache. MNT_WAIT overrides MNT_NOWAIT.
3306 *
3307 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3308 * getfsstat, since the constants are out of the same
3309 * namespace.
3310 */
3311 if ((mp->mnt_lflag & MNT_LDEAD) ||
3312 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3313 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3314 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3315 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3316 return VFS_RETURNED;
3317 }
3318
3319 vfs_get_statfs64(mp, &sfs);
3320 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3321 if (error) {
3322 fstp->error = error;
3323 return VFS_RETURNED_DONE;
3324 }
3325 fstp->sfsp += sizeof(sfs);
3326 }
3327 fstp->count++;
3328 return VFS_RETURNED;
3329 }
3330
3331 /*
3332 * Get statistics on all file systems in 64 bit mode.
3333 */
3334 int
3335 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3336 {
3337 user_addr_t sfsp;
3338 int count, maxcount;
3339 struct getfsstat_struct fst;
3340
3341 maxcount = uap->bufsize / sizeof(struct statfs64);
3342
3343 sfsp = uap->buf;
3344 count = 0;
3345
3346 fst.sfsp = sfsp;
3347 fst.flags = uap->flags;
3348 fst.count = 0;
3349 fst.error = 0;
3350 fst.maxcount = maxcount;
3351
3352 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3353
3354 if (fst.error) {
3355 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3356 return fst.error;
3357 }
3358
3359 if (fst.sfsp && fst.count > fst.maxcount) {
3360 *retval = fst.maxcount;
3361 } else {
3362 *retval = fst.count;
3363 }
3364
3365 return 0;
3366 }
3367
3368 /*
3369 * Gets the vnode associated with the file descriptor passed
3370 * as input.
3371 *
3372 * INPUT
3373 * ctx - vfs context of caller
3374 * fd - file descriptor for which vnode is required.
3375 * vpp - Pointer to pointer to vnode to be returned.
3376 *
3377 * The vnode is returned with an iocount so any vnode obtained
3378 * by this call needs a vnode_put
3379 *
3380 */
3381 int
3382 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3383 {
3384 int error;
3385 vnode_t vp;
3386 struct fileproc *fp;
3387 proc_t p = vfs_context_proc(ctx);
3388
3389 *vpp = NULLVP;
3390
3391 error = fp_getfvp(p, fd, &fp, &vp);
3392 if (error) {
3393 return error;
3394 }
3395
3396 error = vnode_getwithref(vp);
3397 if (error) {
3398 (void)fp_drop(p, fd, fp, 0);
3399 return error;
3400 }
3401
3402 (void)fp_drop(p, fd, fp, 0);
3403 *vpp = vp;
3404 return error;
3405 }
3406
3407 /*
3408 * Wrapper function around namei to start lookup from a directory
3409 * specified by a file descriptor (dirfd).
3410 *
3411 * In addition to all the errors returned by namei, this call can
3412 * return ENOTDIR if the file descriptor does not refer to a directory,
3413 * and EBADF if the file descriptor is not valid.
3414 */
3415 int
3416 nameiat(struct nameidata *ndp, int dirfd)
3417 {
3418 if ((dirfd != AT_FDCWD) &&
3419 !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
3420 !(ndp->ni_cnd.cn_flags & USEDVP)) {
3421 int error = 0;
3422 char c;
3423
3424 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3425 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3426 if (error) {
3427 return error;
3428 }
3429 } else {
3430 c = *((char *)(ndp->ni_dirp));
3431 }
3432
3433 if (c != '/') {
3434 vnode_t dvp_at;
3435
3436 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3437 &dvp_at);
3438 if (error) {
3439 return error;
3440 }
3441
3442 if (vnode_vtype(dvp_at) != VDIR) {
3443 vnode_put(dvp_at);
3444 return ENOTDIR;
3445 }
3446
3447 ndp->ni_dvp = dvp_at;
3448 ndp->ni_cnd.cn_flags |= USEDVP;
3449 error = namei(ndp);
3450 ndp->ni_cnd.cn_flags &= ~USEDVP;
3451 vnode_put(dvp_at);
3452 return error;
3453 }
3454 }
3455
3456 return namei(ndp);
3457 }
3458
3459 /*
3460 * Change current working directory to a given file descriptor.
3461 */
3462 /* ARGSUSED */
3463 static int
3464 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3465 {
3466 struct filedesc *fdp = p->p_fd;
3467 vnode_t vp;
3468 vnode_t tdp;
3469 vnode_t tvp;
3470 struct mount *mp;
3471 int error;
3472 vfs_context_t ctx = vfs_context_current();
3473
3474 AUDIT_ARG(fd, uap->fd);
3475 if (per_thread && uap->fd == -1) {
3476 /*
3477 * Switching back from per-thread to per-process CWD; verify we
3478 * do in fact have one before proceeding. The only success case
3479 * for this code path is to return 0 preemptively after zapping
3480 * the thread structure contents.
3481 */
3482 thread_t th = vfs_context_thread(ctx);
3483 if (th) {
3484 uthread_t uth = get_bsdthread_info(th);
3485 tvp = uth->uu_cdir;
3486 uth->uu_cdir = NULLVP;
3487 if (tvp != NULLVP) {
3488 vnode_rele(tvp);
3489 return 0;
3490 }
3491 }
3492 return EBADF;
3493 }
3494
3495 if ((error = file_vnode(uap->fd, &vp))) {
3496 return error;
3497 }
3498 if ((error = vnode_getwithref(vp))) {
3499 file_drop(uap->fd);
3500 return error;
3501 }
3502
3503 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3504
3505 if (vp->v_type != VDIR) {
3506 error = ENOTDIR;
3507 goto out;
3508 }
3509
3510 #if CONFIG_MACF
3511 error = mac_vnode_check_chdir(ctx, vp);
3512 if (error) {
3513 goto out;
3514 }
3515 #endif
3516 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3517 if (error) {
3518 goto out;
3519 }
3520
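/*
 * If the target directory is itself covered by a mount, descend through the
 * chain of covering filesystems to the topmost root before making it the cwd.
 */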
3521 while (!error && (mp = vp->v_mountedhere) != NULL) {
3522 if (vfs_busy(mp, LK_NOWAIT)) {
3523 error = EACCES;
3524 goto out;
3525 }
3526 error = VFS_ROOT(mp, &tdp, ctx);
3527 vfs_unbusy(mp);
3528 if (error) {
3529 break;
3530 }
3531 vnode_put(vp);
3532 vp = tdp;
3533 }
3534 if (error) {
3535 goto out;
3536 }
3537 if ((error = vnode_ref(vp))) {
3538 goto out;
3539 }
3540 vnode_put(vp);
3541
3542 if (per_thread) {
3543 thread_t th = vfs_context_thread(ctx);
3544 if (th) {
3545 uthread_t uth = get_bsdthread_info(th);
3546 tvp = uth->uu_cdir;
3547 uth->uu_cdir = vp;
3548 OSBitOrAtomic(P_THCWD, &p->p_flag);
3549 } else {
3550 vnode_rele(vp);
3551 return ENOENT;
3552 }
3553 } else {
3554 proc_fdlock(p);
3555 tvp = fdp->fd_cdir;
3556 fdp->fd_cdir = vp;
3557 proc_fdunlock(p);
3558 }
3559
3560 if (tvp) {
3561 vnode_rele(tvp);
3562 }
3563 file_drop(uap->fd);
3564
3565 return 0;
3566 out:
3567 vnode_put(vp);
3568 file_drop(uap->fd);
3569
3570 return error;
3571 }
3572
3573 int
3574 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3575 {
3576 return common_fchdir(p, uap, 0);
3577 }
3578
3579 int
3580 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3581 {
3582 return common_fchdir(p, (void *)uap, 1);
3583 }
3584
3585
3586 /*
3587 * Change current working directory (".").
3588 *
3589 * Returns: 0 Success
3590 * change_dir:ENOTDIR
3591 * change_dir:???
3592 * vnode_ref:ENOENT No such file or directory
3593 */
3594 /* ARGSUSED */
3595 int
3596 chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
3597 {
3598 struct filedesc *fdp = p->p_fd;
3599 int error;
3600 vnode_t tvp;
3601
3602 error = change_dir(ndp, ctx);
3603 if (error) {
3604 return error;
3605 }
3606 if ((error = vnode_ref(ndp->ni_vp))) {
3607 vnode_put(ndp->ni_vp);
3608 return error;
3609 }
3610 /*
3611 * drop the iocount we picked up in change_dir
3612 */
3613 vnode_put(ndp->ni_vp);
3614
3615 if (per_thread) {
3616 thread_t th = vfs_context_thread(ctx);
3617 if (th) {
3618 uthread_t uth = get_bsdthread_info(th);
3619 tvp = uth->uu_cdir;
3620 uth->uu_cdir = ndp->ni_vp;
3621 OSBitOrAtomic(P_THCWD, &p->p_flag);
3622 } else {
3623 vnode_rele(ndp->ni_vp);
3624 return ENOENT;
3625 }
3626 } else {
3627 proc_fdlock(p);
3628 tvp = fdp->fd_cdir;
3629 fdp->fd_cdir = ndp->ni_vp;
3630 proc_fdunlock(p);
3631 }
3632
3633 if (tvp) {
3634 vnode_rele(tvp);
3635 }
3636
3637 return 0;
3638 }
3639
3640
3641 /*
3642 * Change current working directory (".").
3643 *
3644 * Returns: 0 Success
3645 * chdir_internal:ENOTDIR
3646 * chdir_internal:ENOENT No such file or directory
3647 * chdir_internal:???
3648 */
3649 /* ARGSUSED */
3650 static int
3651 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3652 {
3653 struct nameidata nd;
3654 vfs_context_t ctx = vfs_context_current();
3655
3656 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3657 UIO_USERSPACE, uap->path, ctx);
3658
3659 return chdir_internal(p, ctx, &nd, per_thread);
3660 }
3661
3662
3663 /*
3664 * chdir
3665 *
3666 * Change current working directory (".") for the entire process
3667 *
3668 * Parameters: p Process requesting the call
3669 * uap User argument descriptor (see below)
3670 * retval (ignored)
3671 *
3672 * Indirect parameters: uap->path Directory path
3673 *
3674 * Returns: 0 Success
3675 * common_chdir: ENOTDIR
3676 * common_chdir: ENOENT No such file or directory
3677 * common_chdir: ???
3678 *
3679 */
3680 int
3681 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3682 {
3683 return common_chdir(p, (void *)uap, 0);
3684 }
3685
3686 /*
3687 * __pthread_chdir
3688 *
3689 * Change current working directory (".") for a single thread
3690 *
3691 * Parameters: p Process requesting the call
3692 * uap User argument descriptor (see below)
3693 * retval (ignored)
3694 *
3695 * Indirect parameters: uap->path Directory path
3696 *
3697 * Returns: 0 Success
3698 * common_chdir: ENOTDIR
3699 * common_chdir: ENOENT No such file or directory
3700 * common_chdir: ???
3701 *
3702 */
3703 int
3704 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3705 {
3706 return common_chdir(p, (void *)uap, 1);
3707 }
3708
3709
3710 /*
3711 * Change notion of root (``/'') directory.
3712 */
3713 /* ARGSUSED */
3714 int
3715 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3716 {
3717 struct filedesc *fdp = p->p_fd;
3718 int error;
3719 struct nameidata nd;
3720 vnode_t tvp;
3721 vfs_context_t ctx = vfs_context_current();
3722
3723 if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
3724 return error;
3725 }
3726
3727 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3728 UIO_USERSPACE, uap->path, ctx);
3729 error = change_dir(&nd, ctx);
3730 if (error) {
3731 return error;
3732 }
3733
3734 #if CONFIG_MACF
3735 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3736 &nd.ni_cnd);
3737 if (error) {
3738 vnode_put(nd.ni_vp);
3739 return error;
3740 }
3741 #endif
3742
3743 if ((error = vnode_ref(nd.ni_vp))) {
3744 vnode_put(nd.ni_vp);
3745 return error;
3746 }
3747 vnode_put(nd.ni_vp);
3748
3749 proc_fdlock(p);
3750 tvp = fdp->fd_rdir;
3751 fdp->fd_rdir = nd.ni_vp;
3752 fdp->fd_flags |= FD_CHROOT;
3753 proc_fdunlock(p);
3754
3755 if (tvp != NULL) {
3756 vnode_rele(tvp);
3757 }
3758
3759 return 0;
3760 }
3761
3762 /*
3763 * Common routine for chroot and chdir.
3764 *
3765 * Returns: 0 Success
3766 * ENOTDIR Not a directory
3767 * namei:??? [anything namei can return]
3768 * vnode_authorize:??? [anything vnode_authorize can return]
3769 */
3770 static int
3771 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3772 {
3773 vnode_t vp;
3774 int error;
3775
3776 if ((error = namei(ndp))) {
3777 return error;
3778 }
3779 nameidone(ndp);
3780 vp = ndp->ni_vp;
3781
3782 if (vp->v_type != VDIR) {
3783 vnode_put(vp);
3784 return ENOTDIR;
3785 }
3786
3787 #if CONFIG_MACF
3788 error = mac_vnode_check_chdir(ctx, vp);
3789 if (error) {
3790 vnode_put(vp);
3791 return error;
3792 }
3793 #endif
3794
3795 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3796 if (error) {
3797 vnode_put(vp);
3798 return error;
3799 }
3800
3801 return error;
3802 }
3803
3804 /*
3805 * Allocate the per-fd vnode data (for directories) associated with the file glob.
3806 */
3807 struct fd_vn_data *
3808 fg_vn_data_alloc(void)
3809 {
3810 struct fd_vn_data *fvdata;
3811
3812 /* Allocate per fd vnode data */
3813 MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3814 M_FD_VN_DATA, M_WAITOK | M_ZERO);
3815 lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3816 return fvdata;
3817 }
3818
3819 /*
3820 * Free the vnode data (for directories) associated with the file glob.
3821 */
3822 void
3823 fg_vn_data_free(void *fgvndata)
3824 {
3825 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3826
3827 if (fvdata->fv_buf) {
3828 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3829 }
3830 lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3831 FREE(fvdata, M_FD_VN_DATA);
3832 }
3833
3834 /*
3835 * Check permissions, allocate an open file structure,
3836 * and call the device open routine if any.
3837 *
3838 * Returns: 0 Success
3839 * EINVAL
3840 * EINTR
3841 * falloc:ENFILE
3842 * falloc:EMFILE
3843 * falloc:ENOMEM
3844 * vn_open_auth:???
3845 * dupfdopen:???
3846 * VNOP_ADVLOCK:???
3847 * vnode_setsize:???
3848 *
3849 * XXX Need to implement uid, gid
3850 */
3851 int
3852 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3853 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3854 int32_t *retval)
3855 {
3856 proc_t p = vfs_context_proc(ctx);
3857 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3858 struct fileproc *fp;
3859 vnode_t vp;
3860 int flags, oflags;
3861 int type, indx, error;
3862 struct flock lf;
3863 struct vfs_context context;
3864
3865 oflags = uflags;
3866
3867 if ((oflags & O_ACCMODE) == O_ACCMODE) {
3868 return EINVAL;
3869 }
3870
3871 flags = FFLAGS(uflags);
3872 CLR(flags, FENCRYPTED);
3873 CLR(flags, FUNENCRYPTED);
3874
3875 AUDIT_ARG(fflags, oflags);
3876 AUDIT_ARG(mode, vap->va_mode);
3877
3878 if ((error = falloc_withalloc(p,
3879 &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3880 return error;
3881 }
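/*
 * Prime uu_dupfd with a negative sentinel; a device open routine (fdopen)
 * may replace it with a descriptor to duplicate, which the ENODEV/ENXIO
 * error path below hands to dupfdopen().
 */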
3882 uu->uu_dupfd = -indx - 1;
3883
3884 if ((error = vn_open_auth(ndp, &flags, vap))) {
3885 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) { /* XXX from fdopen */
3886 if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3887 fp_drop(p, indx, NULL, 0);
3888 *retval = indx;
3889 return 0;
3890 }
3891 }
3892 if (error == ERESTART) {
3893 error = EINTR;
3894 }
3895 fp_free(p, indx, fp);
3896 return error;
3897 }
3898 uu->uu_dupfd = 0;
3899 vp = ndp->ni_vp;
3900
3901 fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3902 fp->f_fglob->fg_ops = &vnops;
3903 fp->f_fglob->fg_data = (caddr_t)vp;
3904
3905 if (flags & (O_EXLOCK | O_SHLOCK)) {
3906 lf.l_whence = SEEK_SET;
3907 lf.l_start = 0;
3908 lf.l_len = 0;
3909 if (flags & O_EXLOCK) {
3910 lf.l_type = F_WRLCK;
3911 } else {
3912 lf.l_type = F_RDLCK;
3913 }
3914 type = F_FLOCK;
3915 if ((flags & FNONBLOCK) == 0) {
3916 type |= F_WAIT;
3917 }
3918 #if CONFIG_MACF
3919 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3920 F_SETLK, &lf);
3921 if (error) {
3922 goto bad;
3923 }
3924 #endif
3925 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
3926 goto bad;
3927 }
3928 fp->f_fglob->fg_flag |= FHASLOCK;
3929 }
3930
3931 /* try to truncate by setting the size attribute */
3932 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
3933 goto bad;
3934 }
3935
3936 /*
3937 * For directories we hold some additional information in the fd.
3938 */
3939 if (vnode_vtype(vp) == VDIR) {
3940 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3941 } else {
3942 fp->f_fglob->fg_vn_data = NULL;
3943 }
3944
3945 vnode_put(vp);
3946
3947 /*
3948 * The first terminal open (without O_NOCTTY) by a session leader
3949 * results in it being set as the controlling terminal.
3950 */
3951 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3952 !(flags & O_NOCTTY)) {
3953 int tmp = 0;
3954
3955 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3956 (caddr_t)&tmp, ctx);
3957 }
3958
3959 proc_fdlock(p);
3960 if (flags & O_CLOEXEC) {
3961 *fdflags(p, indx) |= UF_EXCLOSE;
3962 }
3963 if (flags & O_CLOFORK) {
3964 *fdflags(p, indx) |= UF_FORKCLOSE;
3965 }
3966 procfdtbl_releasefd(p, indx, NULL);
3967
3968 #if CONFIG_SECLUDED_MEMORY
3969 if (secluded_for_filecache &&
3970 FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
3971 vnode_vtype(vp) == VREG) {
3972 memory_object_control_t moc;
3973
3974 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
3975
3976 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
3977 /* nothing to do... */
3978 } else if (fp->f_fglob->fg_flag & FWRITE) {
3979 /* writable -> no longer eligible for secluded pages */
3980 memory_object_mark_eligible_for_secluded(moc,
3981 FALSE);
3982 } else if (secluded_for_filecache == 1) {
3983 char pathname[32] = { 0, };
3984 size_t copied;
3985 /* XXX FBDP: better way to detect /Applications/ ? */
3986 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3987 (void)copyinstr(ndp->ni_dirp,
3988 pathname,
3989 sizeof(pathname),
3990 &copied);
3991 } else {
3992 copystr(CAST_DOWN(void *, ndp->ni_dirp),
3993 pathname,
3994 sizeof(pathname),
3995 &copied);
3996 }
3997 pathname[sizeof(pathname) - 1] = '\0';
3998 if (strncmp(pathname,
3999 "/Applications/",
4000 strlen("/Applications/")) == 0 &&
4001 strncmp(pathname,
4002 "/Applications/Camera.app/",
4003 strlen("/Applications/Camera.app/")) != 0) {
4004 /*
4005 * not writable
4006 * AND from "/Applications/"
4007 * AND not from "/Applications/Camera.app/"
4008 * ==> eligible for secluded
4009 */
4010 memory_object_mark_eligible_for_secluded(moc,
4011 TRUE);
4012 }
4013 } else if (secluded_for_filecache == 2) {
4014 #if __arm64__
4015 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
4016 #elif __arm__
4017 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
4018 #else
4019 /* not implemented... */
4020 #endif
4021 size_t len = strlen(vp->v_name);
4022 if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
4023 !strncmp(vp->v_name, "dyld", len) ||
4024 !strncmp(vp->v_name, "launchd", len) ||
4025 !strncmp(vp->v_name, "Camera", len) ||
4026 !strncmp(vp->v_name, "mediaserverd", len) ||
4027 !strncmp(vp->v_name, "SpringBoard", len) ||
4028 !strncmp(vp->v_name, "backboardd", len)) {
4029 /*
4030 * This file matters when launching Camera:
4031 * do not store its contents in the secluded
4032 * pool that will be drained on Camera launch.
4033 */
4034 memory_object_mark_eligible_for_secluded(moc,
4035 FALSE);
4036 }
4037 }
4038 }
4039 #endif /* CONFIG_SECLUDED_MEMORY */
4040
4041 fp_drop(p, indx, fp, 1);
4042 proc_fdunlock(p);
4043
4044 *retval = indx;
4045
4046 return 0;
4047 bad:
4048 context = *vfs_context_current();
4049 context.vc_ucred = fp->f_fglob->fg_cred;
4050
4051 if ((fp->f_fglob->fg_flag & FHASLOCK) &&
4052 (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
4053 lf.l_whence = SEEK_SET;
4054 lf.l_start = 0;
4055 lf.l_len = 0;
4056 lf.l_type = F_UNLCK;
4057
4058 (void)VNOP_ADVLOCK(
4059 vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
4060 }
4061
4062 vn_close(vp, fp->f_fglob->fg_flag, &context);
4063 vnode_put(vp);
4064 fp_free(p, indx, fp);
4065
4066 return error;
4067 }
4068
4069 /*
4070 * While most of the *at syscall handlers can call nameiat() which
4071 * is a wrapper around namei, the use of namei and initialisation
4072 * of nameidata are far removed and in different functions - namei
4073 * gets called in vn_open_auth for open1. So we'll just do here what
4074 * nameiat() does.
4075 */
4076 static int
4077 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4078 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
4079 int dirfd)
4080 {
4081 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4082 int error;
4083 char c;
4084
4085 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4086 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4087 if (error) {
4088 return error;
4089 }
4090 } else {
4091 c = *((char *)(ndp->ni_dirp));
4092 }
4093
4094 if (c != '/') {
4095 vnode_t dvp_at;
4096
4097 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4098 &dvp_at);
4099 if (error) {
4100 return error;
4101 }
4102
4103 if (vnode_vtype(dvp_at) != VDIR) {
4104 vnode_put(dvp_at);
4105 return ENOTDIR;
4106 }
4107
4108 ndp->ni_dvp = dvp_at;
4109 ndp->ni_cnd.cn_flags |= USEDVP;
4110 error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
4111 retval);
4112 vnode_put(dvp_at);
4113 return error;
4114 }
4115 }
4116
4117 return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
4118 }
4119
4120 /*
4121 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4122 *
4123 * Parameters: p Process requesting the open
4124 * uap User argument descriptor (see below)
4125 * retval Pointer to an area to receive the
4126 * return value from the system call
4127 *
4128 * Indirect: uap->path Path to open (same as 'open')
4129 * uap->flags Flags to open (same as 'open')
4130 * uap->uid UID to set, if creating
4131 * uap->gid GID to set, if creating
4132 * uap->mode File mode, if creating (same as 'open')
4133 * uap->xsecurity ACL to set, if creating
4134 *
4135 * Returns: 0 Success
4136 * !0 errno value
4137 *
4138 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4139 *
4140 * XXX: We should enumerate the possible errno values here, and where
4141 * in the code they originated.
4142 */
4143 int
4144 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4145 {
4146 struct filedesc *fdp = p->p_fd;
4147 int ciferror;
4148 kauth_filesec_t xsecdst;
4149 struct vnode_attr va;
4150 struct nameidata nd;
4151 int cmode;
4152
4153 AUDIT_ARG(owner, uap->uid, uap->gid);
4154
4155 xsecdst = NULL;
4156 if ((uap->xsecurity != USER_ADDR_NULL) &&
4157 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4158 return ciferror;
4159 }
4160
4161 VATTR_INIT(&va);
4162 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4163 VATTR_SET(&va, va_mode, cmode);
4164 if (uap->uid != KAUTH_UID_NONE) {
4165 VATTR_SET(&va, va_uid, uap->uid);
4166 }
4167 if (uap->gid != KAUTH_GID_NONE) {
4168 VATTR_SET(&va, va_gid, uap->gid);
4169 }
4170 if (xsecdst != NULL) {
4171 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4172 }
4173
4174 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4175 uap->path, vfs_context_current());
4176
4177 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4178 fileproc_alloc_init, NULL, retval);
4179 if (xsecdst != NULL) {
4180 kauth_filesec_free(xsecdst);
4181 }
4182
4183 return ciferror;
4184 }
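
/*
 * Illustrative sketch (editorial, not part of this file): the create-mode
 * computation used by open_extended() and the other open paths above.
 * 'requested' and 'cmask' are hypothetical values standing in for
 * uap->mode and fdp->fd_cmask.
 */
#if 0
	mode_t requested = 0666;        /* mode asked for by the caller */
	mode_t cmask = 022;             /* the process file-creation mask */
	mode_t cmode;

	/* clear masked bits, restrict to permission bits, strip the sticky bit */
	cmode = ((requested & ~cmask) & ALLPERMS) & ~S_ISTXT;
	/* cmode is now 0644 */
#endif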
4185
4186 /*
4187 * Perform the data-protected, atomically controlled open(2).
4188 *
4189 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4190 */
4191 int
4192 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4193 {
4194 int flags = uap->flags;
4195 int class = uap->class;
4196 int dpflags = uap->dpflags;
4197
4198 /*
4199 * Follow the same path as normal open(2)
4200 * Look up the item if it exists, and acquire the vnode.
4201 */
4202 struct filedesc *fdp = p->p_fd;
4203 struct vnode_attr va;
4204 struct nameidata nd;
4205 int cmode;
4206 int error;
4207
4208 VATTR_INIT(&va);
4209 /* Mask off all but regular access permissions */
4210 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4211 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4212
4213 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4214 uap->path, vfs_context_current());
4215
4216 /*
4217 * Initialize the extra fields in vnode_attr to pass down our
4218 * extra request details:
4219 * 1. target cprotect class.
4220 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4221 */
4222 if (flags & O_CREAT) {
4223 /* lower level kernel code validates that the class is valid before applying it. */
4224 if (class != PROTECTION_CLASS_DEFAULT) {
4225 /*
4226 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4227 * file behave the same as open (2)
4228 */
4229 VATTR_SET(&va, va_dataprotect_class, class);
4230 }
4231 }
4232
4233 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4234 if (flags & (O_RDWR | O_WRONLY)) {
4235 /* Not allowed to write raw encrypted bytes */
4236 return EINVAL;
4237 }
4238 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4239 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4240 }
4241 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4242 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4243 }
4244 }
4245
4246 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4247 fileproc_alloc_init, NULL, retval);
4248
4249 return error;
4250 }
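
/*
 * Illustrative sketch (editorial, not part of this file): the flag
 * combinations accepted by the raw-encrypted path above.  Requesting
 * O_DP_GETRAWENCRYPTED (or O_DP_GETRAWUNENCRYPTED) together with write
 * access is rejected; a read-only open is accepted.
 */
#if 0
	int flags, dpflags;

	/* rejected: raw-encrypted access may not be combined with write access */
	flags = O_RDWR;
	dpflags = O_DP_GETRAWENCRYPTED;          /* open_dprotected_np -> EINVAL */

	/* accepted: a read-only open of the raw encrypted contents */
	flags = O_RDONLY;
	dpflags = O_DP_GETRAWENCRYPTED;          /* sets VA_DP_RAWENCRYPTED */
#endif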
4251
4252 static int
4253 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4254 int fd, enum uio_seg segflg, int *retval)
4255 {
4256 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4257 struct vnode_attr va;
4258 struct nameidata nd;
4259 int cmode;
4260
4261 VATTR_INIT(&va);
4262 /* Mask off all but regular access permissions */
4263 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4264 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4265
4266 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4267 segflg, path, ctx);
4268
4269 return open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
4270 retval, fd);
4271 }
4272
4273 int
4274 open(proc_t p, struct open_args *uap, int32_t *retval)
4275 {
4276 __pthread_testcancel(1);
4277 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4278 }
4279
4280 int
4281 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4282 int32_t *retval)
4283 {
4284 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4285 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4286 }
4287
4288 int
4289 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4290 int32_t *retval)
4291 {
4292 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4293 uap->mode, uap->fd, UIO_USERSPACE, retval);
4294 }
4295
4296 int
4297 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4298 {
4299 __pthread_testcancel(1);
4300 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4301 }
4302
4303 /*
4304 * openbyid_np: open a file given a file system id and a file system object id
4305 * The HFS file system object id is an fsobj_id_t {uint32, uint32};
4306 * for file systems that don't support object ids, it is a node id (uint64_t).
4307 *
4308 * Parameters: p Process requesting the open
4309 * uap User argument descriptor (see below)
4310 * retval Pointer to an area to receive the
4311 * return value from the system call
4312 *
4313 * Indirect: uap->fsid id of target file system
4314 * uap->objid id of target file system object
4315 * uap->oflags Flags to open (same as 'open')
4316 *
4317 * (There is no path argument; the target is identified by fsid and objid.)
4318 *
4319 * Returns: 0 Success
4320 * !0 errno value
4321 *
4322 *
4323 * XXX: We should enumerate the possible errno values here, and where
4324 * in the code they originated.
4325 */
4326 int
4327 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
4328 {
4329 fsid_t fsid;
4330 uint64_t objid;
4331 int error;
4332 char *buf = NULL;
4333 int buflen = MAXPATHLEN;
4334 int pathlen = 0;
4335 vfs_context_t ctx = vfs_context_current();
4336
4337 if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
4338 return error;
4339 }
4340
4341 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
4342 return error;
4343 }
4344
4345 /* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
4346 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
4347 return error;
4348 }
4349
4350 AUDIT_ARG(value32, fsid.val[0]);
4351 AUDIT_ARG(value64, objid);
4352
4353 /* resolve path from fsid, objid */
4354 do {
4355 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
4356 if (buf == NULL) {
4357 return ENOMEM;
4358 }
4359
4360 error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
4361 buf, FSOPT_ISREALFSID, &pathlen);
4362
4363 if (error) {
4364 FREE(buf, M_TEMP);
4365 buf = NULL;
4366 }
4367 } while (error == ENOSPC && (buflen += MAXPATHLEN));
4368
4369 if (error) {
4370 return error;
4371 }
4372
4373 buf[pathlen] = 0;
4374
4375 error = openat_internal(
4376 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
4377
4378 FREE(buf, M_TEMP);
4379
4380 return error;
4381 }
4382
4383
4384 /*
4385 * Create a special file.
4386 */
4387 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4388
4389 int
4390 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
4391 {
4392 struct vnode_attr va;
4393 vfs_context_t ctx = vfs_context_current();
4394 int error;
4395 struct nameidata nd;
4396 vnode_t vp, dvp;
4397
4398 VATTR_INIT(&va);
4399 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4400 VATTR_SET(&va, va_rdev, uap->dev);
4401
4402 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
4403 if ((uap->mode & S_IFMT) == S_IFIFO) {
4404 return mkfifo1(ctx, uap->path, &va);
4405 }
4406
4407 AUDIT_ARG(mode, uap->mode);
4408 AUDIT_ARG(value32, uap->dev);
4409
4410 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
4411 return error;
4412 }
4413 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
4414 UIO_USERSPACE, uap->path, ctx);
4415 error = namei(&nd);
4416 if (error) {
4417 return error;
4418 }
4419 dvp = nd.ni_dvp;
4420 vp = nd.ni_vp;
4421
4422 if (vp != NULL) {
4423 error = EEXIST;
4424 goto out;
4425 }
4426
4427 switch (uap->mode & S_IFMT) {
4428 case S_IFCHR:
4429 VATTR_SET(&va, va_type, VCHR);
4430 break;
4431 case S_IFBLK:
4432 VATTR_SET(&va, va_type, VBLK);
4433 break;
4434 default:
4435 error = EINVAL;
4436 goto out;
4437 }
4438
4439 #if CONFIG_MACF
4440 error = mac_vnode_check_create(ctx,
4441 nd.ni_dvp, &nd.ni_cnd, &va);
4442 if (error) {
4443 goto out;
4444 }
4445 #endif
4446
4447 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
4448 goto out;
4449 }
4450
4451 if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
4452 goto out;
4453 }
4454
4455 if (vp) {
4456 int update_flags = 0;
4457
4458 // Make sure the name & parent pointers are hooked up
4459 if (vp->v_name == NULL) {
4460 update_flags |= VNODE_UPDATE_NAME;
4461 }
4462 if (vp->v_parent == NULLVP) {
4463 update_flags |= VNODE_UPDATE_PARENT;
4464 }
4465
4466 if (update_flags) {
4467 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4468 }
4469
4470 #if CONFIG_FSE
4471 add_fsevent(FSE_CREATE_FILE, ctx,
4472 FSE_ARG_VNODE, vp,
4473 FSE_ARG_DONE);
4474 #endif
4475 }
4476
4477 out:
4478 /*
4479 * nameidone has to happen before we vnode_put(dvp)
4480 * since it may need to release the fs_nodelock on the dvp
4481 */
4482 nameidone(&nd);
4483
4484 if (vp) {
4485 vnode_put(vp);
4486 }
4487 vnode_put(dvp);
4488
4489 return error;
4490 }
4491
4492 /*
4493 * Create a named pipe.
4494 *
4495 * Returns: 0 Success
4496 * EEXIST
4497 * namei:???
4498 * vnode_authorize:???
4499 * vn_create:???
4500 */
4501 static int
4502 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4503 {
4504 vnode_t vp, dvp;
4505 int error;
4506 struct nameidata nd;
4507
4508 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4509 UIO_USERSPACE, upath, ctx);
4510 error = namei(&nd);
4511 if (error) {
4512 return error;
4513 }
4514 dvp = nd.ni_dvp;
4515 vp = nd.ni_vp;
4516
4517 /* check that this is a new file and authorize addition */
4518 if (vp != NULL) {
4519 error = EEXIST;
4520 goto out;
4521 }
4522 VATTR_SET(vap, va_type, VFIFO);
4523
4524 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
4525 goto out;
4526 }
4527
4528 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4529 out:
4530 /*
4531 * nameidone has to happen before we vnode_put(dvp)
4532 * since it may need to release the fs_nodelock on the dvp
4533 */
4534 nameidone(&nd);
4535
4536 if (vp) {
4537 vnode_put(vp);
4538 }
4539 vnode_put(dvp);
4540
4541 return error;
4542 }
4543
4544
4545 /*
4546 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4547 *
4548 * Parameters: p Process requesting the open
4549 * uap User argument descriptor (see below)
4550 * retval (Ignored)
4551 *
4552 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4553 * uap->uid UID to set
4554 * uap->gid GID to set
4555 * uap->mode File mode to set (same as 'mkfifo')
4556 * uap->xsecurity ACL to set, if creating
4557 *
4558 * Returns: 0 Success
4559 * !0 errno value
4560 *
4561 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4562 *
4563 * XXX: We should enumerate the possible errno values here, and where
4564 * in the code they originated.
4565 */
4566 int
4567 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4568 {
4569 int ciferror;
4570 kauth_filesec_t xsecdst;
4571 struct vnode_attr va;
4572
4573 AUDIT_ARG(owner, uap->uid, uap->gid);
4574
4575 xsecdst = KAUTH_FILESEC_NONE;
4576 if (uap->xsecurity != USER_ADDR_NULL) {
4577 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4578 return ciferror;
4579 }
4580 }
4581
4582 VATTR_INIT(&va);
4583 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4584 if (uap->uid != KAUTH_UID_NONE) {
4585 VATTR_SET(&va, va_uid, uap->uid);
4586 }
4587 if (uap->gid != KAUTH_GID_NONE) {
4588 VATTR_SET(&va, va_gid, uap->gid);
4589 }
4590 if (xsecdst != KAUTH_FILESEC_NONE) {
4591 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4592 }
4593
4594 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4595
4596 if (xsecdst != KAUTH_FILESEC_NONE) {
4597 kauth_filesec_free(xsecdst);
4598 }
4599 return ciferror;
4600 }
4601
4602 /* ARGSUSED */
4603 int
4604 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4605 {
4606 struct vnode_attr va;
4607
4608 VATTR_INIT(&va);
4609 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4610
4611 return mkfifo1(vfs_context_current(), uap->path, &va);
4612 }
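
/*
 * Illustrative sketch (editorial, not part of this file): from user space,
 * a mknod(2) with S_IFIFO is routed through mkfifo1() above, so these two
 * calls are equivalent ways to create a named pipe.  "/tmp/example.fifo"
 * is a hypothetical path.
 */
#if 0
	/* assumed user-space headers: <sys/types.h>, <sys/stat.h> */
	mkfifo("/tmp/example.fifo", 0644);
	mknod("/tmp/example.fifo", S_IFIFO | 0644, 0);   /* same effect */
#endif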
4613
4614
4615 static char *
4616 my_strrchr(char *p, int ch)
4617 {
4618 char *save;
4619
4620 for (save = NULL;; ++p) {
4621 if (*p == ch) {
4622 save = p;
4623 }
4624 if (!*p) {
4625 return save;
4626 }
4627 }
4628 /* NOTREACHED */
4629 }
4630
4631 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4632 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4633 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4634
4635 int
4636 safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
4637 {
4638 int ret, len = _len;
4639
4640 *truncated_path = 0;
4641
4642 if (firmlink) {
4643 ret = vn_getpath(dvp, path, &len);
4644 } else {
4645 ret = vn_getpath_no_firmlink(dvp, path, &len);
4646 }
4647 if (ret == 0 && len < (MAXPATHLEN - 1)) {
4648 if (leafname) {
4649 path[len - 1] = '/';
4650 len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
4651 if (len > MAXPATHLEN) {
4652 char *ptr;
4653
4654 // the string got truncated!
4655 *truncated_path = 1;
4656 ptr = my_strrchr(path, '/');
4657 if (ptr) {
4658 *ptr = '\0'; // chop off the string at the last directory component
4659 }
4660 len = strlen(path) + 1;
4661 }
4662 }
4663 } else if (ret == 0) {
4664 *truncated_path = 1;
4665 } else if (ret != 0) {
4666 struct vnode *mydvp = dvp;
4667
4668 if (ret != ENOSPC) {
4669 printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4670 dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4671 }
4672 *truncated_path = 1;
4673
4674 do {
4675 if (mydvp->v_parent != NULL) {
4676 mydvp = mydvp->v_parent;
4677 } else if (mydvp->v_mount) {
4678 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4679 break;
4680 } else {
4681 // no parent and no mount point? only thing is to punt and say "/" changed
4682 strlcpy(path, "/", _len);
4683 len = 2;
4684 mydvp = NULL;
4685 }
4686
4687 if (mydvp == NULL) {
4688 break;
4689 }
4690
4691 len = _len;
4692 if (firmlink) {
4693 ret = vn_getpath(mydvp, path, &len);
4694 } else {
4695 ret = vn_getpath_no_firmlink(mydvp, path, &len);
4696 }
4697 } while (ret == ENOSPC);
4698 }
4699
4700 return len;
4701 }
4702
4703 int
4704 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4705 {
4706 return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
4707 }
4708
4709 int
4710 safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4711 {
4712 return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
4713 }
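
/*
 * Illustrative sketch (editorial, not part of this file): the typical
 * calling pattern for safe_getpath() used throughout this file -- grab a
 * path buffer, build the path for dvp + leafname, then check the
 * truncation flag.  'some_dvp' and 'some_name' are hypothetical
 * stand-ins.
 */
#if 0
	char *pathbuf = NULL;
	int pathlen, truncated = 0;

	GET_PATH(pathbuf);
	if (pathbuf != NULL) {
		pathlen = safe_getpath(some_dvp, some_name, pathbuf,
		    MAXPATHLEN, &truncated);
		if (truncated) {
			/* the path did not fit and was chopped at the last '/' */
		}
		RELEASE_PATH(pathbuf);
	}
#endif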
4714
4715 /*
4716 * Make a hard file link.
4717 *
4718 * Returns: 0 Success
4719 * EPERM
4720 * EEXIST
4721 * EXDEV
4722 * namei:???
4723 * vnode_authorize:???
4724 * VNOP_LINK:???
4725 */
4726 /* ARGSUSED */
4727 static int
4728 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4729 user_addr_t link, int flag, enum uio_seg segflg)
4730 {
4731 vnode_t vp, pvp, dvp, lvp;
4732 struct nameidata nd;
4733 int follow;
4734 int error;
4735 #if CONFIG_FSE
4736 fse_info finfo;
4737 #endif
4738 int need_event, has_listeners, need_kpath2;
4739 char *target_path = NULL;
4740 int truncated = 0;
4741
4742 vp = dvp = lvp = NULLVP;
4743
4744 /* look up the object we are linking to */
4745 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4746 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4747 segflg, path, ctx);
4748
4749 error = nameiat(&nd, fd1);
4750 if (error) {
4751 return error;
4752 }
4753 vp = nd.ni_vp;
4754
4755 nameidone(&nd);
4756
4757 /*
4758 * Normally, linking to directories is not supported.
4759 * However, some file systems may have limited support.
4760 */
4761 if (vp->v_type == VDIR) {
4762 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
4763 error = EPERM; /* POSIX */
4764 goto out;
4765 }
4766
4767 /* Linking to a directory requires ownership. */
4768 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4769 struct vnode_attr dva;
4770
4771 VATTR_INIT(&dva);
4772 VATTR_WANTED(&dva, va_uid);
4773 if (vnode_getattr(vp, &dva, ctx) != 0 ||
4774 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4775 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4776 error = EACCES;
4777 goto out;
4778 }
4779 }
4780 }
4781
4782 /* lookup the target node */
4783 #if CONFIG_TRIGGERS
4784 nd.ni_op = OP_LINK;
4785 #endif
4786 nd.ni_cnd.cn_nameiop = CREATE;
4787 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4788 nd.ni_dirp = link;
4789 error = nameiat(&nd, fd2);
4790 if (error != 0) {
4791 goto out;
4792 }
4793 dvp = nd.ni_dvp;
4794 lvp = nd.ni_vp;
4795
4796 #if CONFIG_MACF
4797 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
4798 goto out2;
4799 }
4800 #endif
4801
4802 /* or to anything that kauth doesn't want us to (eg. immutable items) */
4803 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
4804 goto out2;
4805 }
4806
4807 /* target node must not exist */
4808 if (lvp != NULLVP) {
4809 error = EEXIST;
4810 goto out2;
4811 }
4812 /* cannot link across mountpoints */
4813 if (vnode_mount(vp) != vnode_mount(dvp)) {
4814 error = EXDEV;
4815 goto out2;
4816 }
4817
4818 /* authorize creation of the target node */
4819 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
4820 goto out2;
4821 }
4822
4823 /* and finally make the link */
4824 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4825 if (error) {
4826 goto out2;
4827 }
4828
4829 #if CONFIG_MACF
4830 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4831 #endif
4832
4833 #if CONFIG_FSE
4834 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4835 #else
4836 need_event = 0;
4837 #endif
4838 has_listeners = kauth_authorize_fileop_has_listeners();
4839
4840 need_kpath2 = 0;
4841 #if CONFIG_AUDIT
4842 if (AUDIT_RECORD_EXISTS()) {
4843 need_kpath2 = 1;
4844 }
4845 #endif
4846
4847 if (need_event || has_listeners || need_kpath2) {
4848 char *link_to_path = NULL;
4849 int len, link_name_len;
4850
4851 /* build the path to the new link file */
4852 GET_PATH(target_path);
4853 if (target_path == NULL) {
4854 error = ENOMEM;
4855 goto out2;
4856 }
4857
4858 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4859
4860 AUDIT_ARG(kpath, target_path, ARG_KPATH2);
4861
4862 if (has_listeners) {
4863 /* build the path to file we are linking to */
4864 GET_PATH(link_to_path);
4865 if (link_to_path == NULL) {
4866 error = ENOMEM;
4867 goto out2;
4868 }
4869
4870 link_name_len = MAXPATHLEN;
4871 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4872 /*
4873 * Call out to allow 3rd party notification of the link.
4874 * Ignore result of kauth_authorize_fileop call.
4875 */
4876 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4877 (uintptr_t)link_to_path,
4878 (uintptr_t)target_path);
4879 }
4880 if (link_to_path != NULL) {
4881 RELEASE_PATH(link_to_path);
4882 }
4883 }
4884 #if CONFIG_FSE
4885 if (need_event) {
4886 /* construct fsevent */
4887 if (get_fse_info(vp, &finfo, ctx) == 0) {
4888 if (truncated) {
4889 finfo.mode |= FSE_TRUNCATED_PATH;
4890 }
4891
4892 // build the path to the destination of the link
4893 add_fsevent(FSE_CREATE_FILE, ctx,
4894 FSE_ARG_STRING, len, target_path,
4895 FSE_ARG_FINFO, &finfo,
4896 FSE_ARG_DONE);
4897 }
4898
4899 pvp = vp->v_parent;
4900 // need an iocount on pvp in this case
4901 if (pvp && pvp != dvp) {
4902 error = vnode_get(pvp);
4903 if (error) {
4904 pvp = NULLVP;
4905 error = 0;
4906 }
4907 }
4908 if (pvp) {
4909 add_fsevent(FSE_STAT_CHANGED, ctx,
4910 FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
4911 }
4912 if (pvp && pvp != dvp) {
4913 vnode_put(pvp);
4914 }
4915 }
4916 #endif
4917 }
4918 out2:
4919 /*
4920 * nameidone has to happen before we vnode_put(dvp)
4921 * since it may need to release the fs_nodelock on the dvp
4922 */
4923 nameidone(&nd);
4924 if (target_path != NULL) {
4925 RELEASE_PATH(target_path);
4926 }
4927 out:
4928 if (lvp) {
4929 vnode_put(lvp);
4930 }
4931 if (dvp) {
4932 vnode_put(dvp);
4933 }
4934 vnode_put(vp);
4935 return error;
4936 }
4937
4938 int
4939 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4940 {
4941 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4942 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
4943 }
4944
4945 int
4946 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4947 {
4948 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
4949 return EINVAL;
4950 }
4951
4952 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4953 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
4954 }
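
/*
 * Illustrative sketch (editorial, not part of this file): from user space,
 * linkat(2) accepts only the AT_SYMLINK_FOLLOW flag (anything else fails
 * with EINVAL, per linkat() above).  "src" and "dst" are hypothetical
 * names resolved relative to the hypothetical descriptors sfd and dfd.
 */
#if 0
	/* assumed user-space headers: <unistd.h>, <fcntl.h> (for AT_*) */
	linkat(sfd, "src", dfd, "dst", 0);                  /* do not dereference "src" if it is a symlink */
	linkat(sfd, "src", dfd, "dst", AT_SYMLINK_FOLLOW);  /* link to what "src" points at */
#endif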
4955
4956 /*
4957 * Make a symbolic link.
4958 *
4959 * We could add support for ACLs here too...
4960 */
4961 /* ARGSUSED */
4962 static int
4963 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4964 user_addr_t link, enum uio_seg segflg)
4965 {
4966 struct vnode_attr va;
4967 char *path;
4968 int error;
4969 struct nameidata nd;
4970 vnode_t vp, dvp;
4971 size_t dummy = 0;
4972 proc_t p;
4973
4974 error = 0;
4975 if (UIO_SEG_IS_USER_SPACE(segflg)) {
4976 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4977 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4978 } else {
4979 path = (char *)path_data;
4980 }
4981 if (error) {
4982 goto out;
4983 }
4984 AUDIT_ARG(text, path); /* This is the link string */
4985
4986 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4987 segflg, link, ctx);
4988
4989 error = nameiat(&nd, fd);
4990 if (error) {
4991 goto out;
4992 }
4993 dvp = nd.ni_dvp;
4994 vp = nd.ni_vp;
4995
4996 p = vfs_context_proc(ctx);
4997 VATTR_INIT(&va);
4998 VATTR_SET(&va, va_type, VLNK);
4999 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
5000
5001 #if CONFIG_MACF
5002 error = mac_vnode_check_create(ctx,
5003 dvp, &nd.ni_cnd, &va);
5004 #endif
5005 if (error != 0) {
5006 goto skipit;
5007 }
5008
5009 if (vp != NULL) {
5010 error = EEXIST;
5011 goto skipit;
5012 }
5013
5014 /* authorize */
5015 if (error == 0) {
5016 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
5017 }
5018 /* get default ownership, etc. */
5019 if (error == 0) {
5020 error = vnode_authattr_new(dvp, &va, 0, ctx);
5021 }
5022 if (error == 0) {
5023 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
5024 }
5025
5026 #if CONFIG_MACF
5027 if (error == 0 && vp) {
5028 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
5029 }
5030 #endif
5031
5032 /* do fallback attribute handling */
5033 if (error == 0 && vp) {
5034 error = vnode_setattr_fallback(vp, &va, ctx);
5035 }
5036
5037 if (error == 0) {
5038 int update_flags = 0;
5039
5040 /* check if a new vnode was created; if not, try to look it up */
5041 if (vp == NULL) {
5042 nd.ni_cnd.cn_nameiop = LOOKUP;
5043 #if CONFIG_TRIGGERS
5044 nd.ni_op = OP_LOOKUP;
5045 #endif
5046 nd.ni_cnd.cn_flags = 0;
5047 error = nameiat(&nd, fd);
5048 vp = nd.ni_vp;
5049
5050 if (vp == NULL) {
5051 goto skipit;
5052 }
5053 }
5054
5055 #if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
5056 /* call out to allow 3rd party notification of rename.
5057 * Ignore result of kauth_authorize_fileop call.
5058 */
5059 if (kauth_authorize_fileop_has_listeners() &&
5060 namei(&nd) == 0) {
5061 char *new_link_path = NULL;
5062 int len;
5063
5064 /* build the path to the new link file */
5065 new_link_path = get_pathbuff();
5066 len = MAXPATHLEN;
5067 vn_getpath(dvp, new_link_path, &len);
5068 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
5069 new_link_path[len - 1] = '/';
5070 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
5071 }
5072
5073 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
5074 (uintptr_t)path, (uintptr_t)new_link_path);
5075 if (new_link_path != NULL) {
5076 release_pathbuff(new_link_path);
5077 }
5078 }
5079 #endif
5080 // Make sure the name & parent pointers are hooked up
5081 if (vp->v_name == NULL) {
5082 update_flags |= VNODE_UPDATE_NAME;
5083 }
5084 if (vp->v_parent == NULLVP) {
5085 update_flags |= VNODE_UPDATE_PARENT;
5086 }
5087
5088 if (update_flags) {
5089 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
5090 }
5091
5092 #if CONFIG_FSE
5093 add_fsevent(FSE_CREATE_FILE, ctx,
5094 FSE_ARG_VNODE, vp,
5095 FSE_ARG_DONE);
5096 #endif
5097 }
5098
5099 skipit:
5100 /*
5101 * nameidone has to happen before we vnode_put(dvp)
5102 * since it may need to release the fs_nodelock on the dvp
5103 */
5104 nameidone(&nd);
5105
5106 if (vp) {
5107 vnode_put(vp);
5108 }
5109 vnode_put(dvp);
5110 out:
5111 if (path && (path != (char *)path_data)) {
5112 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
5113 }
5114
5115 return error;
5116 }
5117
5118 int
5119 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5120 {
5121 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5122 uap->link, UIO_USERSPACE);
5123 }
5124
5125 int
5126 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5127 __unused int32_t *retval)
5128 {
5129 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5130 uap->path2, UIO_USERSPACE);
5131 }
5132
5133 /*
5134 * Delete a whiteout from the filesystem.
5135 * No longer supported.
5136 */
5137 int
5138 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5139 {
5140 return ENOTSUP;
5141 }
5142
5143 /*
5144 * Delete a name from the filesystem.
5145 */
5146 /* ARGSUSED */
5147 static int
5148 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5149 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5150 {
5151 struct nameidata nd;
5152 vnode_t vp, dvp;
5153 int error;
5154 struct componentname *cnp;
5155 char *path = NULL;
5156 char *no_firmlink_path = NULL;
5157 int len_path = 0;
5158 int len_no_firmlink_path = 0;
5159 #if CONFIG_FSE
5160 fse_info finfo;
5161 struct vnode_attr va;
5162 #endif
5163 int flags;
5164 int need_event;
5165 int has_listeners;
5166 int truncated_path;
5167 int truncated_no_firmlink_path;
5168 int batched;
5169 struct vnode_attr *vap;
5170 int do_retry;
5171 int retry_count = 0;
5172 int cn_flags;
5173
5174 cn_flags = LOCKPARENT;
5175 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
5176 cn_flags |= AUDITVNPATH1;
5177 }
5178 /* If a starting dvp is passed, it trumps any fd passed. */
5179 if (start_dvp) {
5180 cn_flags |= USEDVP;
5181 }
5182
5183 #if NAMEDRSRCFORK
5184 /* unlink or delete is allowed on rsrc forks and named streams */
5185 cn_flags |= CN_ALLOWRSRCFORK;
5186 #endif
5187
5188 retry:
5189 do_retry = 0;
5190 flags = 0;
5191 need_event = 0;
5192 has_listeners = 0;
5193 truncated_path = 0;
5194 truncated_no_firmlink_path = 0;
5195 vap = NULL;
5196
5197 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
5198
5199 nd.ni_dvp = start_dvp;
5200 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
5201 cnp = &nd.ni_cnd;
5202
5203 continue_lookup:
5204 error = nameiat(&nd, fd);
5205 if (error) {
5206 return error;
5207 }
5208
5209 dvp = nd.ni_dvp;
5210 vp = nd.ni_vp;
5211
5212
5213 /* With Carbon delete semantics, busy files cannot be deleted */
5214 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
5215 flags |= VNODE_REMOVE_NODELETEBUSY;
5216 }
5217
5218 /* Skip any potential upcalls if told to. */
5219 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
5220 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
5221 }
5222
5223 if (vp) {
5224 batched = vnode_compound_remove_available(vp);
5225 /*
5226 * The root of a mounted filesystem cannot be deleted.
5227 */
5228 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
5229 error = EBUSY;
5230 goto out;
5231 }
5232
5233 #if DEVELOPMENT || DEBUG
5234 /*
5235 * XXX VSWAP: Check for entitlements or special flag here
5236 * so we can restrict access appropriately.
5237 */
5238 #else /* DEVELOPMENT || DEBUG */
5239
5240 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
5241 error = EPERM;
5242 goto out;
5243 }
5244 #endif /* DEVELOPMENT || DEBUG */
5245
5246 if (!batched) {
5247 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
5248 if (error) {
5249 if (error == ENOENT) {
5250 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5251 do_retry = 1;
5252 retry_count++;
5253 }
5254 }
5255 goto out;
5256 }
5257 }
5258 } else {
5259 batched = 1;
5260
5261 if (!vnode_compound_remove_available(dvp)) {
5262 panic("No vp, but no compound remove?");
5263 }
5264 }
5265
5266 #if CONFIG_FSE
5267 need_event = need_fsevent(FSE_DELETE, dvp);
5268 if (need_event) {
5269 if (!batched) {
5270 if ((vp->v_flag & VISHARDLINK) == 0) {
5271 /* XXX need to get these data in batched VNOP */
5272 get_fse_info(vp, &finfo, ctx);
5273 }
5274 } else {
5275 error = vfs_get_notify_attributes(&va);
5276 if (error) {
5277 goto out;
5278 }
5279
5280 vap = &va;
5281 }
5282 }
5283 #endif
5284 has_listeners = kauth_authorize_fileop_has_listeners();
5285 if (need_event || has_listeners) {
5286 if (path == NULL) {
5287 GET_PATH(path);
5288 if (path == NULL) {
5289 error = ENOMEM;
5290 goto out;
5291 }
5292 }
5293 len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
5294 if (no_firmlink_path == NULL) {
5295 GET_PATH(no_firmlink_path);
5296 if (no_firmlink_path == NULL) {
5297 error = ENOMEM;
5298 goto out;
5299 }
5300 }
5301 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5302 }
5303
5304 #if NAMEDRSRCFORK
5305 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
5306 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
5307 } else
5308 #endif
5309 {
5310 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
5311 vp = nd.ni_vp;
5312 if (error == EKEEPLOOKING) {
5313 if (!batched) {
5314 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
5315 }
5316
5317 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
5318 panic("EKEEPLOOKING, but continue flag not set?");
5319 }
5320
5321 if (vnode_isdir(vp)) {
5322 error = EISDIR;
5323 goto out;
5324 }
5325 goto continue_lookup;
5326 } else if (error == ENOENT && batched) {
5327 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5328 /*
5329 * For compound VNOPs, the authorization callback may
5330 * return ENOENT in case of racing hardlink lookups
5331 * hitting the name cache; redrive the lookup.
5332 */
5333 do_retry = 1;
5334 retry_count += 1;
5335 goto out;
5336 }
5337 }
5338 }
5339
5340 /*
5341 * Call out to allow 3rd party notification of delete.
5342 * Ignore result of kauth_authorize_fileop call.
5343 */
5344 if (!error) {
5345 if (has_listeners) {
5346 kauth_authorize_fileop(vfs_context_ucred(ctx),
5347 KAUTH_FILEOP_DELETE,
5348 (uintptr_t)vp,
5349 (uintptr_t)path);
5350 }
5351
5352 if (vp->v_flag & VISHARDLINK) {
5353 //
5354 // if a hardlink gets deleted we want to blow away the
5355 // v_parent link because the path that got us to this
5356 // instance of the link is no longer valid. this will
5357 // force the next call to get the path to ask the file
5358 // system instead of just following the v_parent link.
5359 //
5360 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
5361 }
5362
5363 #if CONFIG_FSE
5364 if (need_event) {
5365 if (vp->v_flag & VISHARDLINK) {
5366 get_fse_info(vp, &finfo, ctx);
5367 } else if (vap) {
5368 vnode_get_fse_info_from_vap(vp, &finfo, vap);
5369 }
5370 if (truncated_path) {
5371 finfo.mode |= FSE_TRUNCATED_PATH;
5372 }
5373 add_fsevent(FSE_DELETE, ctx,
5374 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5375 FSE_ARG_FINFO, &finfo,
5376 FSE_ARG_DONE);
5377 }
5378 #endif
5379 }
5380
5381 out:
5382 if (path != NULL) {
5383 RELEASE_PATH(path);
5384 path = NULL;
5385 }
5386
5387 if (no_firmlink_path != NULL) {
5388 RELEASE_PATH(no_firmlink_path);
5389 no_firmlink_path = NULL;
5390 }
5391 #if NAMEDRSRCFORK
5392 /* recycle the deleted rsrc fork vnode to force a reclaim, which
5393 * will cause its shadow file to go away if necessary.
5394 */
5395 if (vp && (vnode_isnamedstream(vp)) &&
5396 (vp->v_parent != NULLVP) &&
5397 vnode_isshadow(vp)) {
5398 vnode_recycle(vp);
5399 }
5400 #endif
5401 /*
5402 * nameidone has to happen before we vnode_put(dvp)
5403 * since it may need to release the fs_nodelock on the dvp
5404 */
5405 nameidone(&nd);
5406 vnode_put(dvp);
5407 if (vp) {
5408 vnode_put(vp);
5409 }
5410
5411 if (do_retry) {
5412 goto retry;
5413 }
5414
5415 return error;
5416 }
5417
5418 int
5419 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5420 enum uio_seg segflg, int unlink_flags)
5421 {
5422 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5423 unlink_flags);
5424 }
5425
5426 /*
5427 * Delete a name from the filesystem using Carbon semantics.
5428 */
5429 int
5430 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5431 {
5432 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5433 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5434 }
5435
5436 /*
5437 * Delete a name from the filesystem using POSIX semantics.
5438 */
5439 int
5440 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5441 {
5442 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5443 uap->path, UIO_USERSPACE, 0);
5444 }
5445
5446 int
5447 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5448 {
5449 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5450 return EINVAL;
5451 }
5452
5453 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5454 int unlink_flags = 0;
5455
5456 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5457 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5458 }
5459 return rmdirat_internal(vfs_context_current(), uap->fd,
5460 uap->path, UIO_USERSPACE, unlink_flags);
5461 } else {
5462 return unlinkat_internal(vfs_context_current(), uap->fd,
5463 NULLVP, uap->path, UIO_USERSPACE, 0);
5464 }
5465 }
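
/*
 * Illustrative sketch (editorial, not part of this file): from user space,
 * unlinkat(2) dispatches on AT_REMOVEDIR as shown in unlinkat() above --
 * without it the call behaves like unlink(2), with it like rmdir(2).
 * 'dirfd' and the names are hypothetical.
 */
#if 0
	/* assumed user-space headers: <unistd.h>, <fcntl.h> (for AT_*) */
	unlinkat(dirfd, "file.txt", 0);             /* remove a file */
	unlinkat(dirfd, "subdir", AT_REMOVEDIR);    /* remove an empty directory */
#endif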
5466
5467 /*
5468 * Reposition read/write file offset.
5469 */
5470 int
5471 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
5472 {
5473 struct fileproc *fp;
5474 vnode_t vp;
5475 struct vfs_context *ctx;
5476 off_t offset = uap->offset, file_size;
5477 int error;
5478
5479 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
5480 if (error == ENOTSUP) {
5481 return ESPIPE;
5482 }
5483 return error;
5484 }
5485 if (vnode_isfifo(vp)) {
5486 file_drop(uap->fd);
5487 return ESPIPE;
5488 }
5489
5490
5491 ctx = vfs_context_current();
5492 #if CONFIG_MACF
5493 if (uap->whence == L_INCR && uap->offset == 0) {
5494 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
5495 fp->f_fglob);
5496 } else {
5497 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
5498 fp->f_fglob);
5499 }
5500 if (error) {
5501 file_drop(uap->fd);
5502 return error;
5503 }
5504 #endif
5505 if ((error = vnode_getwithref(vp))) {
5506 file_drop(uap->fd);
5507 return error;
5508 }
5509
5510 switch (uap->whence) {
5511 case L_INCR:
5512 offset += fp->f_fglob->fg_offset;
5513 break;
5514 case L_XTND:
5515 if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
5516 break;
5517 }
5518 offset += file_size;
5519 break;
5520 case L_SET:
5521 break;
5522 case SEEK_HOLE:
5523 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5524 break;
5525 case SEEK_DATA:
5526 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5527 break;
5528 default:
5529 error = EINVAL;
5530 }
5531 if (error == 0) {
5532 if (uap->offset > 0 && offset < 0) {
5533 /* Incremented/relative move past max size */
5534 error = EOVERFLOW;
5535 } else {
5536 /*
5537 * Allow negative offsets on character devices, per
5538 * POSIX 1003.1-2001. Most likely for writing disk
5539 * labels.
5540 */
5541 if (offset < 0 && vp->v_type != VCHR) {
5542 /* Decremented/relative move before start */
5543 error = EINVAL;
5544 } else {
5545 /* Success */
5546 fp->f_fglob->fg_offset = offset;
5547 *retval = fp->f_fglob->fg_offset;
5548 }
5549 }
5550 }
5551
5552 /*
5553 * An lseek can affect whether data is "available to read."  Use a
5554 * hint of NOTE_NONE so that no EVFILT_VNODE events fire.
5555 */
5556 post_event_if_success(vp, error, NOTE_NONE);
5557 (void)vnode_put(vp);
5558 file_drop(uap->fd);
5559 return error;
5560 }
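
/*
 * Illustrative sketch (editorial, not part of this file): the whence values
 * handled above map to the familiar user-space names (L_SET == SEEK_SET,
 * L_INCR == SEEK_CUR, L_XTND == SEEK_END).  A relative seek whose positive
 * delta wraps the signed off_t negative fails with EOVERFLOW; seeking
 * before the start of anything but a character device fails with EINVAL.
 */
#if 0
	/* assumed user-space headers: <unistd.h>, <fcntl.h> */
	off_t cur = lseek(fd, 0, SEEK_CUR);     /* query the current offset */
	off_t end = lseek(fd, 0, SEEK_END);     /* query the file size */
	(void)lseek(fd, -1, SEEK_SET);          /* -> EINVAL on a regular file */
#endif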
5561
5562
5563 /*
5564 * Check access permissions.
5565 *
5566 * Returns: 0 Success
5567 * vnode_authorize:???
5568 */
5569 static int
5570 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5571 {
5572 kauth_action_t action;
5573 int error;
5574
5575 /*
5576 * If just the regular access bits, convert them to something
5577 * that vnode_authorize will understand.
5578 */
5579 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5580 action = 0;
5581 if (uflags & R_OK) {
5582 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5583 }
5584 if (uflags & W_OK) {
5585 if (vnode_isdir(vp)) {
5586 action |= KAUTH_VNODE_ADD_FILE |
5587 KAUTH_VNODE_ADD_SUBDIRECTORY;
5588 /* might want delete rights here too */
5589 } else {
5590 action |= KAUTH_VNODE_WRITE_DATA;
5591 }
5592 }
5593 if (uflags & X_OK) {
5594 if (vnode_isdir(vp)) {
5595 action |= KAUTH_VNODE_SEARCH;
5596 } else {
5597 action |= KAUTH_VNODE_EXECUTE;
5598 }
5599 }
5600 } else {
5601 /* take advantage of definition of uflags */
5602 action = uflags >> 8;
5603 }
5604
5605 #if CONFIG_MACF
5606 error = mac_vnode_check_access(ctx, vp, uflags);
5607 if (error) {
5608 return error;
5609 }
5610 #endif /* MAC */
5611
5612 /* action == 0 means only check for existence */
5613 if (action != 0) {
5614 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5615 } else {
5616 error = 0;
5617 }
5618
5619 return error;
5620 }
5621
5622
5623
5624 /*
5625 * access_extended: Check access permissions in bulk.
5626 *
5627 * Description: uap->entries Pointer to an array of accessx
5628 * descriptor structs, plus one or
5629 * more NULL terminated strings (see
5630 * "Notes" section below).
5631 * uap->size Size of the area pointed to by
5632 * uap->entries.
5633 * uap->results Pointer to the results array.
5634 *
5635 * Returns: 0 Success
5636 * ENOMEM Insufficient memory
5637 * EINVAL Invalid arguments
5638 * namei:EFAULT Bad address
5639 * namei:ENAMETOOLONG Filename too long
5640 * namei:ENOENT No such file or directory
5641 * namei:ELOOP Too many levels of symbolic links
5642 * namei:EBADF Bad file descriptor
5643 * namei:ENOTDIR Not a directory
5644 * namei:???
5645 * access1:
5646 *
5647 * Implicit returns:
5648 * uap->results Array contents modified
5649 *
5650 * Notes: The uap->entries are structured as an arbitrary length array
5651 * of accessx descriptors, followed by one or more NULL terminated
5652 * strings
5653 *
5654 * struct accessx_descriptor[0]
5655 * ...
5656 * struct accessx_descriptor[n]
5657 * char name_data[0];
5658 *
5659 * We determine the entry count by walking the buffer containing
5660 * the uap->entries argument descriptor. For each descriptor we
5661 * see, the valid values for the offset ad_name_offset will be
5662 * in the byte range:
5663 *
5664 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5665 * to
5666 * [ uap->entries + uap->size - 2 ]
5667 *
5668 * since we must have at least one string, and the string must
5669 * be at least one character plus the NULL terminator in length.
5670 *
5671 * XXX: Need to support the check-as uid argument
5672 */
5673 int
5674 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5675 {
5676 struct accessx_descriptor *input = NULL;
5677 errno_t *result = NULL;
5678 errno_t error = 0;
5679 int wantdelete = 0;
5680 unsigned int desc_max, desc_actual, i, j;
5681 struct vfs_context context;
5682 struct nameidata nd;
5683 int niopts;
5684 vnode_t vp = NULL;
5685 vnode_t dvp = NULL;
5686 #define ACCESSX_MAX_DESCR_ON_STACK 10
5687 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5688
5689 context.vc_ucred = NULL;
5690
5691 /*
5692 * Validate parameters; if valid, copy the descriptor array and string
5693 * arguments into local memory. Before proceeding, the following
5694 * conditions must have been met:
5695 *
5696 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5697 * o There must be sufficient room in the request for at least one
5698 * descriptor and a one byte NUL-terminated string.
5699 * o The allocation of local storage must not fail.
5700 */
5701 if (uap->size > ACCESSX_MAX_TABLESIZE) {
5702 return ENOMEM;
5703 }
5704 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
5705 return EINVAL;
5706 }
5707 if (uap->size <= sizeof(stack_input)) {
5708 input = stack_input;
5709 } else {
5710 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5711 if (input == NULL) {
5712 error = ENOMEM;
5713 goto out;
5714 }
5715 }
5716 error = copyin(uap->entries, input, uap->size);
5717 if (error) {
5718 goto out;
5719 }
5720
5721 AUDIT_ARG(opaque, input, uap->size);
5722
5723 /*
5724 * Force NUL termination of the copyin buffer to avoid namei() running
5725 * off the end. If the caller passes us bogus data, they may get a
5726 * bogus result.
5727 */
5728 ((char *)input)[uap->size - 1] = 0;
5729
5730 /*
5731 * Access is defined as checking against the process' real identity,
5732 * even if operations are checking the effective identity. This
5733 * requires that we use a local vfs context.
5734 */
5735 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5736 context.vc_thread = current_thread();
5737
5738 /*
5739 * Find out how many entries we have, so we can allocate the result
5740 * array by walking the list and adjusting the count downward by the
5741 * earliest string offset we see.
5742 */
5743 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5744 desc_actual = desc_max;
5745 for (i = 0; i < desc_actual; i++) {
5746 /*
5747 * Take the offset to the name string for this entry and
5748 * convert to an input array index, which would be one off
5749 * the end of the array if this entry was the lowest-addressed
5750 * name string.
5751 */
5752 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5753
5754 /*
5755 * An offset greater than the max allowable offset is an error.
5756 * It is also an error for any valid entry to point
5757 * to a location prior to the end of the current entry, if
5758 * it's not a reference to the string of the previous entry.
5759 */
5760 if (j > desc_max || (j != 0 && j <= i)) {
5761 error = EINVAL;
5762 goto out;
5763 }
5764
5765 /* Also do not let ad_name_offset point to something beyond the size of the input */
5766 if (input[i].ad_name_offset >= uap->size) {
5767 error = EINVAL;
5768 goto out;
5769 }
5770
5771 /*
5772 * An offset of 0 means use the previous descriptor's offset;
5773 * this is used to chain multiple requests for the same file
5774 * to avoid multiple lookups.
5775 */
5776 if (j == 0) {
5777 /* This is not valid for the first entry */
5778 if (i == 0) {
5779 error = EINVAL;
5780 goto out;
5781 }
5782 continue;
5783 }
5784
5785 /*
5786 * If the offset of the string for this descriptor is before
5787 * what we believe is the current actual last descriptor,
5788 * then we need to adjust our estimate downward; this permits
5789 * the string table following the last descriptor to be out
5790 * of order relative to the descriptor list.
5791 */
5792 if (j < desc_actual) {
5793 desc_actual = j;
5794 }
5795 }
5796
5797 /*
5798 * We limit the actual number of descriptors we are willing to process
5799 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5800 * requested exceeds this limit, the request fails with ENOMEM.
5801 */
5802 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5803 error = ENOMEM;
5804 goto out;
5805 }
5806 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5807 if (result == NULL) {
5808 error = ENOMEM;
5809 goto out;
5810 }
5811
5812 /*
5813 * Do the work by iterating over the descriptor entries we know to
5814 * at least appear to contain valid data.
5815 */
5816 error = 0;
5817 for (i = 0; i < desc_actual; i++) {
5818 /*
5819 * If the ad_name_offset is 0, then we use the previous
5820 * results to make the check; otherwise, we are looking up
5821 * a new file name.
5822 */
5823 if (input[i].ad_name_offset != 0) {
5824 /* discard old vnodes */
5825 if (vp) {
5826 vnode_put(vp);
5827 vp = NULL;
5828 }
5829 if (dvp) {
5830 vnode_put(dvp);
5831 dvp = NULL;
5832 }
5833
5834 /*
5835 * Scan forward in the descriptor list to see if we
5836 * need the parent vnode. We will need it if we are
5837 * deleting, since we must have rights to remove
5838 * entries in the parent directory, as well as the
5839 * rights to delete the object itself.
5840 */
5841 wantdelete = input[i].ad_flags & _DELETE_OK;
5842 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
5843 if (input[j].ad_flags & _DELETE_OK) {
5844 wantdelete = 1;
5845 }
5846 }
5847
5848 niopts = FOLLOW | AUDITVNPATH1;
5849
5850 /* need parent for vnode_authorize for deletion test */
5851 if (wantdelete) {
5852 niopts |= WANTPARENT;
5853 }
5854
5855 /* do the lookup */
5856 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5857 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5858 &context);
5859 error = namei(&nd);
5860 if (!error) {
5861 vp = nd.ni_vp;
5862 if (wantdelete) {
5863 dvp = nd.ni_dvp;
5864 }
5865 }
5866 nameidone(&nd);
5867 }
5868
5869 /*
5870 * Handle lookup errors.
5871 */
5872 switch (error) {
5873 case ENOENT:
5874 case EACCES:
5875 case EPERM:
5876 case ENOTDIR:
5877 result[i] = error;
5878 break;
5879 case 0:
5880 /* run this access check */
5881 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5882 break;
5883 default:
5884 /* fatal lookup error */
5885
5886 goto out;
5887 }
5888 }
5889
5890 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5891
5892 /* copy out results */
5893 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5894
5895 out:
5896 if (input && input != stack_input) {
5897 FREE(input, M_TEMP);
5898 }
5899 if (result) {
5900 FREE(result, M_TEMP);
5901 }
5902 if (vp) {
5903 vnode_put(vp);
5904 }
5905 if (dvp) {
5906 vnode_put(dvp);
5907 }
5908 if (IS_VALID_CRED(context.vc_ucred)) {
5909 kauth_cred_unref(&context.vc_ucred);
5910 }
5911 return error;
5912 }
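
/*
 * Illustrative sketch (editorial, not part of this file): one way a caller
 * could pack the uap->entries buffer described above -- a single
 * accessx_descriptor followed immediately by its NUL-terminated name
 * string.  Only the ad_name_offset and ad_flags fields used above are
 * shown; any other descriptor fields are left zeroed.  "/tmp/example" is
 * a hypothetical path.
 */
#if 0
	char buf[sizeof(struct accessx_descriptor) + sizeof("/tmp/example")];
	struct accessx_descriptor *ad = (struct accessx_descriptor *)buf;

	bzero(buf, sizeof(buf));
	ad->ad_name_offset = sizeof(struct accessx_descriptor);  /* string follows the descriptor */
	ad->ad_flags = R_OK | W_OK;                               /* access rights to check */
	strlcpy(buf + ad->ad_name_offset, "/tmp/example",
	    sizeof(buf) - ad->ad_name_offset);
	/* buf and sizeof(buf) become uap->entries and uap->size; the per-entry
	 * errno results are written to uap->results. */
#endif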
5913
5914
5915 /*
5916 * Returns: 0 Success
5917 * namei:EFAULT Bad address
5918 * namei:ENAMETOOLONG Filename too long
5919 * namei:ENOENT No such file or directory
5920 * namei:ELOOP Too many levels of symbolic links
5921 * namei:EBADF Bad file descriptor
5922 * namei:ENOTDIR Not a directory
5923 * namei:???
5924 * access1:
5925 */
5926 static int
5927 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5928 int flag, enum uio_seg segflg)
5929 {
5930 int error;
5931 struct nameidata nd;
5932 int niopts;
5933 struct vfs_context context;
5934 #if NAMEDRSRCFORK
5935 int is_namedstream = 0;
5936 #endif
5937
5938 /*
5939 * Unless the AT_EACCESS option is used, Access is defined as checking
5940 * against the process' real identity, even if operations are checking
5941 * the effective identity. So we need to tweak the credential
5942 * in the context for that case.
5943 */
5944 if (!(flag & AT_EACCESS)) {
5945 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5946 } else {
5947 context.vc_ucred = ctx->vc_ucred;
5948 }
5949 context.vc_thread = ctx->vc_thread;
5950
5951
5952 niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
5953 /* need parent for vnode_authorize for deletion test */
5954 if (amode & _DELETE_OK) {
5955 niopts |= WANTPARENT;
5956 }
5957 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5958 path, &context);
5959
5960 #if NAMEDRSRCFORK
5961 /* access(F_OK) calls are allowed for resource forks. */
5962 if (amode == F_OK) {
5963 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5964 }
5965 #endif
5966 error = nameiat(&nd, fd);
5967 if (error) {
5968 goto out;
5969 }
5970
5971 #if NAMEDRSRCFORK
5972 /* Grab reference on the shadow stream file vnode to
5973 * force an inactive on release which will mark it
5974 * for recycle.
5975 */
5976 if (vnode_isnamedstream(nd.ni_vp) &&
5977 (nd.ni_vp->v_parent != NULLVP) &&
5978 vnode_isshadow(nd.ni_vp)) {
5979 is_namedstream = 1;
5980 vnode_ref(nd.ni_vp);
5981 }
5982 #endif
5983
5984 error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5985
5986 #if NAMEDRSRCFORK
5987 if (is_namedstream) {
5988 vnode_rele(nd.ni_vp);
5989 }
5990 #endif
5991
5992 vnode_put(nd.ni_vp);
5993 if (amode & _DELETE_OK) {
5994 vnode_put(nd.ni_dvp);
5995 }
5996 nameidone(&nd);
5997
5998 out:
5999 if (!(flag & AT_EACCESS)) {
6000 kauth_cred_unref(&context.vc_ucred);
6001 }
6002 return error;
6003 }
6004
6005 int
6006 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6007 {
6008 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6009 uap->path, uap->flags, 0, UIO_USERSPACE);
6010 }
6011
6012 int
6013 faccessat(__unused proc_t p, struct faccessat_args *uap,
6014 __unused int32_t *retval)
6015 {
6016 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6017 return EINVAL;
6018 }
6019
6020 return faccessat_internal(vfs_context_current(), uap->fd,
6021 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6022 }
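
/*
 * Illustrative sketch (editorial, not part of this file): from user space,
 * faccessat(2) checks against the real uid/gid by default; AT_EACCESS
 * switches the check to the effective identity, as handled in
 * faccessat_internal() above.  'dirfd' and the name are hypothetical.
 */
#if 0
	/* assumed user-space headers: <unistd.h>, <fcntl.h> */
	faccessat(dirfd, "data.db", R_OK | W_OK, 0);            /* real ids */
	faccessat(dirfd, "data.db", R_OK | W_OK, AT_EACCESS);   /* effective ids */
#endif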
6023
6024 /*
6025 * Returns: 0 Success
6026 * EFAULT
6027 * copyout:EFAULT
6028 * namei:???
6029 * vn_stat:???
6030 */
6031 static int
6032 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6033 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6034 enum uio_seg segflg, int fd, int flag)
6035 {
6036 struct nameidata nd;
6037 int follow;
6038 union {
6039 struct stat sb;
6040 struct stat64 sb64;
6041 } source = {};
6042 union {
6043 struct user64_stat user64_sb;
6044 struct user32_stat user32_sb;
6045 struct user64_stat64 user64_sb64;
6046 struct user32_stat64 user32_sb64;
6047 } dest = {};
6048 caddr_t sbp;
6049 int error, my_size;
6050 kauth_filesec_t fsec;
6051 size_t xsecurity_bufsize;
6052 void * statptr;
6053 struct fileproc *fp = NULL;
6054 int needsrealdev = 0;
6055
6056 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6057 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6058 segflg, path, ctx);
6059
6060 #if NAMEDRSRCFORK
6061 int is_namedstream = 0;
6062 /* stat calls are allowed for resource forks. */
6063 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6064 #endif
6065
6066 if (flag & AT_FDONLY) {
6067 vnode_t fvp;
6068
6069 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6070 if (error) {
6071 return error;
6072 }
6073 if ((error = vnode_getwithref(fvp))) {
6074 file_drop(fd);
6075 return error;
6076 }
6077 nd.ni_vp = fvp;
6078 } else {
6079 error = nameiat(&nd, fd);
6080 if (error) {
6081 return error;
6082 }
6083 }
6084 fsec = KAUTH_FILESEC_NONE;
6085
6086 statptr = (void *)&source;
6087
6088 #if NAMEDRSRCFORK
6089 /* Grab reference on the shadow stream file vnode to
6090 * force an inactive on release which will mark it
6091 * for recycle.
6092 */
6093 if (vnode_isnamedstream(nd.ni_vp) &&
6094 (nd.ni_vp->v_parent != NULLVP) &&
6095 vnode_isshadow(nd.ni_vp)) {
6096 is_namedstream = 1;
6097 vnode_ref(nd.ni_vp);
6098 }
6099 #endif
6100
6101 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6102 if (fp && (xsecurity == USER_ADDR_NULL)) {
6103 /*
6104 * If the caller has the file open, and is not
6105 * requesting extended security information, we are
6106 * going to let them get the basic stat information.
6107 */
6108 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6109 fp->f_fglob->fg_cred);
6110 } else {
6111 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6112 isstat64, needsrealdev, ctx);
6113 }
6114
6115 #if NAMEDRSRCFORK
6116 if (is_namedstream) {
6117 vnode_rele(nd.ni_vp);
6118 }
6119 #endif
6120 vnode_put(nd.ni_vp);
6121 nameidone(&nd);
6122 if (fp) {
6123 file_drop(fd);
6124 fp = NULL;
6125 }
6126
6127 if (error) {
6128 return error;
6129 }
6130 /* Zap spare fields */
6131 if (isstat64 != 0) {
6132 source.sb64.st_lspare = 0;
6133 source.sb64.st_qspare[0] = 0LL;
6134 source.sb64.st_qspare[1] = 0LL;
6135 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6136 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6137 my_size = sizeof(dest.user64_sb64);
6138 sbp = (caddr_t)&dest.user64_sb64;
6139 } else {
6140 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6141 my_size = sizeof(dest.user32_sb64);
6142 sbp = (caddr_t)&dest.user32_sb64;
6143 }
6144 /*
6145 * Check if we raced (post lookup) against the last unlink of a file.
6146 */
6147 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6148 source.sb64.st_nlink = 1;
6149 }
6150 } else {
6151 source.sb.st_lspare = 0;
6152 source.sb.st_qspare[0] = 0LL;
6153 source.sb.st_qspare[1] = 0LL;
6154 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6155 munge_user64_stat(&source.sb, &dest.user64_sb);
6156 my_size = sizeof(dest.user64_sb);
6157 sbp = (caddr_t)&dest.user64_sb;
6158 } else {
6159 munge_user32_stat(&source.sb, &dest.user32_sb);
6160 my_size = sizeof(dest.user32_sb);
6161 sbp = (caddr_t)&dest.user32_sb;
6162 }
6163
6164 /*
6165 * Check if we raced (post lookup) against the last unlink of a file.
6166 */
6167 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6168 source.sb.st_nlink = 1;
6169 }
6170 }
6171 if ((error = copyout(sbp, ub, my_size)) != 0) {
6172 goto out;
6173 }
6174
6175 /* caller wants extended security information? */
6176 if (xsecurity != USER_ADDR_NULL) {
6177 /* did we get any? */
6178 if (fsec == KAUTH_FILESEC_NONE) {
6179 if (susize(xsecurity_size, 0) != 0) {
6180 error = EFAULT;
6181 goto out;
6182 }
6183 } else {
6184 /* find the user buffer size */
6185 xsecurity_bufsize = fusize(xsecurity_size);
6186
6187 /* copy out the actual data size */
6188 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6189 error = EFAULT;
6190 goto out;
6191 }
6192
6193 /* if the caller supplied enough room, copy out to it */
6194 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6195 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6196 }
6197 }
6198 }
6199 out:
6200 if (fsec != KAUTH_FILESEC_NONE) {
6201 kauth_filesec_free(fsec);
6202 }
6203 return error;
6204 }
6205
6206 /*
6207 * stat_extended: Get file status; with extended security (ACL).
6208 *
6209 * Parameters: p (ignored)
6210 * uap User argument descriptor (see below)
6211 * retval (ignored)
6212 *
6213 * Indirect: uap->path Path of file to get status from
6214 * uap->ub User buffer (holds file status info)
6215 * uap->xsecurity ACL to get (extended security)
6216 * uap->xsecurity_size Size of ACL
6217 *
6218 * Returns: 0 Success
6219 * !0 errno value
6220 *
6221 */
6222 int
6223 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6224 __unused int32_t *retval)
6225 {
6226 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6227 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6228 0);
6229 }
6230
6231 /*
6232 * Returns: 0 Success
6233 * fstatat_internal:??? [see fstatat_internal() in this file]
6234 */
6235 int
6236 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6237 {
6238 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6239 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6240 }
6241
6242 int
6243 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6244 {
6245 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6246 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6247 }
6248
6249 /*
6250 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6251 *
6252 * Parameters: p (ignored)
6253 * uap User argument descriptor (see below)
6254 * retval (ignored)
6255 *
6256 * Indirect: uap->path Path of file to get status from
6257 * uap->ub User buffer (holds file status info)
6258 * uap->xsecurity ACL to get (extended security)
6259 * uap->xsecurity_size Size of ACL
6260 *
6261 * Returns: 0 Success
6262 * !0 errno value
6263 *
6264 */
6265 int
6266 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6267 {
6268 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6269 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6270 0);
6271 }
6272
6273 /*
6274 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6275 *
6276 * Parameters: p (ignored)
6277 * uap User argument descriptor (see below)
6278 * retval (ignored)
6279 *
6280 * Indirect: uap->path Path of file to get status from
6281 * uap->ub User buffer (holds file status info)
6282 * uap->xsecurity ACL to get (extended security)
6283 * uap->xsecurity_size Size of ACL
6284 *
6285 * Returns: 0 Success
6286 * !0 errno value
6287 *
6288 */
6289 int
6290 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6291 {
6292 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6293 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6294 AT_SYMLINK_NOFOLLOW);
6295 }
6296
6297 /*
6298 * Get file status; this version does not follow links.
6299 */
6300 int
6301 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6302 {
6303 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6304 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6305 }
6306
6307 int
6308 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6309 {
6310 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6311 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6312 }
6313
6314 /*
6315 * lstat64_extended: Get file status; can handle large inode numbers; does not
6316 * follow links; with extended security (ACL).
6317 *
6318 * Parameters: p (ignored)
6319 * uap User argument descriptor (see below)
6320 * retval (ignored)
6321 *
6322 * Indirect: uap->path Path of file to get status from
6323 * uap->ub User buffer (holds file status info)
6324 * uap->xsecurity ACL to get (extended security)
6325 * uap->xsecurity_size Size of ACL
6326 *
6327 * Returns: 0 Success
6328 * !0 errno value
6329 *
6330 */
6331 int
6332 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6333 {
6334 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6335 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6336 AT_SYMLINK_NOFOLLOW);
6337 }
6338
6339 int
6340 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6341 {
6342 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6343 return EINVAL;
6344 }
6345
6346 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6347 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6348 }
6349
6350 int
6351 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6352 __unused int32_t *retval)
6353 {
6354 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6355 return EINVAL;
6356 }
6357
6358 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6359 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6360 }
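/*
 * Illustrative userspace sketch (assuming a POSIX userland; not a kernel
 * path): the stat()/lstat()/fstatat() wrappers above all funnel into
 * fstatat_internal() with AT_FDCWD or a caller-supplied directory fd.
 * A typical call that refuses to follow a trailing symlink:
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *
 *	// dirfd is a previously opened directory descriptor
 *	struct stat sb;
 *	if (fstatat(dirfd, "entry", &sb, AT_SYMLINK_NOFOLLOW) == -1) {
 *		// errno carries the value returned by fstatat_internal()
 *	}
 */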
6361
6362 /*
6363 * Get configurable pathname variables.
6364 *
6365 * Returns: 0 Success
6366 * namei:???
6367 * vn_pathconf:???
6368 *
6369 * Notes: Global implementation constants are intended to be
6370 * implemented in this function directly; all other constants
6371 * are per-FS implementation, and therefore must be handled in
6372 * each respective FS, instead.
6373 *
6374 * XXX We implement some things globally right now that should actually be
6375 * XXX per-FS; we will need to deal with this at some point.
6376 */
6377 /* ARGSUSED */
6378 int
6379 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6380 {
6381 int error;
6382 struct nameidata nd;
6383 vfs_context_t ctx = vfs_context_current();
6384
6385 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6386 UIO_USERSPACE, uap->path, ctx);
6387 error = namei(&nd);
6388 if (error) {
6389 return error;
6390 }
6391
6392 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6393
6394 vnode_put(nd.ni_vp);
6395 nameidone(&nd);
6396 return error;
6397 }
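/*
 * Illustrative userspace sketch (assuming the usual libc wrappers): the
 * pathconf() entry point above is reached through pathconf(3)/fpathconf(3).
 * A -1 return with errno left unchanged means "no limit", so errno must be
 * cleared first:
 *
 *	#include <unistd.h>
 *	#include <errno.h>
 *
 *	errno = 0;
 *	long name_max = pathconf("/tmp", _PC_NAME_MAX);
 *	if (name_max == -1 && errno != 0) {
 *		// genuine failure propagated from vn_pathconf()
 *	}
 */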
6398
6399 /*
6400 * Return target name of a symbolic link.
6401 */
6402 /* ARGSUSED */
6403 static int
6404 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
6405 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6406 int *retval)
6407 {
6408 vnode_t vp;
6409 uio_t auio;
6410 int error;
6411 struct nameidata nd;
6412 char uio_buf[UIO_SIZEOF(1)];
6413
6414 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6415 seg, path, ctx);
6416
6417 error = nameiat(&nd, fd);
6418 if (error) {
6419 return error;
6420 }
6421 vp = nd.ni_vp;
6422
6423 nameidone(&nd);
6424
6425 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6426 &uio_buf[0], sizeof(uio_buf));
6427 uio_addiov(auio, buf, bufsize);
6428 if (vp->v_type != VLNK) {
6429 error = EINVAL;
6430 } else {
6431 #if CONFIG_MACF
6432 error = mac_vnode_check_readlink(ctx, vp);
6433 #endif
6434 if (error == 0) {
6435 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6436 ctx);
6437 }
6438 if (error == 0) {
6439 error = VNOP_READLINK(vp, auio, ctx);
6440 }
6441 }
6442 vnode_put(vp);
6443
6444 *retval = bufsize - (int)uio_resid(auio);
6445 return error;
6446 }
6447
6448 int
6449 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6450 {
6451 enum uio_seg procseg;
6452
6453 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6454 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6455 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6456 uap->count, procseg, retval);
6457 }
6458
6459 int
6460 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6461 {
6462 enum uio_seg procseg;
6463
6464 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6465 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6466 procseg, uap->buf, uap->bufsize, procseg, retval);
6467 }
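/*
 * Illustrative userspace sketch (assuming a POSIX userland): as on the
 * kernel side above, readlink(2) reports the number of bytes copied and
 * does not NUL-terminate the buffer, so callers terminate it themselves:
 *
 *	#include <sys/param.h>
 *	#include <unistd.h>
 *
 *	char target[MAXPATHLEN];
 *	ssize_t n = readlink("/var", target, sizeof(target) - 1);
 *	if (n >= 0) {
 *		target[n] = '\0';
 *	}
 */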
6468
6469 /*
6470 * Change file flags, the deep inner layer.
6471 */
6472 static int
6473 chflags0(vnode_t vp, struct vnode_attr *va,
6474 int (*setattr)(vnode_t, void *, vfs_context_t),
6475 void *arg, vfs_context_t ctx)
6476 {
6477 kauth_action_t action = 0;
6478 int error;
6479
6480 #if CONFIG_MACF
6481 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
6482 if (error) {
6483 goto out;
6484 }
6485 #endif
6486
6487 /* request authorisation, disregard immutability */
6488 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
6489 goto out;
6490 }
6491 /*
6492 * Request that the auth layer disregard those file flags it's allowed to when
6493 * authorizing this operation; we need to do this in order to be able to
6494 * clear immutable flags.
6495 */
6496 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
6497 goto out;
6498 }
6499 error = (*setattr)(vp, arg, ctx);
6500
6501 #if CONFIG_MACF
6502 if (error == 0) {
6503 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
6504 }
6505 #endif
6506
6507 out:
6508 return error;
6509 }
6510
6511 /*
6512 * Change file flags.
6513 *
6514 * NOTE: this will vnode_put() `vp'
6515 */
6516 static int
6517 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6518 {
6519 struct vnode_attr va;
6520 int error;
6521
6522 VATTR_INIT(&va);
6523 VATTR_SET(&va, va_flags, flags);
6524
6525 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6526 vnode_put(vp);
6527
6528 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6529 error = ENOTSUP;
6530 }
6531
6532 return error;
6533 }
6534
6535 /*
6536 * Change flags of a file given a path name.
6537 */
6538 /* ARGSUSED */
6539 int
6540 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6541 {
6542 vnode_t vp;
6543 vfs_context_t ctx = vfs_context_current();
6544 int error;
6545 struct nameidata nd;
6546
6547 AUDIT_ARG(fflags, uap->flags);
6548 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6549 UIO_USERSPACE, uap->path, ctx);
6550 error = namei(&nd);
6551 if (error) {
6552 return error;
6553 }
6554 vp = nd.ni_vp;
6555 nameidone(&nd);
6556
6557 /* we don't vnode_put() here because chflags1 does so internally */
6558 error = chflags1(vp, uap->flags, ctx);
6559
6560 return error;
6561 }
6562
6563 /*
6564 * Change flags of a file given a file descriptor.
6565 */
6566 /* ARGSUSED */
6567 int
6568 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6569 {
6570 vnode_t vp;
6571 int error;
6572
6573 AUDIT_ARG(fd, uap->fd);
6574 AUDIT_ARG(fflags, uap->flags);
6575 if ((error = file_vnode(uap->fd, &vp))) {
6576 return error;
6577 }
6578
6579 if ((error = vnode_getwithref(vp))) {
6580 file_drop(uap->fd);
6581 return error;
6582 }
6583
6584 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6585
6586 /* we don't vnode_put() here because chflags1 does so internally */
6587 error = chflags1(vp, uap->flags, vfs_context_current());
6588
6589 file_drop(uap->fd);
6590 return error;
6591 }
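/*
 * Illustrative userspace sketch (assuming the BSD chflags(2) interface in
 * <sys/stat.h>): the entry points above let a sufficiently privileged
 * caller both set and clear immutability, because chflags0() authorizes
 * with KAUTH_VNODE_NOIMMUTABLE:
 *
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	// mark a file user-immutable, then clear the flag again
 *	if (chflags("/tmp/pinned", UF_IMMUTABLE) == 0) {
 *		(void)chflags("/tmp/pinned", 0);
 *	}
 */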
6592
6593 /*
6594 * Change security information on a filesystem object.
6595 *
6596 * Returns: 0 Success
6597 * EPERM Operation not permitted
6598 * vnode_authattr:??? [anything vnode_authattr can return]
6599 * vnode_authorize:??? [anything vnode_authorize can return]
6600 * vnode_setattr:??? [anything vnode_setattr can return]
6601 *
6602 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6603 * translated to EPERM before being returned.
6604 */
6605 static int
6606 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
6607 {
6608 kauth_action_t action;
6609 int error;
6610
6611 AUDIT_ARG(mode, vap->va_mode);
6612 /* XXX audit new args */
6613
6614 #if NAMEDSTREAMS
6615 /* chmod calls are not allowed for resource forks. */
6616 if (vp->v_flag & VISNAMEDSTREAM) {
6617 return EPERM;
6618 }
6619 #endif
6620
6621 #if CONFIG_MACF
6622 if (VATTR_IS_ACTIVE(vap, va_mode) &&
6623 (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
6624 return error;
6625 }
6626
6627 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6628 if ((error = mac_vnode_check_setowner(ctx, vp,
6629 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6630 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
6631 return error;
6632 }
6633 }
6634
6635 if (VATTR_IS_ACTIVE(vap, va_acl) &&
6636 (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
6637 return error;
6638 }
6639 #endif
6640
6641 /* make sure that the caller is allowed to set this security information */
6642 if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
6643 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6644 if (error == EACCES) {
6645 error = EPERM;
6646 }
6647 return error;
6648 }
6649
6650 if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
6651 return error;
6652 }
6653
6654 #if CONFIG_MACF
6655 if (VATTR_IS_ACTIVE(vap, va_mode)) {
6656 mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
6657 }
6658
6659 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6660 mac_vnode_notify_setowner(ctx, vp,
6661 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6662 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
6663 }
6664
6665 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6666 mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
6667 }
6668 #endif
6669
6670 return error;
6671 }
6672
6673
6674 /*
6675 * Change mode of a file given a path name.
6676 *
6677 * Returns: 0 Success
6678 * namei:??? [anything namei can return]
6679 * chmod_vnode:??? [anything chmod_vnode can return]
6680 */
6681 static int
6682 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6683 int fd, int flag, enum uio_seg segflg)
6684 {
6685 struct nameidata nd;
6686 int follow, error;
6687
6688 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6689 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6690 segflg, path, ctx);
6691 if ((error = nameiat(&nd, fd))) {
6692 return error;
6693 }
6694 error = chmod_vnode(ctx, nd.ni_vp, vap);
6695 vnode_put(nd.ni_vp);
6696 nameidone(&nd);
6697 return error;
6698 }
6699
6700 /*
6701 * chmod_extended: Change the mode of a file given a path name; with extended
6702 * argument list (including extended security (ACL)).
6703 *
6704 * Parameters: p Process requesting the open
6705 * uap User argument descriptor (see below)
6706 * retval (ignored)
6707 *
6708 * Indirect: uap->path Path to object (same as 'chmod')
6709 * uap->uid UID to set
6710 * uap->gid GID to set
6711 * uap->mode File mode to set (same as 'chmod')
6712 * uap->xsecurity ACL to set (or delete)
6713 *
6714 * Returns: 0 Success
6715 * !0 errno value
6716 *
6717 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6718 *
6719 * XXX: We should enumerate the possible errno values here, and where
6720 * in the code they originated.
6721 */
6722 int
6723 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
6724 {
6725 int error;
6726 struct vnode_attr va;
6727 kauth_filesec_t xsecdst;
6728
6729 AUDIT_ARG(owner, uap->uid, uap->gid);
6730
6731 VATTR_INIT(&va);
6732 if (uap->mode != -1) {
6733 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6734 }
6735 if (uap->uid != KAUTH_UID_NONE) {
6736 VATTR_SET(&va, va_uid, uap->uid);
6737 }
6738 if (uap->gid != KAUTH_GID_NONE) {
6739 VATTR_SET(&va, va_gid, uap->gid);
6740 }
6741
6742 xsecdst = NULL;
6743 switch (uap->xsecurity) {
6744 /* explicit remove request */
6745 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6746 VATTR_SET(&va, va_acl, NULL);
6747 break;
6748 /* not being set */
6749 case USER_ADDR_NULL:
6750 break;
6751 default:
6752 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6753 return error;
6754 }
6755 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6756 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
6757 }
6758
6759 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
6760 UIO_USERSPACE);
6761
6762 if (xsecdst != NULL) {
6763 kauth_filesec_free(xsecdst);
6764 }
6765 return error;
6766 }
6767
6768 /*
6769 * Returns: 0 Success
6770 * chmodat:??? [anything chmodat can return]
6771 */
6772 static int
6773 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6774 int flag, enum uio_seg segflg)
6775 {
6776 struct vnode_attr va;
6777
6778 VATTR_INIT(&va);
6779 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6780
6781 return chmodat(ctx, path, &va, fd, flag, segflg);
6782 }
6783
6784 int
6785 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6786 {
6787 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6788 AT_FDCWD, 0, UIO_USERSPACE);
6789 }
6790
6791 int
6792 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6793 {
6794 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6795 return EINVAL;
6796 }
6797
6798 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6799 uap->fd, uap->flag, UIO_USERSPACE);
6800 }
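/*
 * Illustrative userspace sketch (assuming a POSIX userland): fchmodat()
 * above accepts only AT_SYMLINK_NOFOLLOW in 'flag'; anything else fails
 * with EINVAL before the lookup.  Note that chmod_vnode() reports
 * authorization failures as EPERM rather than EACCES:
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *
 *	// dirfd is a previously opened directory descriptor
 *	if (fchmodat(dirfd, "conf", 0640, AT_SYMLINK_NOFOLLOW) == -1) {
 *		// EPERM here may be a translated EACCES (see chmod_vnode)
 *	}
 */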
6801
6802 /*
6803 * Change mode of a file given a file descriptor.
6804 */
6805 static int
6806 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6807 {
6808 vnode_t vp;
6809 int error;
6810
6811 AUDIT_ARG(fd, fd);
6812
6813 if ((error = file_vnode(fd, &vp)) != 0) {
6814 return error;
6815 }
6816 if ((error = vnode_getwithref(vp)) != 0) {
6817 file_drop(fd);
6818 return error;
6819 }
6820 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6821
6822 error = chmod_vnode(vfs_context_current(), vp, vap);
6823 (void)vnode_put(vp);
6824 file_drop(fd);
6825
6826 return error;
6827 }
6828
6829 /*
6830 * fchmod_extended: Change mode of a file given a file descriptor; with
6831 * extended argument list (including extended security (ACL)).
6832 *
6833 * Parameters: p Process requesting to change file mode
6834 * uap User argument descriptor (see below)
6835 * retval (ignored)
6836 *
6837 * Indirect: uap->mode File mode to set (same as 'chmod')
6838 * uap->uid UID to set
6839 * uap->gid GID to set
6840 * uap->xsecurity ACL to set (or delete)
6841 * uap->fd File descriptor of file to change mode
6842 *
6843 * Returns: 0 Success
6844 * !0 errno value
6845 *
6846 */
6847 int
6848 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6849 {
6850 int error;
6851 struct vnode_attr va;
6852 kauth_filesec_t xsecdst;
6853
6854 AUDIT_ARG(owner, uap->uid, uap->gid);
6855
6856 VATTR_INIT(&va);
6857 if (uap->mode != -1) {
6858 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6859 }
6860 if (uap->uid != KAUTH_UID_NONE) {
6861 VATTR_SET(&va, va_uid, uap->uid);
6862 }
6863 if (uap->gid != KAUTH_GID_NONE) {
6864 VATTR_SET(&va, va_gid, uap->gid);
6865 }
6866
6867 xsecdst = NULL;
6868 switch (uap->xsecurity) {
6869 case USER_ADDR_NULL:
6870 VATTR_SET(&va, va_acl, NULL);
6871 break;
6872 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6873 VATTR_SET(&va, va_acl, NULL);
6874 break;
6875 /* not being set */
6876 case CAST_USER_ADDR_T(-1):
6877 break;
6878 default:
6879 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6880 return error;
6881 }
6882 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6883 }
6884
6885 error = fchmod1(p, uap->fd, &va);
6886
6887
6888 switch (uap->xsecurity) {
6889 case USER_ADDR_NULL:
6890 case CAST_USER_ADDR_T(-1):
6891 break;
6892 default:
6893 if (xsecdst != NULL) {
6894 kauth_filesec_free(xsecdst);
6895 }
6896 }
6897 return error;
6898 }
6899
6900 int
6901 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6902 {
6903 struct vnode_attr va;
6904
6905 VATTR_INIT(&va);
6906 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6907
6908 return fchmod1(p, uap->fd, &va);
6909 }
6910
6911
6912 /*
6913 * Set ownership given a path name.
6914 */
6915 /* ARGSUSED */
6916 static int
6917 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6918 gid_t gid, int flag, enum uio_seg segflg)
6919 {
6920 vnode_t vp;
6921 struct vnode_attr va;
6922 int error;
6923 struct nameidata nd;
6924 int follow;
6925 kauth_action_t action;
6926
6927 AUDIT_ARG(owner, uid, gid);
6928
6929 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6930 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6931 path, ctx);
6932 error = nameiat(&nd, fd);
6933 if (error) {
6934 return error;
6935 }
6936 vp = nd.ni_vp;
6937
6938 nameidone(&nd);
6939
6940 VATTR_INIT(&va);
6941 if (uid != (uid_t)VNOVAL) {
6942 VATTR_SET(&va, va_uid, uid);
6943 }
6944 if (gid != (gid_t)VNOVAL) {
6945 VATTR_SET(&va, va_gid, gid);
6946 }
6947
6948 #if CONFIG_MACF
6949 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6950 if (error) {
6951 goto out;
6952 }
6953 #endif
6954
6955 /* preflight and authorize attribute changes */
6956 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6957 goto out;
6958 }
6959 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6960 goto out;
6961 }
6962 error = vnode_setattr(vp, &va, ctx);
6963
6964 #if CONFIG_MACF
6965 if (error == 0) {
6966 mac_vnode_notify_setowner(ctx, vp, uid, gid);
6967 }
6968 #endif
6969
6970 out:
6971 /*
6972 * EACCES is only allowed from namei(); permissions failure should
6973 * return EPERM, so we need to translate the error code.
6974 */
6975 if (error == EACCES) {
6976 error = EPERM;
6977 }
6978
6979 vnode_put(vp);
6980 return error;
6981 }
6982
6983 int
6984 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6985 {
6986 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6987 uap->uid, uap->gid, 0, UIO_USERSPACE);
6988 }
6989
6990 int
6991 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6992 {
6993 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6994 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
6995 }
6996
6997 int
6998 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6999 {
7000 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7001 return EINVAL;
7002 }
7003
7004 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7005 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7006 }
7007
7008 /*
7009 * Set ownership given a file descriptor.
7010 */
7011 /* ARGSUSED */
7012 int
7013 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7014 {
7015 struct vnode_attr va;
7016 vfs_context_t ctx = vfs_context_current();
7017 vnode_t vp;
7018 int error;
7019 kauth_action_t action;
7020
7021 AUDIT_ARG(owner, uap->uid, uap->gid);
7022 AUDIT_ARG(fd, uap->fd);
7023
7024 if ((error = file_vnode(uap->fd, &vp))) {
7025 return error;
7026 }
7027
7028 if ((error = vnode_getwithref(vp))) {
7029 file_drop(uap->fd);
7030 return error;
7031 }
7032 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7033
7034 VATTR_INIT(&va);
7035 if (uap->uid != VNOVAL) {
7036 VATTR_SET(&va, va_uid, uap->uid);
7037 }
7038 if (uap->gid != VNOVAL) {
7039 VATTR_SET(&va, va_gid, uap->gid);
7040 }
7041
7042 #if NAMEDSTREAMS
7043 /* chown calls are not allowed for resource forks. */
7044 if (vp->v_flag & VISNAMEDSTREAM) {
7045 error = EPERM;
7046 goto out;
7047 }
7048 #endif
7049
7050 #if CONFIG_MACF
7051 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
7052 if (error) {
7053 goto out;
7054 }
7055 #endif
7056
7057 /* preflight and authorize attribute changes */
7058 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7059 goto out;
7060 }
7061 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7062 if (error == EACCES) {
7063 error = EPERM;
7064 }
7065 goto out;
7066 }
7067 error = vnode_setattr(vp, &va, ctx);
7068
7069 #if CONFIG_MACF
7070 if (error == 0) {
7071 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
7072 }
7073 #endif
7074
7075 out:
7076 (void)vnode_put(vp);
7077 file_drop(uap->fd);
7078 return error;
7079 }
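/*
 * Illustrative userspace sketch (assuming a POSIX userland): chown(),
 * lchown() and fchownat() all reach fchownat_internal() above; lchown()
 * simply passes AT_SYMLINK_NOFOLLOW so the link itself is modified.
 * Passing -1 for either id leaves that id unchanged (VNOVAL in the kernel):
 *
 *	#include <unistd.h>
 *
 *	// staff_gid is a placeholder; change only the group of the symlink
 *	if (lchown("/tmp/link", (uid_t)-1, staff_gid) == -1) {
 *		// EACCES from the auth layer is reported as EPERM (see above)
 *	}
 */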
7080
7081 static int
7082 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7083 {
7084 int error;
7085
7086 if (usrtvp == USER_ADDR_NULL) {
7087 struct timeval old_tv;
7088 /* XXX Y2038 bug because of microtime argument */
7089 microtime(&old_tv);
7090 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7091 tsp[1] = tsp[0];
7092 } else {
7093 if (IS_64BIT_PROCESS(current_proc())) {
7094 struct user64_timeval tv[2];
7095 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7096 if (error) {
7097 return error;
7098 }
7099 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7100 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7101 } else {
7102 struct user32_timeval tv[2];
7103 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7104 if (error) {
7105 return error;
7106 }
7107 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7108 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7109 }
7110 }
7111 return 0;
7112 }
7113
7114 static int
7115 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
7116 int nullflag)
7117 {
7118 int error;
7119 struct vnode_attr va;
7120 kauth_action_t action;
7121
7122 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7123
7124 VATTR_INIT(&va);
7125 VATTR_SET(&va, va_access_time, ts[0]);
7126 VATTR_SET(&va, va_modify_time, ts[1]);
7127 if (nullflag) {
7128 va.va_vaflags |= VA_UTIMES_NULL;
7129 }
7130
7131 #if NAMEDSTREAMS
7132 /* utimes calls are not allowed for resource forks. */
7133 if (vp->v_flag & VISNAMEDSTREAM) {
7134 error = EPERM;
7135 goto out;
7136 }
7137 #endif
7138
7139 #if CONFIG_MACF
7140 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
7141 if (error) {
7142 goto out;
7143 }
7144 #endif
7145 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7146 if (!nullflag && error == EACCES) {
7147 error = EPERM;
7148 }
7149 goto out;
7150 }
7151
7152 /* since we may not need to auth anything, check here */
7153 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7154 if (!nullflag && error == EACCES) {
7155 error = EPERM;
7156 }
7157 goto out;
7158 }
7159 error = vnode_setattr(vp, &va, ctx);
7160
7161 #if CONFIG_MACF
7162 if (error == 0) {
7163 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
7164 }
7165 #endif
7166
7167 out:
7168 return error;
7169 }
7170
7171 /*
7172 * Set the access and modification times of a file.
7173 */
7174 /* ARGSUSED */
7175 int
7176 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
7177 {
7178 struct timespec ts[2];
7179 user_addr_t usrtvp;
7180 int error;
7181 struct nameidata nd;
7182 vfs_context_t ctx = vfs_context_current();
7183
7184 /*
7185 * AUDIT: Needed to change the order of operations to do the
7186 * name lookup first because auditing wants the path.
7187 */
7188 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
7189 UIO_USERSPACE, uap->path, ctx);
7190 error = namei(&nd);
7191 if (error) {
7192 return error;
7193 }
7194 nameidone(&nd);
7195
7196 /*
7197 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
7198 * the current time instead.
7199 */
7200 usrtvp = uap->tptr;
7201 if ((error = getutimes(usrtvp, ts)) != 0) {
7202 goto out;
7203 }
7204
7205 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
7206
7207 out:
7208 vnode_put(nd.ni_vp);
7209 return error;
7210 }
7211
7212 /*
7213 * Set the access and modification times of a file.
7214 */
7215 /* ARGSUSED */
7216 int
7217 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7218 {
7219 struct timespec ts[2];
7220 vnode_t vp;
7221 user_addr_t usrtvp;
7222 int error;
7223
7224 AUDIT_ARG(fd, uap->fd);
7225 usrtvp = uap->tptr;
7226 if ((error = getutimes(usrtvp, ts)) != 0) {
7227 return error;
7228 }
7229 if ((error = file_vnode(uap->fd, &vp)) != 0) {
7230 return error;
7231 }
7232 if ((error = vnode_getwithref(vp))) {
7233 file_drop(uap->fd);
7234 return error;
7235 }
7236
7237 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7238 vnode_put(vp);
7239 file_drop(uap->fd);
7240 return error;
7241 }
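/*
 * Illustrative userspace sketch (assuming a POSIX userland): getutimes()
 * above substitutes the current time when the user pointer is NULL, and
 * setutimes() then sets VA_UTIMES_NULL so write permission is sufficient;
 * with explicit times the caller generally must own the file:
 *
 *	#include <stddef.h>
 *	#include <sys/time.h>
 *
 *	// "touch" semantics: both timestamps set to now, write access suffices
 *	(void)utimes("/tmp/stamp", NULL);
 *
 *	// explicit times: EACCES is promoted to EPERM for non-owners
 *	struct timeval tv[2] = {{ 0, 0 }, { 0, 0 }};
 *	(void)utimes("/tmp/stamp", tv);
 */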
7242
7243 /*
7244 * Truncate a file given its path name.
7245 */
7246 /* ARGSUSED */
7247 int
7248 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
7249 {
7250 vnode_t vp;
7251 struct vnode_attr va;
7252 vfs_context_t ctx = vfs_context_current();
7253 int error;
7254 struct nameidata nd;
7255 kauth_action_t action;
7256
7257 if (uap->length < 0) {
7258 return EINVAL;
7259 }
7260 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
7261 UIO_USERSPACE, uap->path, ctx);
7262 if ((error = namei(&nd))) {
7263 return error;
7264 }
7265 vp = nd.ni_vp;
7266
7267 nameidone(&nd);
7268
7269 VATTR_INIT(&va);
7270 VATTR_SET(&va, va_data_size, uap->length);
7271
7272 #if CONFIG_MACF
7273 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
7274 if (error) {
7275 goto out;
7276 }
7277 #endif
7278
7279 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7280 goto out;
7281 }
7282 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7283 goto out;
7284 }
7285 error = vnode_setattr(vp, &va, ctx);
7286
7287 #if CONFIG_MACF
7288 if (error == 0) {
7289 mac_vnode_notify_truncate(ctx, NOCRED, vp);
7290 }
7291 #endif
7292
7293 out:
7294 vnode_put(vp);
7295 return error;
7296 }
7297
7298 /*
7299 * Truncate a file given a file descriptor.
7300 */
7301 /* ARGSUSED */
7302 int
7303 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
7304 {
7305 vfs_context_t ctx = vfs_context_current();
7306 struct vnode_attr va;
7307 vnode_t vp;
7308 struct fileproc *fp;
7309 int error;
7310 int fd = uap->fd;
7311
7312 AUDIT_ARG(fd, uap->fd);
7313 if (uap->length < 0) {
7314 return EINVAL;
7315 }
7316
7317 if ((error = fp_lookup(p, fd, &fp, 0))) {
7318 return error;
7319 }
7320
7321 switch (FILEGLOB_DTYPE(fp->f_fglob)) {
7322 case DTYPE_PSXSHM:
7323 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
7324 goto out;
7325 case DTYPE_VNODE:
7326 break;
7327 default:
7328 error = EINVAL;
7329 goto out;
7330 }
7331
7332 vp = (vnode_t)fp->f_fglob->fg_data;
7333
7334 if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
7335 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7336 error = EINVAL;
7337 goto out;
7338 }
7339
7340 if ((error = vnode_getwithref(vp)) != 0) {
7341 goto out;
7342 }
7343
7344 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7345
7346 #if CONFIG_MACF
7347 error = mac_vnode_check_truncate(ctx,
7348 fp->f_fglob->fg_cred, vp);
7349 if (error) {
7350 (void)vnode_put(vp);
7351 goto out;
7352 }
7353 #endif
7354 VATTR_INIT(&va);
7355 VATTR_SET(&va, va_data_size, uap->length);
7356 error = vnode_setattr(vp, &va, ctx);
7357
7358 #if CONFIG_MACF
7359 if (error == 0) {
7360 mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
7361 }
7362 #endif
7363
7364 (void)vnode_put(vp);
7365 out:
7366 file_drop(fd);
7367 return error;
7368 }
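/*
 * Illustrative userspace sketch (assuming a POSIX userland): ftruncate()
 * above requires a descriptor opened for writing (FWRITE) and accepts only
 * vnodes and POSIX shared memory objects; negative lengths fail early:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/tmp/data", O_RDWR | O_CREAT, 0644);
 *	if (fd != -1 && ftruncate(fd, 4096) == -1) {
 *		// EINVAL: not opened for writing, or unsupported descriptor type
 *	}
 */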
7369
7370
7371 /*
7372 * Sync an open file with synchronized I/O _file_ integrity completion
7373 */
7374 /* ARGSUSED */
7375 int
7376 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7377 {
7378 __pthread_testcancel(1);
7379 return fsync_common(p, uap, MNT_WAIT);
7380 }
7381
7382
7383 /*
7384 * Sync an open file with synchronized I/O _file_ integrity completion
7385 *
7386 * Notes: This is a legacy support function that does not test for
7387 * thread cancellation points.
7388 */
7389 /* ARGSUSED */
7390 int
7391 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7392 {
7393 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7394 }
7395
7396
7397 /*
7398 * Sync an open file with synchronized I/O _data_ integrity completion
7399 */
7400 /* ARGSUSED */
7401 int
7402 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7403 {
7404 __pthread_testcancel(1);
7405 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7406 }
7407
7408
7409 /*
7410 * fsync_common
7411 *
7412 * Common fsync code to support both synchronized I/O file integrity completion
7413 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7414 *
7415 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7416 * will only guarantee that the file data contents are retrievable. If
7417 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
7418 * additionally requires that metadata unnecessary for retrieving the file
7419 * data contents, such as atime, mtime, ctime, etc., also be committed to
7420 * stable storage.
7421 *
7422 * Parameters: p The process
7423 * uap->fd The descriptor to synchronize
7424 * flags The data integrity flags
7425 *
7426 * Returns: int Success
7427 * fp_getfvp:EBADF Bad file descriptor
7428 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7429 * VNOP_FSYNC:??? unspecified
7430 *
7431 * Notes: We use struct fsync_args because it is a short name, and all
7432 * caller argument structures are otherwise identical.
7433 */
7434 static int
7435 fsync_common(proc_t p, struct fsync_args *uap, int flags)
7436 {
7437 vnode_t vp;
7438 struct fileproc *fp;
7439 vfs_context_t ctx = vfs_context_current();
7440 int error;
7441
7442 AUDIT_ARG(fd, uap->fd);
7443
7444 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
7445 return error;
7446 }
7447 if ((error = vnode_getwithref(vp))) {
7448 file_drop(uap->fd);
7449 return error;
7450 }
7451
7452 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7453
7454 error = VNOP_FSYNC(vp, flags, ctx);
7455
7456 #if NAMEDRSRCFORK
7457 /* Sync resource fork shadow file if necessary. */
7458 if ((error == 0) &&
7459 (vp->v_flag & VISNAMEDSTREAM) &&
7460 (vp->v_parent != NULLVP) &&
7461 vnode_isshadow(vp) &&
7462 (fp->f_flags & FP_WRITTEN)) {
7463 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
7464 }
7465 #endif
7466
7467 (void)vnode_put(vp);
7468 file_drop(uap->fd);
7469 return error;
7470 }
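/*
 * Illustrative userspace sketch (assuming a POSIX userland): the MNT_WAIT
 * versus MNT_DWAIT distinction described above surfaces to applications as
 * the choice between fsync(2) and fdatasync(2):
 *
 *	#include <unistd.h>
 *
 *	// full file integrity: data plus metadata (atime, mtime, ctime, ...)
 *	if (fsync(fd) == -1) {
 *		// handle error
 *	}
 *
 *	// data integrity only: enough to read the file contents back later
 *	if (fdatasync(fd) == -1) {
 *		// handle error
 *	}
 */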
7471
7472 /*
7473 * Duplicate files. Source must be a file, target must be a file or
7474 * must not exist.
7475 *
7476 * XXX Copyfile authorisation checking is woefully inadequate, and will not
7477 * perform inheritance correctly.
7478 */
7479 /* ARGSUSED */
7480 int
7481 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
7482 {
7483 vnode_t tvp, fvp, tdvp, sdvp;
7484 struct nameidata fromnd, tond;
7485 int error;
7486 vfs_context_t ctx = vfs_context_current();
7487 #if CONFIG_MACF
7488 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
7489 struct vnode_attr va;
7490 #endif
7491
7492 /* Check that the flags are valid. */
7493
7494 if (uap->flags & ~CPF_MASK) {
7495 return EINVAL;
7496 }
7497
7498 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
7499 UIO_USERSPACE, uap->from, ctx);
7500 if ((error = namei(&fromnd))) {
7501 return error;
7502 }
7503 fvp = fromnd.ni_vp;
7504
7505 NDINIT(&tond, CREATE, OP_LINK,
7506 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7507 UIO_USERSPACE, uap->to, ctx);
7508 if ((error = namei(&tond))) {
7509 goto out1;
7510 }
7511 tdvp = tond.ni_dvp;
7512 tvp = tond.ni_vp;
7513
7514 if (tvp != NULL) {
7515 if (!(uap->flags & CPF_OVERWRITE)) {
7516 error = EEXIST;
7517 goto out;
7518 }
7519 }
7520
7521 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
7522 error = EISDIR;
7523 goto out;
7524 }
7525
7526 /* This calls existing MAC hooks for open */
7527 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
7528 NULL))) {
7529 goto out;
7530 }
7531
7532 if (tvp) {
7533 /*
7534 * See unlinkat_internal for an explanation of the potential
7535 * ENOENT from the MAC hook; the gist is that the MAC hook
7536 * can fail because vn_getpath isn't able to return the full
7537 * path. We choose to ignore this failure.
7538 */
7539 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
7540 if (error && error != ENOENT) {
7541 goto out;
7542 }
7543 error = 0;
7544 }
7545
7546 #if CONFIG_MACF
7547 VATTR_INIT(&va);
7548 VATTR_SET(&va, va_type, fvp->v_type);
7549 /* Mask off all but regular access permissions */
7550 VATTR_SET(&va, va_mode,
7551 ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
7552 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
7553 if (error) {
7554 goto out;
7555 }
7556 #endif /* CONFIG_MACF */
7557
7558 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
7559 goto out;
7560 }
7561
7562 if (fvp == tdvp) {
7563 error = EINVAL;
7564 }
7565 /*
7566 * If source is the same as the destination (that is the
7567 * same inode number) then there is nothing to do.
7568 * (fixed to have POSIX semantics - CSM 3/2/98)
7569 */
7570 if (fvp == tvp) {
7571 error = -1;
7572 }
7573 if (!error) {
7574 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
7575 }
7576 out:
7577 sdvp = tond.ni_startdir;
7578 /*
7579 * nameidone has to happen before we vnode_put(tdvp)
7580 * since it may need to release the fs_nodelock on the tdvp
7581 */
7582 nameidone(&tond);
7583
7584 if (tvp) {
7585 vnode_put(tvp);
7586 }
7587 vnode_put(tdvp);
7588 vnode_put(sdvp);
7589 out1:
7590 vnode_put(fvp);
7591
7592 nameidone(&fromnd);
7593
7594 if (error == -1) {
7595 return 0;
7596 }
7597 return error;
7598 }
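/*
 * Hedged usage note: the copyfile() system call above is distinct from the
 * copyfile(3) library routine in <copyfile.h>, which is what applications
 * normally use to duplicate a file along with its metadata.  A sketch,
 * assuming the library API:
 *
 *	#include <copyfile.h>
 *
 *	// copy data, metadata, ACLs and extended attributes
 *	if (copyfile("/tmp/src", "/tmp/dst", NULL, COPYFILE_ALL) < 0) {
 *		// handle error
 *	}
 */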
7599
7600 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7601
7602 /*
7603 * Helper function for doing clones. The caller is expected to provide an
7604 * iocounted source vnode and release it.
7605 */
7606 static int
7607 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
7608 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
7609 {
7610 vnode_t tvp, tdvp;
7611 struct nameidata tond;
7612 int error;
7613 int follow;
7614 boolean_t free_src_acl;
7615 boolean_t attr_cleanup;
7616 enum vtype v_type;
7617 kauth_action_t action;
7618 struct componentname *cnp;
7619 uint32_t defaulted;
7620 struct vnode_attr va;
7621 struct vnode_attr nva;
7622 uint32_t vnop_flags;
7623
7624 v_type = vnode_vtype(fvp);
7625 switch (v_type) {
7626 case VLNK:
7627 /* FALLTHRU */
7628 case VREG:
7629 action = KAUTH_VNODE_ADD_FILE;
7630 break;
7631 case VDIR:
7632 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
7633 fvp->v_mountedhere) {
7634 return EINVAL;
7635 }
7636 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
7637 break;
7638 default:
7639 return EINVAL;
7640 }
7641
7642 AUDIT_ARG(fd2, dst_dirfd);
7643 AUDIT_ARG(value32, flags);
7644
7645 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7646 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
7647 UIO_USERSPACE, dst, ctx);
7648 if ((error = nameiat(&tond, dst_dirfd))) {
7649 return error;
7650 }
7651 cnp = &tond.ni_cnd;
7652 tdvp = tond.ni_dvp;
7653 tvp = tond.ni_vp;
7654
7655 free_src_acl = FALSE;
7656 attr_cleanup = FALSE;
7657
7658 if (tvp != NULL) {
7659 error = EEXIST;
7660 goto out;
7661 }
7662
7663 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
7664 error = EXDEV;
7665 goto out;
7666 }
7667
7668 #if CONFIG_MACF
7669 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
7670 goto out;
7671 }
7672 #endif
7673 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
7674 goto out;
7675 }
7676
7677 action = KAUTH_VNODE_GENERIC_READ_BITS;
7678 if (data_read_authorised) {
7679 action &= ~KAUTH_VNODE_READ_DATA;
7680 }
7681 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
7682 goto out;
7683 }
7684
7685 /*
7686 * Certain attributes may need to be changed from the source; we ask for
7687 * those here.
7688 */
7689 VATTR_INIT(&va);
7690 VATTR_WANTED(&va, va_uid);
7691 VATTR_WANTED(&va, va_gid);
7692 VATTR_WANTED(&va, va_mode);
7693 VATTR_WANTED(&va, va_flags);
7694 VATTR_WANTED(&va, va_acl);
7695
7696 if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
7697 goto out;
7698 }
7699
7700 VATTR_INIT(&nva);
7701 VATTR_SET(&nva, va_type, v_type);
7702 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
7703 VATTR_SET(&nva, va_acl, va.va_acl);
7704 free_src_acl = TRUE;
7705 }
7706
7707 /* Handle ACL inheritance, initialize vap. */
7708 if (v_type == VLNK) {
7709 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
7710 } else {
7711 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
7712 if (error) {
7713 goto out;
7714 }
7715 attr_cleanup = TRUE;
7716 }
7717
7718 vnop_flags = VNODE_CLONEFILE_DEFAULT;
7719 /*
7720 * We've got initial values for all security parameters.
7721 * If we are superuser, then we can change owners to be the
7722 * same as the source. Both superuser and the owner have default
7723 * WRITE_SECURITY privileges so all other fields can be taken
7724 * from source as well.
7725 */
7726 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
7727 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
7728 VATTR_SET(&nva, va_uid, va.va_uid);
7729 }
7730 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
7731 VATTR_SET(&nva, va_gid, va.va_gid);
7732 }
7733 } else {
7734 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
7735 }
7736
7737 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
7738 VATTR_SET(&nva, va_mode, va.va_mode);
7739 }
7740 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
7741 VATTR_SET(&nva, va_flags,
7742 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
7743 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
7744 }
7745
7746 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
7747
7748 if (!error && tvp) {
7749 int update_flags = 0;
7750 #if CONFIG_FSE
7751 int fsevent;
7752 #endif /* CONFIG_FSE */
7753
7754 #if CONFIG_MACF
7755 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
7756 VNODE_LABEL_CREATE, ctx);
7757 #endif
7758 /*
7759 * If some of the requested attributes weren't handled by the
7760 * VNOP, use our fallback code.
7761 */
7762 if (!VATTR_ALL_SUPPORTED(&va)) {
7763 (void)vnode_setattr_fallback(tvp, &nva, ctx);
7764 }
7765
7766 // Make sure the name & parent pointers are hooked up
7767 if (tvp->v_name == NULL) {
7768 update_flags |= VNODE_UPDATE_NAME;
7769 }
7770 if (tvp->v_parent == NULLVP) {
7771 update_flags |= VNODE_UPDATE_PARENT;
7772 }
7773
7774 if (update_flags) {
7775 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
7776 cnp->cn_namelen, cnp->cn_hash, update_flags);
7777 }
7778
7779 #if CONFIG_FSE
7780 switch (vnode_vtype(tvp)) {
7781 case VLNK:
7782 /* FALLTHRU */
7783 case VREG:
7784 fsevent = FSE_CREATE_FILE;
7785 break;
7786 case VDIR:
7787 fsevent = FSE_CREATE_DIR;
7788 break;
7789 default:
7790 goto out;
7791 }
7792
7793 if (need_fsevent(fsevent, tvp)) {
7794 /*
7795 * The following is a sequence of three explicit events.
7796 * A pair of FSE_CLONE events representing the source and destination
7797 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
7798 * fseventsd may coalesce the destination clone and create events
7799 * into a single event resulting in the following sequence for a client
7800 * FSE_CLONE (src)
7801 * FSE_CLONE | FSE_CREATE (dst)
7802 */
7803 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
7804 FSE_ARG_DONE);
7805 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
7806 FSE_ARG_DONE);
7807 }
7808 #endif /* CONFIG_FSE */
7809 }
7810
7811 out:
7812 if (attr_cleanup) {
7813 vn_attribute_cleanup(&nva, defaulted);
7814 }
7815 if (free_src_acl && va.va_acl) {
7816 kauth_acl_free(va.va_acl);
7817 }
7818 nameidone(&tond);
7819 if (tvp) {
7820 vnode_put(tvp);
7821 }
7822 vnode_put(tdvp);
7823 return error;
7824 }
7825
7826 /*
7827 * clone files or directories, target must not exist.
7828 */
7829 /* ARGSUSED */
7830 int
7831 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7832 __unused int32_t *retval)
7833 {
7834 vnode_t fvp;
7835 struct nameidata fromnd;
7836 int follow;
7837 int error;
7838 vfs_context_t ctx = vfs_context_current();
7839
7840 /* Check that the flags are valid. */
7841 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7842 return EINVAL;
7843 }
7844
7845 AUDIT_ARG(fd, uap->src_dirfd);
7846
7847 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7848 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7849 UIO_USERSPACE, uap->src, ctx);
7850 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
7851 return error;
7852 }
7853
7854 fvp = fromnd.ni_vp;
7855 nameidone(&fromnd);
7856
7857 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7858 uap->flags, ctx);
7859
7860 vnode_put(fvp);
7861 return error;
7862 }
7863
7864 int
7865 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7866 __unused int32_t *retval)
7867 {
7868 vnode_t fvp;
7869 struct fileproc *fp;
7870 int error;
7871 vfs_context_t ctx = vfs_context_current();
7872
7873 /* Check that the flags are valid. */
7874 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7875 return EINVAL;
7876 }
7877
7878 AUDIT_ARG(fd, uap->src_fd);
7879 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7880 if (error) {
7881 return error;
7882 }
7883
7884 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7885 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7886 error = EBADF;
7887 goto out;
7888 }
7889
7890 if ((error = vnode_getwithref(fvp))) {
7891 goto out;
7892 }
7893
7894 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7895
7896 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7897 uap->flags, ctx);
7898
7899 vnode_put(fvp);
7900 out:
7901 file_drop(uap->src_fd);
7902 return error;
7903 }
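/*
 * Illustrative userspace sketch (assuming the <sys/clonefile.h> wrappers):
 * clonefileat()/fclonefileat() above require that the destination not exist
 * and that both paths live on the same clone-capable mount, otherwise
 * EEXIST or EXDEV is returned:
 *
 *	#include <fcntl.h>
 *	#include <sys/clonefile.h>
 *
 *	// clone src to dst without following a trailing symlink on src
 *	if (clonefileat(AT_FDCWD, "src", AT_FDCWD, "dst", CLONE_NOFOLLOW) == -1) {
 *		// EEXIST: dst exists; EXDEV: src and dst on different mounts
 *	}
 */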
7904
7905 static int
7906 rename_submounts_callback(mount_t mp, void *arg)
7907 {
7908 int error = 0;
7909 mount_t pmp = (mount_t)arg;
7910 int prefix_len = strlen(pmp->mnt_vfsstat.f_mntonname);
7911
7912 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
7913 return 0;
7914 }
7915
7916 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
7917 return 0;
7918 }
7919
7920 if ((error = vfs_busy(mp, LK_NOWAIT))) {
7921 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
7922 return -1;
7923 }
7924
7925 int pathlen = MAXPATHLEN;
7926 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
7927 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
7928 }
7929
7930 vfs_unbusy(mp);
7931
7932 return error;
7933 }
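/*
 * Illustrative userspace sketch (assuming Apple's renamex_np()/renameatx_np()
 * extensions declared in <stdio.h>): renameat_internal() below implements
 * VFS_RENAME_SWAP and VFS_RENAME_EXCL, which those wrappers expose as
 * RENAME_SWAP and RENAME_EXCL (the two flags are mutually exclusive):
 *
 *	#include <stdio.h>
 *
 *	// atomically exchange two existing names
 *	if (renamex_np("a", "b", RENAME_SWAP) == -1) {
 *		// ENOENT if "b" is missing; EEXIST if RENAME_EXCL and "b" exists
 *	}
 */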
7934
7935 /*
7936 * Rename files. Source and destination must either both be directories,
7937 * or both not be directories. If target is a directory, it must be empty.
7938 */
7939 /* ARGSUSED */
7940 static int
7941 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7942 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7943 {
7944 if (flags & ~VFS_RENAME_FLAGS_MASK) {
7945 return EINVAL;
7946 }
7947
7948 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
7949 return EINVAL;
7950 }
7951
7952 vnode_t tvp, tdvp;
7953 vnode_t fvp, fdvp;
7954 struct nameidata *fromnd, *tond;
7955 int error;
7956 int do_retry;
7957 int retry_count;
7958 int mntrename;
7959 int need_event;
7960 int need_kpath2;
7961 int has_listeners;
7962 const char *oname = NULL;
7963 char *from_name = NULL, *to_name = NULL;
7964 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
7965 int from_len = 0, to_len = 0;
7966 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
7967 int holding_mntlock;
7968 mount_t locked_mp = NULL;
7969 vnode_t oparent = NULLVP;
7970 #if CONFIG_FSE
7971 fse_info from_finfo, to_finfo;
7972 #endif
7973 int from_truncated = 0, to_truncated = 0;
7974 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
7975 int batched = 0;
7976 struct vnode_attr *fvap, *tvap;
7977 int continuing = 0;
7978 /* carving out a chunk for structs that are too big to be on stack. */
7979 struct {
7980 struct nameidata from_node, to_node;
7981 struct vnode_attr fv_attr, tv_attr;
7982 } * __rename_data;
7983 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
7984 fromnd = &__rename_data->from_node;
7985 tond = &__rename_data->to_node;
7986
7987 holding_mntlock = 0;
7988 do_retry = 0;
7989 retry_count = 0;
7990 retry:
7991 fvp = tvp = NULL;
7992 fdvp = tdvp = NULL;
7993 fvap = tvap = NULL;
7994 mntrename = FALSE;
7995
7996 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
7997 segflg, from, ctx);
7998 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
7999
8000 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8001 segflg, to, ctx);
8002 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8003
8004 continue_lookup:
8005 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8006 if ((error = nameiat(fromnd, fromfd))) {
8007 goto out1;
8008 }
8009 fdvp = fromnd->ni_dvp;
8010 fvp = fromnd->ni_vp;
8011
8012 if (fvp && fvp->v_type == VDIR) {
8013 tond->ni_cnd.cn_flags |= WILLBEDIR;
8014 }
8015 }
8016
8017 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8018 if ((error = nameiat(tond, tofd))) {
8019 /*
8020 * Translate error code for rename("dir1", "dir2/.").
8021 */
8022 if (error == EISDIR && fvp->v_type == VDIR) {
8023 error = EINVAL;
8024 }
8025 goto out1;
8026 }
8027 tdvp = tond->ni_dvp;
8028 tvp = tond->ni_vp;
8029 }
8030
8031 #if DEVELOPMENT || DEBUG
8032 /*
8033 * XXX VSWAP: Check for entitlements or special flag here
8034 * so we can restrict access appropriately.
8035 */
8036 #else /* DEVELOPMENT || DEBUG */
8037
8038 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8039 error = EPERM;
8040 goto out1;
8041 }
8042
8043 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8044 error = EPERM;
8045 goto out1;
8046 }
8047 #endif /* DEVELOPMENT || DEBUG */
8048
8049 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8050 error = ENOENT;
8051 goto out1;
8052 }
8053
8054 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8055 error = EEXIST;
8056 goto out1;
8057 }
8058
8059 batched = vnode_compound_rename_available(fdvp);
8060
8061 #if CONFIG_FSE
8062 need_event = need_fsevent(FSE_RENAME, fdvp);
8063 if (need_event) {
8064 if (fvp) {
8065 get_fse_info(fvp, &from_finfo, ctx);
8066 } else {
8067 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8068 if (error) {
8069 goto out1;
8070 }
8071
8072 fvap = &__rename_data->fv_attr;
8073 }
8074
8075 if (tvp) {
8076 get_fse_info(tvp, &to_finfo, ctx);
8077 } else if (batched) {
8078 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8079 if (error) {
8080 goto out1;
8081 }
8082
8083 tvap = &__rename_data->tv_attr;
8084 }
8085 }
8086 #else
8087 need_event = 0;
8088 #endif /* CONFIG_FSE */
8089
8090 has_listeners = kauth_authorize_fileop_has_listeners();
8091
8092 need_kpath2 = 0;
8093 #if CONFIG_AUDIT
8094 if (AUDIT_RECORD_EXISTS()) {
8095 need_kpath2 = 1;
8096 }
8097 #endif
8098
8099 if (need_event || has_listeners) {
8100 if (from_name == NULL) {
8101 GET_PATH(from_name);
8102 if (from_name == NULL) {
8103 error = ENOMEM;
8104 goto out1;
8105 }
8106 }
8107
8108 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8109
8110 if (from_name_no_firmlink == NULL) {
8111 GET_PATH(from_name_no_firmlink);
8112 if (from_name_no_firmlink == NULL) {
8113 error = ENOMEM;
8114 goto out1;
8115 }
8116 }
8117
8118 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8119 }
8120
8121 if (need_event || need_kpath2 || has_listeners) {
8122 if (to_name == NULL) {
8123 GET_PATH(to_name);
8124 if (to_name == NULL) {
8125 error = ENOMEM;
8126 goto out1;
8127 }
8128 }
8129
8130 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8131
8132 if (to_name_no_firmlink == NULL) {
8133 GET_PATH(to_name_no_firmlink);
8134 if (to_name_no_firmlink == NULL) {
8135 error = ENOMEM;
8136 goto out1;
8137 }
8138 }
8139
8140 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8141 if (to_name && need_kpath2) {
8142 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8143 }
8144 }
8145 if (!fvp) {
8146 /*
8147 * Claim: this check will never reject a valid rename.
8148 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8149 * Suppose fdvp and tdvp are not on the same mount.
8150 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8151 * then you can't move it to within another dir on the same mountpoint.
8152 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8153 *
8154 * If this check passes, then we are safe to pass these vnodes to the same FS.
8155 */
8156 if (fdvp->v_mount != tdvp->v_mount) {
8157 error = EXDEV;
8158 goto out1;
8159 }
8160 goto skipped_lookup;
8161 }
8162
8163 if (!batched) {
8164 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
8165 if (error) {
8166 if (error == ENOENT) {
8167 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8168 /*
8169 * We encountered a race where after doing the namei, tvp stops
8170 * being valid. If so, simply re-drive the rename call from the
8171 * top.
8172 */
8173 do_retry = 1;
8174 retry_count += 1;
8175 }
8176 }
8177 goto out1;
8178 }
8179 }
8180
8181 /*
8182 * If the source and destination are the same (i.e. they're
8183 * links to the same vnode) and the target file system is
8184 * case sensitive, then there is nothing to do.
8185 *
8186 * XXX Come back to this.
8187 */
8188 if (fvp == tvp) {
8189 int pathconf_val;
8190
8191 /*
8192 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8193 * then assume that this file system is case sensitive.
8194 */
8195 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8196 pathconf_val != 0) {
8197 goto out1;
8198 }
8199 }
8200
8201 /*
8202 * Allow the renaming of mount points.
8203 * - target must not exist
8204 * - target must reside in the same directory as source
8205 * - union mounts cannot be renamed
8206 * - "/" cannot be renamed
8207 *
8208 * XXX Handle this in VFS after a continued lookup (if we missed
8209 * in the cache to start off)
8210 *
8211 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8212 * we'll skip past here. The file system is responsible for
8213 * checking that @tvp is not a descendent of @fvp and vice versa
8214 * so it should always return EINVAL if either @tvp or @fvp is the
8215 * root of a volume.
8216 */
8217 if ((fvp->v_flag & VROOT) &&
8218 (fvp->v_type == VDIR) &&
8219 (tvp == NULL) &&
8220 (fvp->v_mountedhere == NULL) &&
8221 (fdvp == tdvp) &&
8222 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8223 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8224 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8225 vnode_t coveredvp;
8226
8227 /* switch fvp to the covered vnode */
8228 coveredvp = fvp->v_mount->mnt_vnodecovered;
8229 if ((vnode_getwithref(coveredvp))) {
8230 error = ENOENT;
8231 goto out1;
8232 }
8233 vnode_put(fvp);
8234
8235 fvp = coveredvp;
8236 mntrename = TRUE;
8237 }
8238 /*
8239 * Check for cross-device rename.
8240 */
8241 if ((fvp->v_mount != tdvp->v_mount) ||
8242 (tvp && (fvp->v_mount != tvp->v_mount))) {
8243 error = EXDEV;
8244 goto out1;
8245 }
8246
8247 /*
8248 * If source is the same as the destination (that is the
8249 * same inode number) then there is nothing to do...
8250 * EXCEPT if the underlying file system supports case
8251 * insensitivity and is case preserving. In this case
8252 * the file system needs to handle the special case of
8253 * getting the same vnode as target (fvp) and source (tvp).
8254 *
8255 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8256 * and _PC_CASE_PRESERVING can have this exception, and they need to
8257 * handle the special case of getting the same vnode as target and
8258 * source. NOTE: Then the target is unlocked going into vnop_rename,
8259 * so not to cause locking problems. There is a single reference on tvp.
8260 *
8261 * NOTE - that fvp == tvp also occurs if they are hard linked and
8262 * that correct behaviour then is just to return success without doing
8263 * anything.
8264 *
8265 * XXX filesystem should take care of this itself, perhaps...
8266 */
8267 if (fvp == tvp && fdvp == tdvp) {
8268 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8269 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8270 fromnd->ni_cnd.cn_namelen)) {
8271 goto out1;
8272 }
8273 }
8274
8275 if (holding_mntlock && fvp->v_mount != locked_mp) {
8276 /*
8277 * we're holding a reference and lock
8278 * on locked_mp, but it no longer matches
8279 * what we want to do... so drop our hold
8280 */
8281 mount_unlock_renames(locked_mp);
8282 mount_drop(locked_mp, 0);
8283 holding_mntlock = 0;
8284 }
8285 if (tdvp != fdvp && fvp->v_type == VDIR) {
8286 /*
8287 * serialize renames that re-shape
8288 * the tree... if holding_mntlock is
8289 * set, then we're ready to go...
8290 * otherwise we
8291 * first need to drop the iocounts
8292 * we picked up, second take the
8293 * lock to serialize the access,
8294 * then finally start the lookup
8295 * process over with the lock held
8296 */
8297 if (!holding_mntlock) {
8298 /*
8299 * need to grab a reference on
8300 * the mount point before we
8301 * drop all the iocounts... once
8302 * the iocounts are gone, the mount
8303 * could follow
8304 */
8305 locked_mp = fvp->v_mount;
8306 mount_ref(locked_mp, 0);
8307
8308 /*
8309 * nameidone has to happen before we vnode_put(tvp)
8310 * since it may need to release the fs_nodelock on the tvp
8311 */
8312 nameidone(tond);
8313
8314 if (tvp) {
8315 vnode_put(tvp);
8316 }
8317 vnode_put(tdvp);
8318
8319 /*
8320 * nameidone has to happen before we vnode_put(fdvp)
8321 * since it may need to release the fs_nodelock on the fvp
8322 */
8323 nameidone(fromnd);
8324
8325 vnode_put(fvp);
8326 vnode_put(fdvp);
8327
8328 mount_lock_renames(locked_mp);
8329 holding_mntlock = 1;
8330
8331 goto retry;
8332 }
8333 } else {
8334 /*
8335 * when we dropped the iocounts to take
8336 * the lock, we allowed the identity of
8337 * the various vnodes to change... if they did,
8338 * we may no longer be dealing with a rename
8339 * that reshapes the tree... once we're holding
8340 * the iocounts, the vnodes can't change type
8341 * so we're free to drop the lock at this point
8342 * and continue on
8343 */
8344 if (holding_mntlock) {
8345 mount_unlock_renames(locked_mp);
8346 mount_drop(locked_mp, 0);
8347 holding_mntlock = 0;
8348 }
8349 }
8350
8351 // save these off so we can later verify that fvp is the same
8352 oname = fvp->v_name;
8353 oparent = fvp->v_parent;
8354
8355 skipped_lookup:
8356 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8357 tdvp, &tvp, &tond->ni_cnd, tvap,
8358 flags, ctx);
8359
8360 if (holding_mntlock) {
8361 /*
8362 * we can drop our serialization
8363 * lock now
8364 */
8365 mount_unlock_renames(locked_mp);
8366 mount_drop(locked_mp, 0);
8367 holding_mntlock = 0;
8368 }
8369 if (error) {
8370 if (error == EDATALESS) {
8371 /*
8372 * If we've been here before, something has gone
8373 * horribly wrong and we should just get out lest
8374 * we spiral around the drain forever.
8375 */
8376 if (flags & VFS_RENAME_DATALESS) {
8377 error = EIO;
8378 goto out1;
8379 }
8380
8381 /*
8382 * The object we're renaming is dataless (or has a
8383 * dataless descendent) and requires materialization
8384 * before the rename occurs. But we're holding the
8385 * mount point's rename lock, so it's not safe to
8386 * make the upcall.
8387 *
8388 * In this case, we release the lock, perform the
8389 * materialization, and start the whole thing over.
8390 */
8391 error = vnode_materialize_dataless_file(fvp,
8392 NAMESPACE_HANDLER_RENAME_OP);
8393
8394 if (error == 0) {
8395 /*
8396 * The next time around we need to tell the
8397 * file system that the materialization has
8398 * been performed.
8399 */
8400 flags |= VFS_RENAME_DATALESS;
8401 do_retry = 1;
8402 }
8403 goto out1;
8404 }
8405 if (error == EKEEPLOOKING) {
8406 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8407 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8408 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8409 }
8410 }
8411
8412 fromnd->ni_vp = fvp;
8413 tond->ni_vp = tvp;
8414
8415 goto continue_lookup;
8416 }
8417
8418 /*
8419 * We may encounter a race in the VNOP where the destination didn't
8420 * exist when we did the namei, but it does by the time we go and
8421 * try to create the entry. In this case, we should re-drive this rename
8422 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8423 * but other filesystems susceptible to this race could return it, too.
8424 */
8425 if (error == ERECYCLE) {
8426 do_retry = 1;
8427 }
8428
8429 /*
8430 * For compound VNOPs, the authorization callback may return
8431 * ENOENT in case of racing hardlink lookups hitting the name
8432 * cache; redrive the lookup in that case.
8433 */
8434 if (batched && error == ENOENT) {
8435 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8436 do_retry = 1;
8437 retry_count += 1;
8438 }
8439 }
8440
8441 goto out1;
8442 }
8443
8444 /* call out to allow 3rd party notification of rename.
8445 * Ignore result of kauth_authorize_fileop call.
8446 */
8447 kauth_authorize_fileop(vfs_context_ucred(ctx),
8448 KAUTH_FILEOP_RENAME,
8449 (uintptr_t)from_name, (uintptr_t)to_name);
8450 if (flags & VFS_RENAME_SWAP) {
8451 kauth_authorize_fileop(vfs_context_ucred(ctx),
8452 KAUTH_FILEOP_RENAME,
8453 (uintptr_t)to_name, (uintptr_t)from_name);
8454 }
8455
8456 #if CONFIG_FSE
8457 if (from_name != NULL && to_name != NULL) {
8458 if (from_truncated || to_truncated) {
8459 // set it here since only the from_finfo gets reported up to user space
8460 from_finfo.mode |= FSE_TRUNCATED_PATH;
8461 }
8462
8463 if (tvap && tvp) {
8464 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8465 }
8466 if (fvap) {
8467 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8468 }
8469
8470 if (tvp) {
8471 add_fsevent(FSE_RENAME, ctx,
8472 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8473 FSE_ARG_FINFO, &from_finfo,
8474 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8475 FSE_ARG_FINFO, &to_finfo,
8476 FSE_ARG_DONE);
8477 if (flags & VFS_RENAME_SWAP) {
8478 /*
8479 * Strictly speaking, swap is the equivalent of
8480 * *three* renames. FSEvents clients should only take
8481 * the events as a hint, so we only bother reporting
8482 * two.
8483 */
8484 add_fsevent(FSE_RENAME, ctx,
8485 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8486 FSE_ARG_FINFO, &to_finfo,
8487 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8488 FSE_ARG_FINFO, &from_finfo,
8489 FSE_ARG_DONE);
8490 }
8491 } else {
8492 add_fsevent(FSE_RENAME, ctx,
8493 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8494 FSE_ARG_FINFO, &from_finfo,
8495 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8496 FSE_ARG_DONE);
8497 }
8498 }
8499 #endif /* CONFIG_FSE */
8500
8501 /*
8502 * update filesystem's mount point data
8503 */
8504 if (mntrename) {
8505 char *cp, *pathend, *mpname;
8506 char * tobuf;
8507 struct mount *mp;
8508 int maxlen;
8509 size_t len = 0;
8510
8511 mp = fvp->v_mountedhere;
8512
8513 if (vfs_busy(mp, LK_NOWAIT)) {
8514 error = EBUSY;
8515 goto out1;
8516 }
8517 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
8518
8519 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8520 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8521 } else {
8522 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8523 }
8524 if (!error) {
8525 /* find current mount point prefix */
8526 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8527 for (cp = pathend; *cp != '\0'; ++cp) {
8528 if (*cp == '/') {
8529 pathend = cp + 1;
8530 }
8531 }
8532 /* find last component of target name */
8533 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8534 if (*cp == '/') {
8535 mpname = cp + 1;
8536 }
8537 }
8538
8539 /* Update f_mntonname of sub mounts */
8540 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8541
8542 /* append name to prefix */
8543 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
8544 bzero(pathend, maxlen);
8545
8546 strlcpy(pathend, mpname, maxlen);
8547 }
8548 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
8549
8550 vfs_unbusy(mp);
8551
8552 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8553 }
8554 /*
8555 * fix up name & parent pointers. note that we first
8556 * check that fvp has the same name/parent pointers it
8557 * had before the rename call... this is a 'weak' check
8558 * at best...
8559 *
8560 * XXX oparent and oname may not be set in the compound vnop case
8561 */
8562 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8563 int update_flags;
8564
8565 update_flags = VNODE_UPDATE_NAME;
8566
8567 if (fdvp != tdvp) {
8568 update_flags |= VNODE_UPDATE_PARENT;
8569 }
8570
8571 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8572 }
8573 out1:
8574 if (to_name != NULL) {
8575 RELEASE_PATH(to_name);
8576 to_name = NULL;
8577 }
8578 if (to_name_no_firmlink != NULL) {
8579 RELEASE_PATH(to_name_no_firmlink);
8580 to_name_no_firmlink = NULL;
8581 }
8582 if (from_name != NULL) {
8583 RELEASE_PATH(from_name);
8584 from_name = NULL;
8585 }
8586 if (from_name_no_firmlink != NULL) {
8587 RELEASE_PATH(from_name_no_firmlink);
8588 from_name_no_firmlink = NULL;
8589 }
8590 if (holding_mntlock) {
8591 mount_unlock_renames(locked_mp);
8592 mount_drop(locked_mp, 0);
8593 holding_mntlock = 0;
8594 }
8595 if (tdvp) {
8596 /*
8597 * nameidone has to happen before we vnode_put(tdvp)
8598 * since it may need to release the fs_nodelock on the tdvp
8599 */
8600 nameidone(tond);
8601
8602 if (tvp) {
8603 vnode_put(tvp);
8604 }
8605 vnode_put(tdvp);
8606 }
8607 if (fdvp) {
8608 /*
8609 * nameidone has to happen before we vnode_put(fdvp)
8610 * since it may need to release the fs_nodelock on the fdvp
8611 */
8612 nameidone(fromnd);
8613
8614 if (fvp) {
8615 vnode_put(fvp);
8616 }
8617 vnode_put(fdvp);
8618 }
8619
8620 /*
8621 * If things changed after we did the namei, then we will re-drive
8622 * this rename call from the top.
8623 */
8624 if (do_retry) {
8625 do_retry = 0;
8626 goto retry;
8627 }
8628
8629 FREE(__rename_data, M_TEMP);
8630 return error;
8631 }
8632
8633 int
8634 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
8635 {
8636 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
8637 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
8638 }
8639
8640 int
8641 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
8642 {
8643 return renameat_internal(
8644 vfs_context_current(),
8645 uap->fromfd, uap->from,
8646 uap->tofd, uap->to,
8647 UIO_USERSPACE, uap->flags);
8648 }
8649
8650 int
8651 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
8652 {
8653 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
8654 uap->tofd, uap->to, UIO_USERSPACE, 0);
8655 }
8656
8657 /*
8658 * Make a directory file.
8659 *
8660 * Returns: 0 Success
8661 * EEXIST
8662 * namei:???
8663 * vnode_authorize:???
8664 * vn_create:???
8665 */
8666 /* ARGSUSED */
8667 static int
8668 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
8669 enum uio_seg segflg)
8670 {
8671 vnode_t vp, dvp;
8672 int error;
8673 int update_flags = 0;
8674 int batched;
8675 struct nameidata nd;
8676
8677 AUDIT_ARG(mode, vap->va_mode);
8678 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
8679 path, ctx);
8680 nd.ni_cnd.cn_flags |= WILLBEDIR;
8681 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
8682
8683 continue_lookup:
8684 error = nameiat(&nd, fd);
8685 if (error) {
8686 return error;
8687 }
8688 dvp = nd.ni_dvp;
8689 vp = nd.ni_vp;
8690
8691 if (vp != NULL) {
8692 error = EEXIST;
8693 goto out;
8694 }
8695
8696 batched = vnode_compound_mkdir_available(dvp);
8697
8698 VATTR_SET(vap, va_type, VDIR);
8699
8700 /*
8701 * XXX
8702 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
8703 * only get EEXIST or EISDIR for existing path components, and not that it could see
8704 * EACCES/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
8705 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
8706 */
8707 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
8708 if (error == EACCES || error == EPERM) {
8709 int error2;
8710
8711 nameidone(&nd);
8712 vnode_put(dvp);
8713 dvp = NULLVP;
8714
8715 /*
8716 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
8717 * rather than EACCES if the target exists.
8718 */
8719 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
8720 path, ctx);
8721 error2 = nameiat(&nd, fd);
8722 if (error2) {
8723 goto out;
8724 } else {
8725 vp = nd.ni_vp;
8726 error = EEXIST;
8727 goto out;
8728 }
8729 }
8730
8731 goto out;
8732 }
8733
8734 /*
8735 * make the directory
8736 */
8737 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
8738 if (error == EKEEPLOOKING) {
8739 nd.ni_vp = vp;
8740 goto continue_lookup;
8741 }
8742
8743 goto out;
8744 }
8745
8746 // Make sure the name & parent pointers are hooked up
8747 if (vp->v_name == NULL) {
8748 update_flags |= VNODE_UPDATE_NAME;
8749 }
8750 if (vp->v_parent == NULLVP) {
8751 update_flags |= VNODE_UPDATE_PARENT;
8752 }
8753
8754 if (update_flags) {
8755 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
8756 }
8757
8758 #if CONFIG_FSE
8759 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
8760 #endif
8761
8762 out:
8763 /*
8764 * nameidone has to happen before we vnode_put(dvp)
8765 * since it may need to release the fs_nodelock on the dvp
8766 */
8767 nameidone(&nd);
8768
8769 if (vp) {
8770 vnode_put(vp);
8771 }
8772 if (dvp) {
8773 vnode_put(dvp);
8774 }
8775
8776 return error;
8777 }
8778
8779 /*
8780 * mkdir_extended: Create a directory; with extended security (ACL).
8781 *
8782 * Parameters: p Process requesting to create the directory
8783 * uap User argument descriptor (see below)
8784 * retval (ignored)
8785 *
8786 * Indirect: uap->path Path of directory to create
8787 * uap->mode Access permissions to set
8788 * uap->xsecurity ACL to set
8789 *
8790 * Returns: 0 Success
8791 * !0 Not success
8792 *
8793 */
8794 int
8795 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8796 {
8797 int ciferror;
8798 kauth_filesec_t xsecdst;
8799 struct vnode_attr va;
8800
8801 AUDIT_ARG(owner, uap->uid, uap->gid);
8802
8803 xsecdst = NULL;
8804 if ((uap->xsecurity != USER_ADDR_NULL) &&
8805 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
8806 return ciferror;
8807 }
8808
8809 VATTR_INIT(&va);
8810 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8811 if (xsecdst != NULL) {
8812 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8813 }
8814
8815 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8816 UIO_USERSPACE);
8817 if (xsecdst != NULL) {
8818 kauth_filesec_free(xsecdst);
8819 }
8820 return ciferror;
8821 }
8822
8823 int
8824 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8825 {
8826 struct vnode_attr va;
8827
8828 VATTR_INIT(&va);
8829 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8830
8831 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8832 UIO_USERSPACE);
8833 }
8834
8835 int
8836 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8837 {
8838 struct vnode_attr va;
8839
8840 VATTR_INIT(&va);
8841 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8842
8843 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8844 UIO_USERSPACE);
8845 }
8846
8847 static int
8848 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
8849 enum uio_seg segflg, int unlink_flags)
8850 {
8851 vnode_t vp, dvp;
8852 int error;
8853 struct nameidata nd;
8854 char *path = NULL;
8855 char *no_firmlink_path = NULL;
8856 int len_path = 0;
8857 int len_no_firmlink_path = 0;
8858 int has_listeners = 0;
8859 int need_event = 0;
8860 int truncated_path = 0;
8861 int truncated_no_firmlink_path = 0;
8862 #if CONFIG_FSE
8863 struct vnode_attr va;
8864 #endif /* CONFIG_FSE */
8865 struct vnode_attr *vap = NULL;
8866 int restart_count = 0;
8867 int batched;
8868
8869 int restart_flag;
8870
8871 /*
8872 * This loop exists to restart rmdir in the unlikely case that two
8873 * processes are simultaneously trying to remove the same directory
8874 * containing orphaned appleDouble files.
8875 */
8876 do {
8877 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
8878 segflg, dirpath, ctx);
8879 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
8880 continue_lookup:
8881 restart_flag = 0;
8882 vap = NULL;
8883
8884 error = nameiat(&nd, fd);
8885 if (error) {
8886 return error;
8887 }
8888
8889 dvp = nd.ni_dvp;
8890 vp = nd.ni_vp;
8891
8892 if (vp) {
8893 batched = vnode_compound_rmdir_available(vp);
8894
8895 if (vp->v_flag & VROOT) {
8896 /*
8897 * The root of a mounted filesystem cannot be deleted.
8898 */
8899 error = EBUSY;
8900 goto out;
8901 }
8902
8903 #if DEVELOPMENT || DEBUG
8904 /*
8905 * XXX VSWAP: Check for entitlements or special flag here
8906 * so we can restrict access appropriately.
8907 */
8908 #else /* DEVELOPMENT || DEBUG */
8909
8910 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
8911 error = EPERM;
8912 goto out;
8913 }
8914 #endif /* DEVELOPMENT || DEBUG */
8915
8916 /*
8917 * Removed a check here; we used to abort if vp's vid
8918 * was not the same as what we'd seen the last time around.
8919 * I do not think that check was valid, because if we retry
8920 * and all dirents are gone, the directory could legitimately
8921 * be recycled but still be present in a situation where we would
8922 * have had permission to delete. Therefore, we won't make
8923 * an effort to preserve that check now that we may not have a
8924 * vp here.
8925 */
8926
8927 if (!batched) {
8928 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
8929 if (error) {
8930 if (error == ENOENT) {
8931 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8932 restart_flag = 1;
8933 restart_count += 1;
8934 }
8935 }
8936 goto out;
8937 }
8938 }
8939 } else {
8940 batched = 1;
8941
8942 if (!vnode_compound_rmdir_available(dvp)) {
8943 panic("No error, but no compound rmdir?");
8944 }
8945 }
8946
8947 #if CONFIG_FSE
8948 fse_info finfo;
8949
8950 need_event = need_fsevent(FSE_DELETE, dvp);
8951 if (need_event) {
8952 if (!batched) {
8953 get_fse_info(vp, &finfo, ctx);
8954 } else {
8955 error = vfs_get_notify_attributes(&va);
8956 if (error) {
8957 goto out;
8958 }
8959
8960 vap = &va;
8961 }
8962 }
8963 #endif
8964 has_listeners = kauth_authorize_fileop_has_listeners();
8965 if (need_event || has_listeners) {
8966 if (path == NULL) {
8967 GET_PATH(path);
8968 if (path == NULL) {
8969 error = ENOMEM;
8970 goto out;
8971 }
8972 }
8973
8974 len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
8975
8976 if (no_firmlink_path == NULL) {
8977 GET_PATH(no_firmlink_path);
8978 if (no_firmlink_path == NULL) {
8979 error = ENOMEM;
8980 goto out;
8981 }
8982 }
8983
8984 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
8985 #if CONFIG_FSE
8986 if (truncated_no_firmlink_path) {
8987 finfo.mode |= FSE_TRUNCATED_PATH;
8988 }
8989 #endif
8990 }
8991
8992 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8993 nd.ni_vp = vp;
8994 if (vp == NULLVP) {
8995 /* Couldn't find a vnode */
8996 goto out;
8997 }
8998
8999 if (error == EKEEPLOOKING) {
9000 goto continue_lookup;
9001 } else if (batched && error == ENOENT) {
9002 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9003 /*
9004 * For compound VNOPs, the authorization callback
9005 * may return ENOENT in case of racing hard link lookups;
9006 * redrive the lookup in that case.
9007 */
9008 restart_flag = 1;
9009 restart_count += 1;
9010 goto out;
9011 }
9012 }
9013
9014 /*
9015 * XXX There's no provision for passing flags
9016 * to VNOP_RMDIR(). So, if vn_rmdir() fails
9017 * because it's not empty, then we try again
9018 * with VNOP_REMOVE(), passing in a special
9019 * flag that clever file systems will know
9020 * how to handle.
9021 */
9022 if (error == ENOTEMPTY &&
9023 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
9024 /*
9025 * If this fails, we want to keep the original
9026 * error.
9027 */
9028 if (vn_remove(dvp, &vp, &nd,
9029 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
9030 error = 0;
9031 }
9032 }
9033
9034 #if CONFIG_APPLEDOUBLE
9035 /*
9036 * Special case to remove orphaned AppleDouble
9037 * files. I don't like putting this in the kernel,
9038 * but carbon does not like putting this in carbon either,
9039 * so here we are.
9040 */
9041 if (error == ENOTEMPTY) {
9042 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
9043 if (ad_error == EBUSY) {
9044 error = ad_error;
9045 goto out;
9046 }
9047
9048
9049 /*
9050 * Assuming everything went well, we will try the RMDIR again
9051 */
9052 if (!ad_error) {
9053 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
9054 }
9055 }
9056 #endif /* CONFIG_APPLEDOUBLE */
9057 /*
9058 * Call out to allow 3rd party notification of delete.
9059 * Ignore result of kauth_authorize_fileop call.
9060 */
9061 if (!error) {
9062 if (has_listeners) {
9063 kauth_authorize_fileop(vfs_context_ucred(ctx),
9064 KAUTH_FILEOP_DELETE,
9065 (uintptr_t)vp,
9066 (uintptr_t)path);
9067 }
9068
9069 if (vp->v_flag & VISHARDLINK) {
9070 // see the comment in unlink1() about why we update
9071 // the parent of a hard link when it is removed
9072 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
9073 }
9074
9075 #if CONFIG_FSE
9076 if (need_event) {
9077 if (vap) {
9078 vnode_get_fse_info_from_vap(vp, &finfo, vap);
9079 }
9080 add_fsevent(FSE_DELETE, ctx,
9081 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
9082 FSE_ARG_FINFO, &finfo,
9083 FSE_ARG_DONE);
9084 }
9085 #endif
9086 }
9087
9088 out:
9089 if (path != NULL) {
9090 RELEASE_PATH(path);
9091 path = NULL;
9092 }
9093
9094 if (no_firmlink_path != NULL) {
9095 RELEASE_PATH(no_firmlink_path);
9096 no_firmlink_path = NULL;
9097 }
9098
9099 /*
9100 * nameidone has to happen before we vnode_put(dvp)
9101 * since it may need to release the fs_nodelock on the dvp
9102 */
9103 nameidone(&nd);
9104 vnode_put(dvp);
9105
9106 if (vp) {
9107 vnode_put(vp);
9108 }
9109
9110 if (restart_flag == 0) {
9111 wakeup_one((caddr_t)vp);
9112 return error;
9113 }
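/*
 * A restart was requested (e.g. orphaned AppleDouble cleanup raced with
 * another rmdir of the same directory); sleep briefly before retrying.
 */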
9114 tsleep(vp, PVFS, "rm AD", 1);
9115 } while (restart_flag != 0);
9116
9117 return error;
9118 }
9119
9120 /*
9121 * Remove a directory file.
9122 */
9123 /* ARGSUSED */
9124 int
9125 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9126 {
9127 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9128 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9129 }
9130
9131 /* Get direntry length padded to 8 byte alignment */
9132 #define DIRENT64_LEN(namlen) \
9133 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
9134
9135 /* Get dirent length padded to 4 byte alignment */
9136 #define DIRENT_LEN(namelen) \
9137 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
9138
9139 /* Get the end of this dirent */
9140 #define DIRENT_END(dep) \
9141 (((char *)(dep)) + (dep)->d_reclen - 1)
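/*
 * Note on the three macros above (assumes the struct layouts in <sys/dirent.h>):
 * struct direntry and struct dirent declare d_name at its maximum size
 * (MAXPATHLEN and __DARWIN_MAXNAMLEN + 1 bytes respectively), so the *_LEN
 * macros subtract the unused tail of that array, keep room for the actual
 * name plus its NUL terminator, and round up to 8- or 4-byte alignment.
 */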
9142
9143 errno_t
9144 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9145 int *numdirent, vfs_context_t ctxp)
9146 {
9147 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9148 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9149 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9150 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9151 } else {
9152 size_t bufsize;
9153 void * bufptr;
9154 uio_t auio;
9155 struct direntry *entry64;
9156 struct dirent *dep;
9157 int bytesread;
9158 int error;
9159
9160 /*
9161 * We're here because the underlying file system does not
9162 * support direntries (or the mount denies extended readdir), so we
9163 * must fall back to dirents and convert them to direntries.
9164 *
9165 * Our kernel buffer needs to be smaller since re-packing will
9166 * expand each dirent. The worst case (when the name length
9167 * is 3 or less) corresponds to a struct direntry size of 32
9168 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9169 * (4-byte aligned). So having a buffer that is 3/8 the size
9170 * will prevent us from reading more than we can pack.
9171 *
9172 * Since this buffer is wired memory, we will limit the
9173 * buffer size to a maximum of 32K. We would really like to
9174 * use 32K in the MIN(), but we use magic number 87371 to
9175 * prevent uio_resid() * 3 / 8 from overflowing.
9176 */
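/* For reference: 87371 * 3 / 8 == 32764, i.e. just under the 32K limit mentioned above. */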
9177 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9178 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
9179 if (bufptr == NULL) {
9180 return ENOMEM;
9181 }
9182
9183 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9184 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9185 auio->uio_offset = uio->uio_offset;
9186
9187 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9188
9189 dep = (struct dirent *)bufptr;
9190 bytesread = bufsize - uio_resid(auio);
9191
9192 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
9193 M_TEMP, M_WAITOK);
9194 /*
9195 * Convert all the entries and copy them out to user's buffer.
9196 */
9197 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9198 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9199
9200 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9201 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9202 printf("%s: %s: Bad dirent received from directory %s\n", __func__,
9203 vp->v_mount->mnt_vfsstat.f_mntonname,
9204 vp->v_name ? vp->v_name : "<unknown>");
9205 error = EIO;
9206 break;
9207 }
9208
9209 bzero(entry64, enbufsize);
9210 /* Convert a dirent to a dirent64. */
9211 entry64->d_ino = dep->d_ino;
9212 entry64->d_seekoff = 0;
9213 entry64->d_reclen = enbufsize;
9214 entry64->d_namlen = dep->d_namlen;
9215 entry64->d_type = dep->d_type;
9216 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9217
9218 /* Move to next entry. */
9219 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9220
9221 /* Copy entry64 to user's buffer. */
9222 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9223 }
9224
9225 /* Update the real offset using the offset we got from VNOP_READDIR. */
9226 if (error == 0) {
9227 uio->uio_offset = auio->uio_offset;
9228 }
9229 uio_free(auio);
9230 FREE(bufptr, M_TEMP);
9231 FREE(entry64, M_TEMP);
9232 return error;
9233 }
9234 }
9235
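/* Upper bound (128 MiB) on the caller-supplied buffer size accepted by getdirentries_common() below. */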
9236 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9237
9238 /*
9239 * Read a block of directory entries in a file system independent format.
9240 */
9241 static int
9242 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
9243 off_t *offset, int *eofflag, int flags)
9244 {
9245 vnode_t vp;
9246 struct vfs_context context = *vfs_context_current(); /* local copy */
9247 struct fileproc *fp;
9248 uio_t auio;
9249 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9250 off_t loff;
9251 int error, numdirent;
9252 char uio_buf[UIO_SIZEOF(1)];
9253
9254 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
9255 if (error) {
9256 return error;
9257 }
9258 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
9259 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9260 error = EBADF;
9261 goto out;
9262 }
9263
9264 if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
9265 bufsize = GETDIRENTRIES_MAXBUFSIZE;
9266 }
9267
9268 #if CONFIG_MACF
9269 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
9270 if (error) {
9271 goto out;
9272 }
9273 #endif
9274 if ((error = vnode_getwithref(vp))) {
9275 goto out;
9276 }
9277 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9278
9279 unionread:
9280 if (vp->v_type != VDIR) {
9281 (void)vnode_put(vp);
9282 error = EINVAL;
9283 goto out;
9284 }
9285
9286 #if CONFIG_MACF
9287 error = mac_vnode_check_readdir(&context, vp);
9288 if (error != 0) {
9289 (void)vnode_put(vp);
9290 goto out;
9291 }
9292 #endif /* MAC */
9293
9294 loff = fp->f_fglob->fg_offset;
9295 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9296 uio_addiov(auio, bufp, bufsize);
9297
9298 if (flags & VNODE_READDIR_EXTENDED) {
9299 error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
9300 fp->f_fglob->fg_offset = uio_offset(auio);
9301 } else {
9302 error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
9303 fp->f_fglob->fg_offset = uio_offset(auio);
9304 }
9305 if (error) {
9306 (void)vnode_put(vp);
9307 goto out;
9308 }
9309
9310 if ((user_ssize_t)bufsize == uio_resid(auio)) {
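/*
 * Nothing was transferred into the caller's buffer; if this is a union
 * mount, traverse to the underlying directory and retry the read there.
 */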
9311 if (union_dircheckp) {
9312 error = union_dircheckp(&vp, fp, &context);
9313 if (error == -1) {
9314 goto unionread;
9315 }
9316 if (error) {
9317 (void)vnode_put(vp);
9318 goto out;
9319 }
9320 }
9321
9322 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
9323 struct vnode *tvp = vp;
9324 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
9325 vnode_ref(vp);
9326 fp->f_fglob->fg_data = (caddr_t) vp;
9327 fp->f_fglob->fg_offset = 0;
9328 vnode_rele(tvp);
9329 vnode_put(tvp);
9330 goto unionread;
9331 }
9332 vp = tvp;
9333 }
9334 }
9335
9336 vnode_put(vp);
9337 if (offset) {
9338 *offset = loff;
9339 }
9340
9341 *bytesread = bufsize - uio_resid(auio);
9342 out:
9343 file_drop(fd);
9344 return error;
9345 }
9346
9347
9348 int
9349 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9350 {
9351 off_t offset;
9352 ssize_t bytesread;
9353 int error, eofflag;
9354
9355 AUDIT_ARG(fd, uap->fd);
9356 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9357 &bytesread, &offset, &eofflag, 0);
9358
9359 if (error == 0) {
9360 if (proc_is64bit(p)) {
9361 user64_long_t base = (user64_long_t)offset;
9362 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9363 } else {
9364 user32_long_t base = (user32_long_t)offset;
9365 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9366 }
9367 *retval = bytesread;
9368 }
9369 return error;
9370 }
9371
9372 int
9373 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9374 {
9375 off_t offset;
9376 ssize_t bytesread;
9377 int error, eofflag;
9378 user_size_t bufsize;
9379
9380 AUDIT_ARG(fd, uap->fd);
9381
9382 /*
9383 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9384 * then the kernel carves out the last 4 bytes to return extended
9385 * information to userspace (namely whether we reached EOF with this call).
9386 */
9387 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9388 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9389 } else {
9390 bufsize = uap->bufsize;
9391 }
9392
9393 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9394 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9395
9396 if (error == 0) {
9397 *retval = bytesread;
9398 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9399
9400 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9401 getdirentries64_flags_t flags = 0;
9402 if (eofflag) {
9403 flags |= GETDIRENTRIES64_EOF;
9404 }
9405 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9406 sizeof(flags));
9407 }
9408 }
9409 return error;
9410 }
9411
9412
9413 /*
9414 * Set the mode mask for creation of filesystem nodes.
9415 * XXX implement xsecurity
9416 */
9417 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9418 static int
9419 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9420 {
9421 struct filedesc *fdp;
9422
9423 AUDIT_ARG(mask, newmask);
9424 proc_fdlock(p);
9425 fdp = p->p_fd;
9426 *retval = fdp->fd_cmask;
9427 fdp->fd_cmask = newmask & ALLPERMS;
9428 proc_fdunlock(p);
9429 return 0;
9430 }
9431
9432 /*
9433 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9434 *
9435 * Parameters: p Process requesting to set the umask
9436 * uap User argument descriptor (see below)
9437 * retval umask of the process (parameter p)
9438 *
9439 * Indirect: uap->newmask umask to set
9440 * uap->xsecurity ACL to set
9441 *
9442 * Returns: 0 Success
9443 * !0 Not success
9444 *
9445 */
9446 int
9447 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9448 {
9449 int ciferror;
9450 kauth_filesec_t xsecdst;
9451
9452 xsecdst = KAUTH_FILESEC_NONE;
9453 if (uap->xsecurity != USER_ADDR_NULL) {
9454 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
9455 return ciferror;
9456 }
9457 } else {
9458 xsecdst = KAUTH_FILESEC_NONE;
9459 }
9460
9461 ciferror = umask1(p, uap->newmask, xsecdst, retval);
9462
9463 if (xsecdst != KAUTH_FILESEC_NONE) {
9464 kauth_filesec_free(xsecdst);
9465 }
9466 return ciferror;
9467 }
9468
9469 int
9470 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9471 {
9472 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9473 }
9474
9475 /*
9476 * Void all references to file by ripping underlying filesystem
9477 * away from vnode.
9478 */
9479 /* ARGSUSED */
9480 int
9481 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
9482 {
9483 vnode_t vp;
9484 struct vnode_attr va;
9485 vfs_context_t ctx = vfs_context_current();
9486 int error;
9487 struct nameidata nd;
9488
9489 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
9490 uap->path, ctx);
9491 error = namei(&nd);
9492 if (error) {
9493 return error;
9494 }
9495 vp = nd.ni_vp;
9496
9497 nameidone(&nd);
9498
9499 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
9500 error = ENOTSUP;
9501 goto out;
9502 }
9503
9504 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
9505 error = EBUSY;
9506 goto out;
9507 }
9508
9509 #if CONFIG_MACF
9510 error = mac_vnode_check_revoke(ctx, vp);
9511 if (error) {
9512 goto out;
9513 }
9514 #endif
9515
9516 VATTR_INIT(&va);
9517 VATTR_WANTED(&va, va_uid);
9518 if ((error = vnode_getattr(vp, &va, ctx))) {
9519 goto out;
9520 }
9521 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
9522 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
9523 goto out;
9524 }
9525 if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
9526 VNOP_REVOKE(vp, REVOKEALL, ctx);
9527 }
9528 out:
9529 vnode_put(vp);
9530 return error;
9531 }
9532
9533
9534 /*
9535 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
9536 * The following system calls are designed to support features
9537 * which are specific to the HFS & HFS Plus volume formats
9538 */
9539
9540
9541 /*
9542 * Obtain attribute information on objects in a directory while enumerating
9543 * the directory.
9544 */
9545 /* ARGSUSED */
9546 int
9547 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
9548 {
9549 vnode_t vp;
9550 struct fileproc *fp;
9551 uio_t auio = NULL;
9552 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9553 uint32_t count = 0, savecount = 0;
9554 uint32_t newstate = 0;
9555 int error, eofflag;
9556 uint32_t loff = 0;
9557 struct attrlist attributelist;
9558 vfs_context_t ctx = vfs_context_current();
9559 int fd = uap->fd;
9560 char uio_buf[UIO_SIZEOF(1)];
9561 kauth_action_t action;
9562
9563 AUDIT_ARG(fd, fd);
9564
9565 /* Get the attributes into kernel space */
9566 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
9567 return error;
9568 }
9569 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
9570 return error;
9571 }
9572 savecount = count;
9573 if ((error = fp_getfvp(p, fd, &fp, &vp))) {
9574 return error;
9575 }
9576 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
9577 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9578 error = EBADF;
9579 goto out;
9580 }
9581
9582
9583 #if CONFIG_MACF
9584 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
9585 fp->f_fglob);
9586 if (error) {
9587 goto out;
9588 }
9589 #endif
9590
9591
9592 if ((error = vnode_getwithref(vp))) {
9593 goto out;
9594 }
9595
9596 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9597
9598 unionread:
9599 if (vp->v_type != VDIR) {
9600 (void)vnode_put(vp);
9601 error = EINVAL;
9602 goto out;
9603 }
9604
9605 #if CONFIG_MACF
9606 error = mac_vnode_check_readdir(ctx, vp);
9607 if (error != 0) {
9608 (void)vnode_put(vp);
9609 goto out;
9610 }
9611 #endif /* MAC */
9612
9613 /* set up the uio structure which will contain the user's return buffer */
9614 loff = fp->f_fglob->fg_offset;
9615 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9616 uio_addiov(auio, uap->buffer, uap->buffersize);
9617
9618 /*
9619 * If the only item requested is file names, we can let that past with
9620 * just LIST_DIRECTORY. If they want any other attributes, that means
9621 * they need SEARCH as well.
9622 */
9623 action = KAUTH_VNODE_LIST_DIRECTORY;
9624 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
9625 attributelist.fileattr || attributelist.dirattr) {
9626 action |= KAUTH_VNODE_SEARCH;
9627 }
9628
9629 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
9630 /* Believe it or not, uap->options only has 32-bits of valid
9631 * info, so truncate before extending again */
9632
9633 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
9634 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
9635 }
9636
9637 if (error) {
9638 (void) vnode_put(vp);
9639 goto out;
9640 }
9641
9642 /*
9643 * If we've got the last entry of a directory in a union mount
9644 * then reset the eofflag and pretend there's still more to come.
9645 * The next call will again set eofflag and the buffer will be empty,
9646 * so traverse to the underlying directory and do the directory
9647 * read there.
9648 */
9649 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
9650 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
9651 eofflag = 0;
9652 } else { // Empty buffer
9653 struct vnode *tvp = vp;
9654 if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
9655 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
9656 fp->f_fglob->fg_data = (caddr_t) vp;
9657 fp->f_fglob->fg_offset = 0; // reset index for new dir
9658 count = savecount;
9659 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
9660 vnode_put(tvp);
9661 goto unionread;
9662 }
9663 vp = tvp;
9664 }
9665 }
9666
9667 (void)vnode_put(vp);
9668
9669 if (error) {
9670 goto out;
9671 }
9672 fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
9673
9674 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
9675 goto out;
9676 }
9677 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
9678 goto out;
9679 }
9680 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
9681 goto out;
9682 }
9683
9684 *retval = eofflag; /* similar to getdirentries */
9685 error = 0;
9686 out:
9687 file_drop(fd);
9688 return error; /* return error earlier, a retval of 0 or 1 now */
9689 } /* end of getdirentriesattr system call */
9690
9691 /*
9692 * Exchange data between two files
9693 */
9694
9695 /* ARGSUSED */
9696 int
9697 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
9698 {
9699 struct nameidata fnd, snd;
9700 vfs_context_t ctx = vfs_context_current();
9701 vnode_t fvp;
9702 vnode_t svp;
9703 int error;
9704 u_int32_t nameiflags;
9705 char *fpath = NULL;
9706 char *spath = NULL;
9707 int flen = 0, slen = 0;
9708 int from_truncated = 0, to_truncated = 0;
9709 #if CONFIG_FSE
9710 fse_info f_finfo, s_finfo;
9711 #endif
9712
9713 nameiflags = 0;
9714 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
9715 nameiflags |= FOLLOW;
9716 }
9717
9718 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
9719 UIO_USERSPACE, uap->path1, ctx);
9720
9721 error = namei(&fnd);
9722 if (error) {
9723 goto out2;
9724 }
9725
9726 nameidone(&fnd);
9727 fvp = fnd.ni_vp;
9728
9729 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
9730 UIO_USERSPACE, uap->path2, ctx);
9731
9732 error = namei(&snd);
9733 if (error) {
9734 vnode_put(fvp);
9735 goto out2;
9736 }
9737 nameidone(&snd);
9738 svp = snd.ni_vp;
9739
9740 /*
9741 * if the files are the same, return EINVAL
9742 */
9743 if (svp == fvp) {
9744 error = EINVAL;
9745 goto out;
9746 }
9747
9748 /*
9749 * if the files are on different volumes, return an error
9750 */
9751 if (svp->v_mount != fvp->v_mount) {
9752 error = EXDEV;
9753 goto out;
9754 }
9755
9756 /* If they're not files, return an error */
9757 if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
9758 error = EINVAL;
9759 goto out;
9760 }
9761
9762 #if CONFIG_MACF
9763 error = mac_vnode_check_exchangedata(ctx,
9764 fvp, svp);
9765 if (error) {
9766 goto out;
9767 }
9768 #endif
9769 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
9770 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
9771 goto out;
9772 }
9773
9774 if (
9775 #if CONFIG_FSE
9776 need_fsevent(FSE_EXCHANGE, fvp) ||
9777 #endif
9778 kauth_authorize_fileop_has_listeners()) {
9779 GET_PATH(fpath);
9780 GET_PATH(spath);
9781 if (fpath == NULL || spath == NULL) {
9782 error = ENOMEM;
9783 goto out;
9784 }
9785
9786 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
9787 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
9788
9789 #if CONFIG_FSE
9790 get_fse_info(fvp, &f_finfo, ctx);
9791 get_fse_info(svp, &s_finfo, ctx);
9792 if (from_truncated || to_truncated) {
9793 // set it here since only the f_finfo gets reported up to user space
9794 f_finfo.mode |= FSE_TRUNCATED_PATH;
9795 }
9796 #endif
9797 }
9798 /* Ok, make the call */
9799 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
9800
9801 if (error == 0) {
9802 const char *tmpname;
9803
9804 if (fpath != NULL && spath != NULL) {
9805 /* call out to allow 3rd party notification of exchangedata.
9806 * Ignore result of kauth_authorize_fileop call.
9807 */
9808 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
9809 (uintptr_t)fpath, (uintptr_t)spath);
9810 }
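/*
 * The data has been exchanged, so swap the cached identities (name and
 * parent) of the two vnodes under the name cache lock to keep the
 * namecache consistent.
 */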
9811 name_cache_lock();
9812
9813 tmpname = fvp->v_name;
9814 fvp->v_name = svp->v_name;
9815 svp->v_name = tmpname;
9816
9817 if (fvp->v_parent != svp->v_parent) {
9818 vnode_t tmp;
9819
9820 tmp = fvp->v_parent;
9821 fvp->v_parent = svp->v_parent;
9822 svp->v_parent = tmp;
9823 }
9824 name_cache_unlock();
9825
9826 #if CONFIG_FSE
9827 if (fpath != NULL && spath != NULL) {
9828 add_fsevent(FSE_EXCHANGE, ctx,
9829 FSE_ARG_STRING, flen, fpath,
9830 FSE_ARG_FINFO, &f_finfo,
9831 FSE_ARG_STRING, slen, spath,
9832 FSE_ARG_FINFO, &s_finfo,
9833 FSE_ARG_DONE);
9834 }
9835 #endif
9836 }
9837
9838 out:
9839 if (fpath != NULL) {
9840 RELEASE_PATH(fpath);
9841 }
9842 if (spath != NULL) {
9843 RELEASE_PATH(spath);
9844 }
9845 vnode_put(svp);
9846 vnode_put(fvp);
9847 out2:
9848 return error;
9849 }
9850
9851 /*
9852 * Return (in MB) the amount of freespace on the given vnode's volume.
9853 */
9854 uint32_t freespace_mb(vnode_t vp);
9855
9856 uint32_t
9857 freespace_mb(vnode_t vp)
9858 {
9859 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
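/* f_bavail (free blocks) * f_bsize (block size) gives free bytes; >> 20 converts bytes to MiB. */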
9860 return ((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9861 vp->v_mount->mnt_vfsstat.f_bsize) >> 20;
9862 }
9863
9864 #if CONFIG_SEARCHFS
9865
9866 /* ARGSUSED */
9867
9868 int
9869 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
9870 {
9871 vnode_t vp, tvp;
9872 int i, error = 0;
9873 int fserror = 0;
9874 struct nameidata nd;
9875 struct user64_fssearchblock searchblock;
9876 struct searchstate *state;
9877 struct attrlist *returnattrs;
9878 struct timeval timelimit;
9879 void *searchparams1, *searchparams2;
9880 uio_t auio = NULL;
9881 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9882 uint32_t nummatches;
9883 int mallocsize;
9884 uint32_t nameiflags;
9885 vfs_context_t ctx = vfs_context_current();
9886 char uio_buf[UIO_SIZEOF(1)];
9887
9888 /* Start by copying in fsearchblock parameter list */
9889 if (IS_64BIT_PROCESS(p)) {
9890 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
9891 timelimit.tv_sec = searchblock.timelimit.tv_sec;
9892 timelimit.tv_usec = searchblock.timelimit.tv_usec;
9893 } else {
9894 struct user32_fssearchblock tmp_searchblock;
9895
9896 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
9897 // munge into 64-bit version
9898 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
9899 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
9900 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
9901 searchblock.maxmatches = tmp_searchblock.maxmatches;
9902 /*
9903 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
9904 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
9905 */
9906 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
9907 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
9908 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
9909 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
9910 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
9911 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
9912 searchblock.searchattrs = tmp_searchblock.searchattrs;
9913 }
9914 if (error) {
9915 return error;
9916 }
9917
9918 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
9919 */
9920 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
9921 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
9922 return EINVAL;
9923 }
9924
9925 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
9926 /* It all has to go into local memory and it's not that big, so we might as well put it all together. */
9927 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated */
9928 /* block. */
9929 /* */
9930 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
9931 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
9932 /* assumes the size is still 556 bytes it will continue to work */
9933
9934 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
9935 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
9936
9937 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
9938
9939 /* Now set up the various pointers to the correct place in our newly allocated memory */
9940
9941 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
9942 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
9943 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
9944
9945 /* Now copy in the stuff given our local variables. */
9946
9947 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
9948 goto freeandexit;
9949 }
9950
9951 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
9952 goto freeandexit;
9953 }
9954
9955 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
9956 goto freeandexit;
9957 }
9958
9959 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
9960 goto freeandexit;
9961 }
9962
9963 /*
9964 * When searching a union mount, need to set the
9965 * start flag at the first call on each layer to
9966 * reset state for the new volume.
9967 */
9968 if (uap->options & SRCHFS_START) {
9969 state->ss_union_layer = 0;
9970 } else {
9971 uap->options |= state->ss_union_flags;
9972 }
9973 state->ss_union_flags = 0;
9974
9975 /*
9976 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
9977 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
9978 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
9979 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
9980 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
9981 */
9982
9983 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
9984 attrreference_t* string_ref;
9985 u_int32_t* start_length;
9986 user64_size_t param_length;
9987
9988 /* validate searchparams1 */
9989 param_length = searchblock.sizeofsearchparams1;
9990 /* skip the word that specifies length of the buffer */
9991 start_length = (u_int32_t*) searchparams1;
9992 start_length = start_length + 1;
9993 string_ref = (attrreference_t*) start_length;
9994
9995 /* ensure no negative offsets or too big offsets */
9996 if (string_ref->attr_dataoffset < 0) {
9997 error = EINVAL;
9998 goto freeandexit;
9999 }
10000 if (string_ref->attr_length > MAXPATHLEN) {
10001 error = EINVAL;
10002 goto freeandexit;
10003 }
10004
10005 /* Check for pointer overflow in the string ref */
10006 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
10007 error = EINVAL;
10008 goto freeandexit;
10009 }
10010
10011 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
10012 error = EINVAL;
10013 goto freeandexit;
10014 }
10015 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
10016 error = EINVAL;
10017 goto freeandexit;
10018 }
10019 }
10020
10021 /* set up the uio structure which will contain the user's return buffer */
10022 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10023 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
10024
10025 nameiflags = 0;
10026 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10027 nameiflags |= FOLLOW;
10028 }
10029 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
10030 UIO_USERSPACE, uap->path, ctx);
10031
10032 error = namei(&nd);
10033 if (error) {
10034 goto freeandexit;
10035 }
10036 vp = nd.ni_vp;
10037 nameidone(&nd);
10038
10039 /*
10040 * Switch to the root vnode for the volume
10041 */
10042 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
10043 vnode_put(vp);
10044 if (error) {
10045 goto freeandexit;
10046 }
10047 vp = tvp;
10048
10049 /*
10050 * If it's a union mount, the path lookup takes
10051 * us to the top layer. But we may need to descend
10052 * to a lower layer. For non-union mounts the layer
10053 * is always zero.
10054 */
10055 for (i = 0; i < (int) state->ss_union_layer; i++) {
10056 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
10057 break;
10058 }
10059 tvp = vp;
10060 vp = vp->v_mount->mnt_vnodecovered;
10061 if (vp == NULL) {
10062 vnode_put(tvp);
10063 error = ENOENT;
10064 goto freeandexit;
10065 }
10066 error = vnode_getwithref(vp);
10067 vnode_put(tvp);
10068 if (error) {
10069 goto freeandexit;
10070 }
10071 }
10072
10073 #if CONFIG_MACF
10074 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
10075 if (error) {
10076 vnode_put(vp);
10077 goto freeandexit;
10078 }
10079 #endif
10080
10081
10082 /*
10083 * If searchblock.maxmatches == 0, then skip the search. This has happened
10084 * before and sometimes the underlying code doesn't deal with it well.
10085 */
10086 if (searchblock.maxmatches == 0) {
10087 nummatches = 0;
10088 goto saveandexit;
10089 }
10090
10091 /*
10092 * All right, we have everything we need, so let's make the call.
10093 *
10094 * We keep special track of the return value from the file system:
10095 * EAGAIN is an acceptable error condition that shouldn't keep us
10096 * from copying out any results...
10097 */
10098
10099 fserror = VNOP_SEARCHFS(vp,
10100 searchparams1,
10101 searchparams2,
10102 &searchblock.searchattrs,
10103 (u_long)searchblock.maxmatches,
10104 &timelimit,
10105 returnattrs,
10106 &nummatches,
10107 (u_long)uap->scriptcode,
10108 (u_long)uap->options,
10109 auio,
10110 (struct searchstate *) &state->ss_fsstate,
10111 ctx);
10112
10113 /*
10114 * If it's a union mount we need to be called again
10115 * to search the mounted-on filesystem.
10116 */
10117 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
10118 state->ss_union_flags = SRCHFS_START;
10119 state->ss_union_layer++; // search next layer down
10120 fserror = EAGAIN;
10121 }
10122
10123 saveandexit:
10124
10125 vnode_put(vp);
10126
10127 /* Now copy out the stuff that needs copying out. That means the number of matches and the
10128 * search state. Everything else was already put into the return buffer by the VNOP call. */
10129
10130 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
10131 goto freeandexit;
10132 }
10133
10134 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
10135 goto freeandexit;
10136 }
10137
10138 error = fserror;
10139
10140 freeandexit:
10141
10142 FREE(searchparams1, M_TEMP);
10143
10144 return error;
10145 } /* end of searchfs system call */
10146
10147 #else /* CONFIG_SEARCHFS */
10148
10149 int
10150 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
10151 {
10152 return ENOTSUP;
10153 }
10154
10155 #endif /* CONFIG_SEARCHFS */
10156
10157
10158 #if CONFIG_DATALESS_FILES
10159
10160 /*
10161 * === Namespace Resolver Up-call Mechanism ===
10162 *
10163 * When I/O is performed to a dataless file or directory (read, write,
10164 * lookup-in, etc.), the file system performs an upcall to the namespace
10165 * resolver (filecoordinationd) to materialize the object.
10166 *
10167 * We need multiple up-calls to be in flight at once, and we need these
10168 * up-calls to be interruptible, thus the following implementation:
10169 *
10170 * => The nspace_resolver_request represents the in-kernel request state.
10171 * It contains a request ID, storage space for the errno code returned
10172 * by filecoordinationd, and flags.
10173 *
10174 * => The request ID is simply a global monotonically incrementing 32-bit
10175 * number. Outstanding requests are stored in a hash table, and the
10176 * hash function is extremely simple.
10177 *
10178 * => When an upcall is to be made to filecoordinationd, a request structure
10179 * is allocated on the stack (it is small, and needs to live only during
10180 * the duration of the call to resolve_nspace_item_ext()). It is
10181 * initialized and inserted into the table. Some backpressure from
10182 * filecoordinationd is applied by limiting the number of entries that
10183 * can be inserted into the table (and thus limiting the number of
10184 * outstanding requests issued to filecoordinationd); waiting for an
10185 * available slot is interruptible.
10186 *
10187 * => Once the request has been inserted into the table, the up-call is made
10188 * to filecoordinationd via a MiG-generated stub. The up-call returns
10189 * immediately and filecoordinationd processes the request asynchronously.
10190 *
10191 * => The caller now waits for the request to complete. This is achieved by
10192 * sleeping on the address of the request structure and waiting for
10193 * filecoordinationd to mark the request structure as complete. This
10194 * is an interruptible sleep call; if interrupted, the request structure
10195 * is removed from the table and EINTR is returned to the caller. If
10196 * this occurs, an advisory up-call is made to filecoordinationd with
10197 * the request ID to indicate that the request can be aborted or
10198 * de-prioritized at the discretion of filecoordinationd.
10199 *
10200 * => When filecoordinationd has completed the request, it signals completion
10201 * by writing to the vfs.nspace.complete sysctl node. Only a process
10202 * decorated as a namespace resolver can write to this sysctl node. The
10203 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10204 * The request ID is looked up in the table, and if the request is found,
10205 * the error code is stored in the request structure and a wakeup()
10206 * issued on the address of the request structure. If the request is not
10207 * found, we simply drop the completion notification, assuming that the
10208 * caller was interrupted.
10209 *
10210 * => When the waiting thread wakes up, it extracts the error code from the
10211 * request structure, removes the request from the table, and returns the
10212 * error code to the calling function. Fini!
10213 */
10214
10215 struct nspace_resolver_request {
10216 LIST_ENTRY(nspace_resolver_request) r_hashlink;
10217 uint32_t r_req_id;
10218 int r_resolver_error;
10219 int r_flags;
10220 };
10221
10222 #define RRF_COMPLETE 0x0001
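/* Set in r_flags by nspace_resolver_req_mark_complete() once filecoordinationd reports a result. */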
10223
10224 static uint32_t
10225 next_nspace_req_id(void)
10226 {
10227 static uint32_t next_req_id;
10228
10229 return OSAddAtomic(1, &next_req_id);
10230 }
10231
10232 #define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
10233 #define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */
10234
10235 static LIST_HEAD(nspace_resolver_requesthead,
10236 nspace_resolver_request) * nspace_resolver_request_hashtbl;
10237 static u_long nspace_resolver_request_hashmask;
10238 static u_int nspace_resolver_request_count;
10239 static bool nspace_resolver_request_wait_slot;
10240 static lck_grp_t *nspace_resolver_request_lck_grp;
10241 static lck_mtx_t nspace_resolver_request_hash_mutex;
10242
10243 #define NSPACE_REQ_LOCK() \
10244 lck_mtx_lock(&nspace_resolver_request_hash_mutex)
10245 #define NSPACE_REQ_UNLOCK() \
10246 lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
10247
10248 #define NSPACE_RESOLVER_HASH(req_id) \
10249 (&nspace_resolver_request_hashtbl[(req_id) & \
10250 nspace_resolver_request_hashmask])
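/* As noted above, the hash is trivial: the low bits of the request ID select the bucket. */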
10251
10252 static struct nspace_resolver_request *
10253 nspace_resolver_req_lookup(uint32_t req_id)
10254 {
10255 struct nspace_resolver_requesthead *bucket;
10256 struct nspace_resolver_request *req;
10257
10258 bucket = NSPACE_RESOLVER_HASH(req_id);
10259 LIST_FOREACH(req, bucket, r_hashlink) {
10260 if (req->r_req_id == req_id) {
10261 return req;
10262 }
10263 }
10264
10265 return NULL;
10266 }
10267
10268 static int
10269 nspace_resolver_req_add(struct nspace_resolver_request *req)
10270 {
10271 struct nspace_resolver_requesthead *bucket;
10272 int error;
10273
10274 while (nspace_resolver_request_count >=
10275 NSPACE_RESOLVER_MAX_OUTSTANDING) {
10276 nspace_resolver_request_wait_slot = true;
10277 error = msleep(&nspace_resolver_request_count,
10278 &nspace_resolver_request_hash_mutex,
10279 PVFS | PCATCH, "nspacerq", NULL);
10280 if (error) {
10281 return error;
10282 }
10283 }
10284
10285 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10286 #if DIAGNOSTIC
10287 assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
10288 #endif /* DIAGNOSTIC */
10289 LIST_INSERT_HEAD(bucket, req, r_hashlink);
10290 nspace_resolver_request_count++;
10291
10292 return 0;
10293 }
10294
10295 static void
10296 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10297 {
10298 struct nspace_resolver_requesthead *bucket;
10299
10300 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10301 #if DIAGNOSTIC
10302 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10303 #endif /* DIAGNOSTIC */
10304 LIST_REMOVE(req, r_hashlink);
10305 nspace_resolver_request_count--;
10306
10307 if (nspace_resolver_request_wait_slot) {
10308 nspace_resolver_request_wait_slot = false;
10309 wakeup(&nspace_resolver_request_count);
10310 }
10311 }
10312
10313 static void
10314 nspace_resolver_req_cancel(uint32_t req_id)
10315 {
10316 kern_return_t kr;
10317 mach_port_t mp;
10318
10319 // Failures here aren't fatal -- the cancellation message
10320 // sent to the resolver is merely advisory.
10321
10322 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10323 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10324 return;
10325 }
10326
10327 kr = send_nspace_resolve_cancel(mp, req_id);
10328 if (kr != KERN_SUCCESS) {
10329 os_log_error(OS_LOG_DEFAULT,
10330 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10331 }
10332
10333 ipc_port_release_send(mp);
10334 }
10335
10336 static int
10337 nspace_resolver_req_wait(struct nspace_resolver_request *req)
10338 {
10339 bool send_cancel_message = false;
10340 int error;
10341
10342 NSPACE_REQ_LOCK();
10343
10344 while ((req->r_flags & RRF_COMPLETE) == 0) {
10345 error = msleep(req, &nspace_resolver_request_hash_mutex,
10346 PVFS | PCATCH, "nspace", NULL);
10347 if (error && error != ERESTART) {
10348 req->r_resolver_error = (error == EINTR) ? EINTR :
10349 ETIMEDOUT;
10350 send_cancel_message = true;
10351 break;
10352 }
10353 }
10354
10355 nspace_resolver_req_remove(req);
10356
10357 NSPACE_REQ_UNLOCK();
10358
10359 if (send_cancel_message) {
10360 nspace_resolver_req_cancel(req->r_req_id);
10361 }
10362
10363 return req->r_resolver_error;
10364 }
10365
10366 static void
10367 nspace_resolver_req_mark_complete(
10368 struct nspace_resolver_request *req,
10369 int resolver_error)
10370 {
10371 req->r_resolver_error = resolver_error;
10372 req->r_flags |= RRF_COMPLETE;
10373 wakeup(req);
10374 }
10375
10376 static void
10377 nspace_resolver_req_completed(uint32_t req_id, int resolver_error)
10378 {
10379 struct nspace_resolver_request *req;
10380
10381 NSPACE_REQ_LOCK();
10382
10383 // If we don't find the request corresponding to our req_id,
10384 // just drop the completion signal on the floor; it's likely
10385 // that the requester was interrupted by a signal.
10386
10387 req = nspace_resolver_req_lookup(req_id);
10388 if (req) {
10389 nspace_resolver_req_mark_complete(req, resolver_error);
10390 }
10391
10392 NSPACE_REQ_UNLOCK();
10393 }
10394
10395 static struct proc *nspace_resolver_proc;
10396
10397 static int
10398 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10399 {
10400 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10401 p == nspace_resolver_proc) ? 1 : 0;
10402 return 0;
10403 }
10404
10405 static int
10406 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
10407 {
10408 vfs_context_t ctx = vfs_context_current();
10409 int error = 0;
10410
10411 //
10412 // The system filecoordinationd runs as uid == 0. This also
10413 // has the nice side-effect of filtering out filecoordinationd
10414 // running in the simulator.
10415 //
10416 if (!vfs_context_issuser(ctx)) {
10417 return EPERM;
10418 }
10419
10420 error = priv_check_cred(vfs_context_ucred(ctx),
10421 PRIV_VFS_DATALESS_RESOLVER, 0);
10422 if (error) {
10423 return error;
10424 }
10425
10426 if (is_resolver) {
10427 NSPACE_REQ_LOCK();
10428
10429 if (nspace_resolver_proc == NULL) {
10430 proc_lock(p);
10431 p->p_lflag |= P_LNSPACE_RESOLVER;
10432 proc_unlock(p);
10433 nspace_resolver_proc = p;
10434 } else {
10435 error = EBUSY;
10436 }
10437
10438 NSPACE_REQ_UNLOCK();
10439 } else {
10440 // This is basically just like the exit case.
10441 // nspace_resolver_exited() will verify that the
10442 // process is the resolver, and will clear the
10443 // global.
10444 nspace_resolver_exited(p);
10445 }
10446
10447 return error;
10448 }
10449
10450 static int
10451 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10452 {
10453 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10454 (p->p_vfs_iopolicy &
10455 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10456 *is_prevented = 1;
10457 } else {
10458 *is_prevented = 0;
10459 }
10460 return 0;
10461 }
10462
10463 static int
10464 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10465 {
10466 if (p->p_lflag & P_LNSPACE_RESOLVER) {
10467 return is_prevented ? 0 : EBUSY;
10468 }
10469
10470 if (is_prevented) {
10471 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10472 } else {
10473 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10474 }
10475 return 0;
10476 }
10477
10478 static int
10479 nspace_materialization_get_thread_state(int *is_prevented)
10480 {
10481 uthread_t ut = get_bsdthread_info(current_thread());
10482
10483 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10484 return 0;
10485 }
10486
10487 static int
10488 nspace_materialization_set_thread_state(int is_prevented)
10489 {
10490 uthread_t ut = get_bsdthread_info(current_thread());
10491
10492 if (is_prevented) {
10493 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10494 } else {
10495 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10496 }
10497 return 0;
10498 }
10499
10500 static int
10501 nspace_materialization_is_prevented(void)
10502 {
10503 proc_t p = current_proc();
10504 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
10505 vfs_context_t ctx = vfs_context_current();
10506
10507 /*
10508 * Kernel context ==> return EDEADLK, as we would with any random
10509 * process decorated as no-materialize.
10510 */
10511 if (ctx == vfs_context_kernel()) {
10512 return EDEADLK;
10513 }
10514
10515 /*
10516 * If the process has the dataless-manipulation entitlement,
10517 * materialization is prevented, and depending on the kind
10518 * of file system operation, things get to proceed as if the
10519 * object is not dataless.
10520 */
10521 if (vfs_context_is_dataless_manipulator(ctx)) {
10522 return EJUSTRETURN;
10523 }
10524
10525 /*
10526 * Per-thread decorations override any process-wide decorations.
10527 * (Foundation uses this, and this overrides even the dataless-
10528 * manipulation entitlement so as to make API contracts consistent.)
10529 */
10530 if (ut != NULL) {
10531 if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
10532 return EDEADLK;
10533 }
10534 if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
10535 return 0;
10536 }
10537 }
10538
10539 /*
10540 * If the process's iopolicy specifies that dataless files
10541 * can be materialized, then we let it go ahead.
10542 */
10543 if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
10544 return 0;
10545 }
10546
10547 /*
10548 * The default behavior is to not materialize dataless files;
10549 * return to the caller that deadlock was detected.
10550 */
10551 return EDEADLK;
10552 }
10553
10554 /* the vfs.nspace branch */
10555 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10556
10557 static int
10558 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10559 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10560 {
10561 struct proc *p = req->p;
10562 int new_value, old_value, changed = 0;
10563 int error;
10564
10565 error = nspace_resolver_get_proc_state(p, &old_value);
10566 if (error) {
10567 return error;
10568 }
10569
10570 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10571 &changed);
10572 if (error == 0 && changed) {
10573 error = nspace_resolver_set_proc_state(p, new_value);
10574 }
10575 return error;
10576 }
10577
10578 /* decorate this process as the dataless file resolver */
10579 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10580 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10581 0, 0, sysctl_nspace_resolver, "I", "");
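/*
 * Illustrative sketch (not part of this file): the privileged resolver
 * daemon decorates itself by writing 1 to the node above (and can clear
 * the decoration again by writing 0).  Assumes sysctlbyname(3); error
 * handling is elided.
 *
 *   #include <sys/sysctl.h>
 *
 *   static int
 *   become_dataless_resolver(int enable)
 *   {
 *       return sysctlbyname("vfs.nspace.resolver",
 *           NULL, NULL, &enable, sizeof(enable));
 *   }
 */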
10582
10583 static int
10584 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10585 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10586 {
10587 struct proc *p = req->p;
10588 int new_value, old_value, changed = 0;
10589 int error;
10590
10591 error = nspace_materialization_get_proc_state(p, &old_value);
10592 if (error) {
10593 return error;
10594 }
10595
10596 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10597 &changed);
10598 if (error == 0 && changed) {
10599 error = nspace_materialization_set_proc_state(p, new_value);
10600 }
10601 return error;
10602 }
10603
10604 /* decorate this process as not wanting to materialize dataless files */
10605 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10606 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10607 0, 0, sysctl_nspace_prevent_materialization, "I", "");
10608
10609 static int
10610 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10611 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10612 {
10613 int new_value, old_value, changed = 0;
10614 int error;
10615
10616 error = nspace_materialization_get_thread_state(&old_value);
10617 if (error) {
10618 return error;
10619 }
10620
10621 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10622 &changed);
10623 if (error == 0 && changed) {
10624 error = nspace_materialization_set_thread_state(new_value);
10625 }
10626 return error;
10627 }
10628
10629 /* decorate this thread as not wanting to materialize dataless files */
10630 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10631 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10632 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
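/*
 * Illustrative sketch (not part of this file): a userspace process could
 * toggle the process-wide policy above by writing an int to
 * vfs.nspace.prevent_materialization (the thread-scoped node works the
 * same way).  Assumes sysctlbyname(3); error handling is elided.
 *
 *   #include <sys/sysctl.h>
 *
 *   static int
 *   set_prevent_materialization(int prevent)   // 1 = don't fault in dataless files
 *   {
 *       return sysctlbyname("vfs.nspace.prevent_materialization",
 *           NULL, NULL, &prevent, sizeof(prevent));
 *   }
 */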
10633
10634 static int
10635 sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
10636 __unused int arg2, struct sysctl_req *req)
10637 {
10638 struct proc *p = req->p;
10639 uint32_t req_status[2] = { 0, 0 };
10640 int error, is_resolver, changed = 0;
10641
10642 error = nspace_resolver_get_proc_state(p, &is_resolver);
10643 if (error) {
10644 return error;
10645 }
10646
10647 if (!is_resolver) {
10648 return EPERM;
10649 }
10650
10651 error = sysctl_io_opaque(req, req_status, sizeof(req_status),
10652 &changed);
10653 if (error) {
10654 return error;
10655 }
10656
10657 /*
10658 * req_status[0] is the req_id
10659 *
10660 * req_status[1] is the errno
10661 */
10662 if (error == 0 && changed) {
10663 nspace_resolver_req_completed(req_status[0],
10664 (int)req_status[1]);
10665 }
10666 return error;
10667 }
10668
10669 /* Resolver reports completed reqs here. */
10670 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
10671 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10672 0, 0, sysctl_nspace_complete, "-", "");
10673
10674 #endif /* CONFIG_DATALESS_FILES */
10675
10676 #if CONFIG_DATALESS_FILES
10677 #define __no_dataless_unused /* nothing */
10678 #else
10679 #define __no_dataless_unused __unused
10680 #endif
10681
10682 void
10683 nspace_resolver_init(void)
10684 {
10685 #if CONFIG_DATALESS_FILES
10686 nspace_resolver_request_lck_grp =
10687 lck_grp_alloc_init("file namespace resolver", NULL);
10688
10689 lck_mtx_init(&nspace_resolver_request_hash_mutex,
10690 nspace_resolver_request_lck_grp, NULL);
10691
10692 nspace_resolver_request_hashtbl =
10693 hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
10694 M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
10695 #endif /* CONFIG_DATALESS_FILES */
10696 }
10697
10698 void
10699 nspace_resolver_exited(struct proc *p __no_dataless_unused)
10700 {
10701 #if CONFIG_DATALESS_FILES
10702 struct nspace_resolver_requesthead *bucket;
10703 struct nspace_resolver_request *req;
10704 u_long idx;
10705
10706 NSPACE_REQ_LOCK();
10707
10708 if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10709 p == nspace_resolver_proc) {
10710 for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
10711 bucket = &nspace_resolver_request_hashtbl[idx];
10712 LIST_FOREACH(req, bucket, r_hashlink) {
10713 nspace_resolver_req_mark_complete(req,
10714 ETIMEDOUT);
10715 }
10716 }
10717 nspace_resolver_proc = NULL;
10718 }
10719
10720 NSPACE_REQ_UNLOCK();
10721 #endif /* CONFIG_DATALESS_FILES */
10722 }
10723
10724 int
10725 resolve_nspace_item(struct vnode *vp, uint64_t op)
10726 {
10727 return resolve_nspace_item_ext(vp, op, NULL);
10728 }
10729
10730 #define DATALESS_RESOLVER_ENTITLEMENT \
10731 "com.apple.private.vfs.dataless-resolver"
10732 #define DATALESS_MANIPULATION_ENTITLEMENT \
10733 "com.apple.private.vfs.dataless-manipulation"
10734
10735 /*
10736 * Return TRUE if the vfs context is associated with a process entitled
10737 * for dataless manipulation.
10738 *
10739 * XXX Arguably belongs in vfs_subr.c, but is here because of the
10740 * complication around CONFIG_DATALESS_FILES.
10741 */
10742 boolean_t
10743 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
10744 {
10745 #if CONFIG_DATALESS_FILES
10746 assert(ctx->vc_thread == current_thread());
10747 task_t const task = current_task();
10748 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
10749 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
10750 #else
10751 return false;
10752 #endif /* CONFIG_DATALESS_FILES */
10753 }
10754
10755 int
10756 resolve_nspace_item_ext(
10757 struct vnode *vp __no_dataless_unused,
10758 uint64_t op __no_dataless_unused,
10759 void *arg __unused)
10760 {
10761 #if CONFIG_DATALESS_FILES
10762 int error;
10763 mach_port_t mp;
10764 char *path = NULL;
10765 int path_len;
10766 kern_return_t kr;
10767 struct nspace_resolver_request req;
10768
10769 // only allow namespace events on regular files, directories and symlinks.
10770 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
10771 return EFTYPE;
10772 }
10773
10774 //
10775 // if this is a snapshot event and the vnode is on a
10776 // disk image just pretend nothing happened since any
10777 // change to the disk image will cause the disk image
10778 // itself to get backed up and this avoids multi-way
10779 // deadlocks between the snapshot handler and the ever
10780 // popular diskimages-helper process. the variable
10781 // nspace_allow_virtual_devs allows this behavior to
10782 // be overridden (for use by the Mobile TimeMachine
10783 // testing infrastructure which uses disk images)
10784 //
10785 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
10786 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
10787 return ENOTSUP;
10788 }
10789
10790 error = nspace_materialization_is_prevented();
10791 if (error) {
10792 os_log_debug(OS_LOG_DEFAULT,
10793 "NSPACE process/thread is decorated as no-materialization");
10794 return error;
10795 }
10796
10797 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10798 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10799 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
10800 // Treat this like being unable to access the backing
10801 // store server.
10802 return ETIMEDOUT;
10803 }
10804
10805 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
10806 if (path == NULL) {
10807 error = ENOMEM;
10808 goto out_release_port;
10809 }
10810 path_len = MAXPATHLEN;
10811
10812 error = vn_getpath(vp, path, &path_len);
10813 if (error == 0) {
10814 int xxx_rdar44371223; /* XXX Mig bug */
10815 req.r_req_id = next_nspace_req_id();
10816 req.r_resolver_error = 0;
10817 req.r_flags = 0;
10818
10819 NSPACE_REQ_LOCK();
10820 error = nspace_resolver_req_add(&req);
10821 NSPACE_REQ_UNLOCK();
10822 if (error) {
10823 goto out_release_port;
10824 }
10825
10826 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
10827 kr = send_nspace_resolve_path(mp, req.r_req_id,
10828 current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
10829 path, &xxx_rdar44371223);
10830 if (kr != KERN_SUCCESS) {
10831 // Also treat this like being unable to access
10832 // the backing store server.
10833 os_log_error(OS_LOG_DEFAULT,
10834 "NSPACE resolve_path failure: %d", kr);
10835 error = ETIMEDOUT;
10836
10837 NSPACE_REQ_LOCK();
10838 nspace_resolver_req_remove(&req);
10839 NSPACE_REQ_UNLOCK();
10840 goto out_release_port;
10841 }
10842
10843 // Give back the memory we allocated earlier while
10844 // we wait; we no longer need it.
10845 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
10846 path = NULL;
10847
10848 // Request has been submitted to the resolver.
10849 // Now (interruptibly) wait for completion.
10850 // Upon return, the request will have been removed
10851 // from the lookup table.
10852 error = nspace_resolver_req_wait(&req);
10853 }
10854
10855 out_release_port:
10856 if (path != NULL) {
10857 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
10858 }
10859 ipc_port_release_send(mp);
10860
10861 return error;
10862 #else
10863 return ENOTSUP;
10864 #endif /* CONFIG_DATALESS_FILES */
10865 }
10866
10867 int
10868 nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
10869 __unused uint64_t op_type, __unused void *arg)
10870 {
10871 return 0;
10872 }
10873
10874 #if 0
10875 static int
10876 build_volfs_path(struct vnode *vp, char *path, int *len)
10877 {
10878 struct vnode_attr va;
10879 int ret;
10880
10881 VATTR_INIT(&va);
10882 VATTR_WANTED(&va, va_fsid);
10883 VATTR_WANTED(&va, va_fileid);
10884
10885 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
10886 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
10887 ret = -1;
10888 } else {
10889 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
10890 ret = 0;
10891 }
10892
10893 return ret;
10894 }
10895 #endif
10896
10897 static unsigned long
10898 fsctl_bogus_command_compat(unsigned long cmd)
10899 {
10900 switch (cmd) {
10901 case IOCBASECMD(FSIOC_SYNC_VOLUME):
10902 return FSIOC_SYNC_VOLUME;
10903 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10904 return FSIOC_ROUTEFS_SETROUTEID;
10905 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10906 return FSIOC_SET_PACKAGE_EXTS;
10907 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10908 return FSIOC_SET_FSTYPENAME_OVERRIDE;
10909 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10910 return DISK_CONDITIONER_IOC_GET;
10911 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10912 return DISK_CONDITIONER_IOC_SET;
10913 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10914 return FSIOC_FIOSEEKHOLE;
10915 case IOCBASECMD(FSIOC_FIOSEEKDATA):
10916 return FSIOC_FIOSEEKDATA;
10917 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10918 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
10919 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
10920 return SPOTLIGHT_IOC_GET_LAST_MTIME;
10921 }
10922
10923 return cmd;
10924 }
10925
10926 static int
10927 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
10928 {
10929 return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
10930 }
10931
10932 /*
10933 * Make a filesystem-specific control call:
10934 */
10935 /* ARGSUSED */
10936 static int
10937 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
10938 {
10939 int error = 0;
10940 boolean_t is64bit;
10941 u_int size;
10942 #define STK_PARAMS 128
10943 char stkbuf[STK_PARAMS] = {0};
10944 caddr_t data, memp;
10945 vnode_t vp = *arg_vp;
10946
10947 if (vp->v_type == VCHR || vp->v_type == VBLK) {
10948 return ENOTTY;
10949 }
10950
10951 cmd = fsctl_bogus_command_compat(cmd);
10952
10953 size = IOCPARM_LEN(cmd);
10954 if (size > IOCPARM_MAX) {
10955 return EINVAL;
10956 }
10957
10958 is64bit = proc_is64bit(p);
10959
10960 memp = NULL;
10961
10962 if (size > sizeof(stkbuf)) {
10963 if ((memp = (caddr_t)kalloc(size)) == 0) {
10964 return ENOMEM;
10965 }
10966 data = memp;
10967 } else {
10968 data = &stkbuf[0];
10969 }
10970
10971 if (cmd & IOC_IN) {
10972 if (size) {
10973 error = copyin(udata, data, size);
10974 if (error) {
10975 if (memp) {
10976 kfree(memp, size);
10977 }
10978 return error;
10979 }
10980 } else {
10981 if (is64bit) {
10982 *(user_addr_t *)data = udata;
10983 } else {
10984 *(uint32_t *)data = (uint32_t)udata;
10985 }
10986 }
10987 } else if ((cmd & IOC_OUT) && size) {
10988 /*
10989 * Zero the buffer so the user always
10990 * gets back something deterministic.
10991 */
10992 bzero(data, size);
10993 } else if (cmd & IOC_VOID) {
10994 if (is64bit) {
10995 *(user_addr_t *)data = udata;
10996 } else {
10997 *(uint32_t *)data = (uint32_t)udata;
10998 }
10999 }
11000
11001 /* Check to see if it's a generic command */
11002 switch (cmd) {
11003 case FSIOC_SYNC_VOLUME: {
11004 struct vfs_attr vfa;
11005 mount_t mp = vp->v_mount;
11006 unsigned arg;
11007
11008
11009 /* record vid of vp so we can drop it below. */
11010 uint32_t vvid = vp->v_id;
11011
11012 /*
11013 * Then grab mount_iterref so that we can release the vnode.
11014 * Without this, a thread may call vnode_iterate_prepare then
11015 * get into a deadlock because we've never released the root vp
11016 */
11017 error = mount_iterref(mp, 0);
11018 if (error) {
11019 break;
11020 }
11021 vnode_put(vp);
11022
11023 arg = MNT_NOWAIT;
11024 if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
11025 arg = MNT_WAIT;
11026 }
11027
11028 /*
11029 * If the filesystem supports multiple file systems in a
11030 * partition (e.g. APFS volumes in a container), it knows
11031 * that the waitfor argument to VFS_SYNC is a set of flags.
11032 */
11033 VFSATTR_INIT(&vfa);
11034 VFSATTR_WANTED(&vfa, f_capabilities);
11035 if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
11036 VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
11037 ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
11038 ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
11039 arg |= MNT_VOLUME;
11040 }
11041
11042 /* issue the sync for this volume */
11043 (void)sync_callback(mp, &arg);
11044
11045 /*
11046 * Then release the mount_iterref once we're done syncing; it's not
11047 * needed for the VNOP_IOCTL below
11048 */
11049 mount_iterdrop(mp);
11050
11051 if (arg & FSCTL_SYNC_FULLSYNC) {
11052 /* re-obtain vnode iocount on the root vp, if possible */
11053 error = vnode_getwithvid(vp, vvid);
11054 if (error == 0) {
11055 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
11056 vnode_put(vp);
11057 }
11058 }
11059 /* mark the argument VP as having been released */
11060 *arg_vp = NULL;
11061 }
11062 break;
11063
11064 case FSIOC_ROUTEFS_SETROUTEID: {
11065 #if ROUTEFS
11066 char routepath[MAXPATHLEN];
11067 size_t len = 0;
11068
11069 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11070 break;
11071 }
11072 bzero(routepath, MAXPATHLEN);
11073 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11074 if (error) {
11075 break;
11076 }
11077 error = routefs_kernel_mount(routepath);
11078 if (error) {
11079 break;
11080 }
11081 #endif
11082 }
11083 break;
11084
11085 case FSIOC_SET_PACKAGE_EXTS: {
11086 user_addr_t ext_strings;
11087 uint32_t num_entries;
11088 uint32_t max_width;
11089
11090 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
11091 break;
11092 }
11093
11094 if ((is64bit && size != sizeof(user64_package_ext_info))
11095 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
11096 // either you're 64-bit and passed a 64-bit struct or
11097 // you're 32-bit and passed a 32-bit struct. otherwise
11098 // it's not ok.
11099 error = EINVAL;
11100 break;
11101 }
11102
11103 if (is64bit) {
11104 ext_strings = ((user64_package_ext_info *)data)->strings;
11105 num_entries = ((user64_package_ext_info *)data)->num_entries;
11106 max_width = ((user64_package_ext_info *)data)->max_width;
11107 } else {
11108 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
11109 num_entries = ((user32_package_ext_info *)data)->num_entries;
11110 max_width = ((user32_package_ext_info *)data)->max_width;
11111 }
11112 error = set_package_extensions_table(ext_strings, num_entries, max_width);
11113 }
11114 break;
11115
11116 case FSIOC_SET_FSTYPENAME_OVERRIDE:
11117 {
11118 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11119 break;
11120 }
11121 if (vp->v_mount) {
11122 mount_lock(vp->v_mount);
11123 if (data[0] != 0) {
11124 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
11125 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
11126 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11127 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
11128 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
11129 }
11130 } else {
11131 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11132 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
11133 }
11134 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
11135 vp->v_mount->fstypename_override[0] = '\0';
11136 }
11137 mount_unlock(vp->v_mount);
11138 }
11139 }
11140 break;
11141
11142 case DISK_CONDITIONER_IOC_GET: {
11143 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
11144 }
11145 break;
11146
11147 case DISK_CONDITIONER_IOC_SET: {
11148 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
11149 }
11150 break;
11151
11152 case FSIOC_CAS_BSDFLAGS: {
11153 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11154 struct vnode_attr va;
11155
11156 VATTR_INIT(&va);
11157 VATTR_SET(&va, va_flags, cas->new_flags);
11158
11159 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11160 }
11161 break;
11162
11163 case FSIOC_FD_ONLY_OPEN_ONCE: {
11164 if (vnode_usecount(vp) > 1) {
11165 error = EBUSY;
11166 } else {
11167 error = 0;
11168 }
11169 }
11170 break;
11171
11172 default: {
11173 /* other, known commands shouldn't be passed down here */
11174 switch (cmd) {
11175 case F_PUNCHHOLE:
11176 case F_TRIM_ACTIVE_FILE:
11177 case F_RDADVISE:
11178 case F_TRANSCODEKEY:
11179 case F_GETPROTECTIONLEVEL:
11180 case F_GETDEFAULTPROTLEVEL:
11181 case F_MAKECOMPRESSED:
11182 case F_SET_GREEDY_MODE:
11183 case F_SETSTATICCONTENT:
11184 case F_SETIOTYPE:
11185 case F_SETBACKINGSTORE:
11186 case F_GETPATH_MTMINFO:
11187 case APFSIOC_REVERT_TO_SNAPSHOT:
11188 case FSIOC_FIOSEEKHOLE:
11189 case FSIOC_FIOSEEKDATA:
11190 case HFS_GET_BOOT_INFO:
11191 case HFS_SET_BOOT_INFO:
11192 case FIOPINSWAP:
11193 case F_CHKCLEAN:
11194 case F_FULLFSYNC:
11195 case F_BARRIERFSYNC:
11196 case F_FREEZE_FS:
11197 case F_THAW_FS:
11198 error = EINVAL;
11199 goto outdrop;
11200 }
11201 /* Invoke the filesystem-specific code */
11202 error = VNOP_IOCTL(vp, cmd, data, options, ctx);
11203 }
11204 } /* end switch stmt */
11205
11206 /*
11207 * if no errors, copy any data to user. Size was
11208 * already set and checked above.
11209 */
11210 if (error == 0 && (cmd & IOC_OUT) && size) {
11211 error = copyout(data, udata, size);
11212 }
11213
11214 outdrop:
11215 if (memp) {
11216 kfree(memp, size);
11217 }
11218
11219 return error;
11220 }
11221
11222 /* ARGSUSED */
11223 int
11224 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
11225 {
11226 int error;
11227 struct nameidata nd;
11228 u_long nameiflags;
11229 vnode_t vp = NULL;
11230 vfs_context_t ctx = vfs_context_current();
11231
11232 AUDIT_ARG(cmd, uap->cmd);
11233 AUDIT_ARG(value32, uap->options);
11234 /* Get the vnode for the file we are getting info on: */
11235 nameiflags = 0;
11236 //
11237 // if we come through fsctl() then the file is by definition not open.
11238 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
11239 // lest the caller mistakenly think the only open is their own (when in
11240 // reality it's someone else's).
11241 //
11242 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
11243 return EINVAL;
11244 }
11245 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11246 nameiflags |= FOLLOW;
11247 }
11248 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
11249 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
11250 }
11251 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
11252 UIO_USERSPACE, uap->path, ctx);
11253 if ((error = namei(&nd))) {
11254 goto done;
11255 }
11256 vp = nd.ni_vp;
11257 nameidone(&nd);
11258
11259 #if CONFIG_MACF
11260 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
11261 if (error) {
11262 goto done;
11263 }
11264 #endif
11265
11266 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11267
11268 done:
11269 if (vp) {
11270 vnode_put(vp);
11271 }
11272 return error;
11273 }
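/*
 * Illustrative sketch (not part of this file): issuing FSIOC_SYNC_VOLUME
 * from userspace.  Assumes the userspace prototype
 * int fsctl(const char *, unsigned long, void *, unsigned int) from
 * <sys/fsctl.h>; error handling is elided.
 *
 *   #include <sys/fsctl.h>
 *   #include <stdint.h>
 *
 *   static int
 *   sync_volume_blocking(const char *any_path_on_volume)
 *   {
 *       uint32_t flags = FSCTL_SYNC_WAIT;   // wait for the sync to finish
 *       return fsctl(any_path_on_volume, FSIOC_SYNC_VOLUME, &flags, 0);
 *   }
 */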
11274 /* ARGSUSED */
11275 int
11276 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
11277 {
11278 int error;
11279 vnode_t vp = NULL;
11280 vfs_context_t ctx = vfs_context_current();
11281 int fd = -1;
11282
11283 AUDIT_ARG(fd, uap->fd);
11284 AUDIT_ARG(cmd, uap->cmd);
11285 AUDIT_ARG(value32, uap->options);
11286
11287 /* Get the vnode for the file we are getting info on: */
11288 if ((error = file_vnode(uap->fd, &vp))) {
11289 return error;
11290 }
11291 fd = uap->fd;
11292 if ((error = vnode_getwithref(vp))) {
11293 file_drop(fd);
11294 return error;
11295 }
11296
11297 #if CONFIG_MACF
11298 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
11299 file_drop(fd);
11300 vnode_put(vp);
11301 return error;
11302 }
11303 #endif
11304
11305 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11306
11307 file_drop(fd);
11308
11309 /* validate vp; fsctl_internal() can drop iocount and reset vp to NULL */
11310 if (vp) {
11311 vnode_put(vp);
11312 }
11313
11314 return error;
11315 }
11316 /* end of fsctl system call */
11317
11318 /*
11319 * Retrieve the data of an extended attribute.
11320 */
11321 int
11322 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
11323 {
11324 vnode_t vp;
11325 struct nameidata nd;
11326 char attrname[XATTR_MAXNAMELEN + 1];
11327 vfs_context_t ctx = vfs_context_current();
11328 uio_t auio = NULL;
11329 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11330 size_t attrsize = 0;
11331 size_t namelen;
11332 u_int32_t nameiflags;
11333 int error;
11334 char uio_buf[UIO_SIZEOF(1)];
11335
11336 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11337 return EINVAL;
11338 }
11339
11340 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11341 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
11342 if ((error = namei(&nd))) {
11343 return error;
11344 }
11345 vp = nd.ni_vp;
11346 nameidone(&nd);
11347
11348 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11349 if (error != 0) {
11350 goto out;
11351 }
11352 if (xattr_protected(attrname)) {
11353 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
11354 error = EPERM;
11355 goto out;
11356 }
11357 }
11358 /*
11359 * the specific check for 0xffffffff is a hack to preserve
11360 * binary compatibility in K64 with applications that discovered
11361 * that passing in a buf pointer and a size of -1 resulted in
11362 * just the size of the indicated extended attribute being returned.
11363 * this isn't part of the documented behavior, but because of the
11364 * original implementation's check for "uap->size > 0", this behavior
11365 * was allowed. In K32 that check turned into a signed comparison
11366 * even though uap->size is unsigned... in K64, we blow by that
11367 * check because uap->size is unsigned and doesn't get sign smeared
11368 * in the munger for a 32 bit user app. we also need to add a
11369 * check to limit the maximum size of the buffer being passed in...
11370 * unfortunately, the underlying filesystems seem to just malloc
11371 * the requested size even if the actual extended attribute is tiny.
11372 * because that malloc is for kernel wired memory, we have to put a
11373 * sane limit on it.
11374 *
11375 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
11376 * U64 running on K64 will yield -1 (64 bits wide)
11377 * U32/U64 running on K32 will yield -1 (32 bits wide)
11378 */
11379 if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
11380 goto no_uio;
11381 }
11382
11383 if (uap->value) {
11384 if (uap->size > (size_t)XATTR_MAXSIZE) {
11385 uap->size = XATTR_MAXSIZE;
11386 }
11387
11388 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11389 &uio_buf[0], sizeof(uio_buf));
11390 uio_addiov(auio, uap->value, uap->size);
11391 }
11392 no_uio:
11393 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
11394 out:
11395 vnode_put(vp);
11396
11397 if (auio) {
11398 *retval = uap->size - uio_resid(auio);
11399 } else {
11400 *retval = (user_ssize_t)attrsize;
11401 }
11402
11403 return error;
11404 }
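/*
 * Illustrative sketch (not part of this file): the usual two-call pattern
 * from userspace that the size-probing behavior above supports -- ask for
 * the attribute size first, then fetch the value.  Assumes the public
 * getxattr(2) interface; error handling is abbreviated.
 *
 *   #include <sys/xattr.h>
 *   #include <stdlib.h>
 *
 *   static void *
 *   copy_xattr(const char *path, const char *name, ssize_t *lenp)
 *   {
 *       ssize_t len = getxattr(path, name, NULL, 0, 0, 0);
 *       if (len < 0) {
 *           return NULL;
 *       }
 *       void *buf = malloc(len);
 *       if (buf == NULL) {
 *           return NULL;
 *       }
 *       *lenp = getxattr(path, name, buf, len, 0, 0);
 *       return buf;
 *   }
 */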
11405
11406 /*
11407 * Retrieve the data of an extended attribute.
11408 */
11409 int
11410 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
11411 {
11412 vnode_t vp;
11413 char attrname[XATTR_MAXNAMELEN + 1];
11414 uio_t auio = NULL;
11415 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11416 size_t attrsize = 0;
11417 size_t namelen;
11418 int error;
11419 char uio_buf[UIO_SIZEOF(1)];
11420
11421 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11422 return EINVAL;
11423 }
11424
11425 if ((error = file_vnode(uap->fd, &vp))) {
11426 return error;
11427 }
11428 if ((error = vnode_getwithref(vp))) {
11429 file_drop(uap->fd);
11430 return error;
11431 }
11432 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11433 if (error != 0) {
11434 goto out;
11435 }
11436 if (xattr_protected(attrname)) {
11437 error = EPERM;
11438 goto out;
11439 }
11440 if (uap->value && uap->size > 0) {
11441 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11442 &uio_buf[0], sizeof(uio_buf));
11443 uio_addiov(auio, uap->value, uap->size);
11444 }
11445
11446 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
11447 out:
11448 (void)vnode_put(vp);
11449 file_drop(uap->fd);
11450
11451 if (auio) {
11452 *retval = uap->size - uio_resid(auio);
11453 } else {
11454 *retval = (user_ssize_t)attrsize;
11455 }
11456 return error;
11457 }
11458
11459 /*
11460 * Set the data of an extended attribute.
11461 */
11462 int
11463 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
11464 {
11465 vnode_t vp;
11466 struct nameidata nd;
11467 char attrname[XATTR_MAXNAMELEN + 1];
11468 vfs_context_t ctx = vfs_context_current();
11469 uio_t auio = NULL;
11470 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11471 size_t namelen;
11472 u_int32_t nameiflags;
11473 int error;
11474 char uio_buf[UIO_SIZEOF(1)];
11475
11476 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11477 return EINVAL;
11478 }
11479
11480 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11481 if (error != 0) {
11482 if (error == EPERM) {
11483 /* if the string won't fit in attrname, copyinstr emits EPERM */
11484 return ENAMETOOLONG;
11485 }
11486 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11487 return error;
11488 }
11489 if (xattr_protected(attrname)) {
11490 return EPERM;
11491 }
11492 if (uap->size != 0 && uap->value == 0) {
11493 return EINVAL;
11494 }
11495
11496 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11497 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
11498 if ((error = namei(&nd))) {
11499 return error;
11500 }
11501 vp = nd.ni_vp;
11502 nameidone(&nd);
11503
11504 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11505 &uio_buf[0], sizeof(uio_buf));
11506 uio_addiov(auio, uap->value, uap->size);
11507
11508 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
11509 #if CONFIG_FSE
11510 if (error == 0) {
11511 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11512 FSE_ARG_VNODE, vp,
11513 FSE_ARG_DONE);
11514 }
11515 #endif
11516 vnode_put(vp);
11517 *retval = 0;
11518 return error;
11519 }
11520
11521 /*
11522 * Set the data of an extended attribute.
11523 */
11524 int
11525 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
11526 {
11527 vnode_t vp;
11528 char attrname[XATTR_MAXNAMELEN + 1];
11529 uio_t auio = NULL;
11530 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11531 size_t namelen;
11532 int error;
11533 char uio_buf[UIO_SIZEOF(1)];
11534 #if CONFIG_FSE
11535 vfs_context_t ctx = vfs_context_current();
11536 #endif
11537
11538 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11539 return EINVAL;
11540 }
11541
11542 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11543 if (error != 0) {
11544 if (error == EPERM) {
11545 /* if the string won't fit in attrname, copyinstr emits EPERM */
11546 return ENAMETOOLONG;
11547 }
11548 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11549 return error;
11550 }
11551 if (xattr_protected(attrname)) {
11552 return EPERM;
11553 }
11554 if (uap->size != 0 && uap->value == 0) {
11555 return EINVAL;
11556 }
11557 if ((error = file_vnode(uap->fd, &vp))) {
11558 return error;
11559 }
11560 if ((error = vnode_getwithref(vp))) {
11561 file_drop(uap->fd);
11562 return error;
11563 }
11564 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11565 &uio_buf[0], sizeof(uio_buf));
11566 uio_addiov(auio, uap->value, uap->size);
11567
11568 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
11569 #if CONFIG_FSE
11570 if (error == 0) {
11571 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11572 FSE_ARG_VNODE, vp,
11573 FSE_ARG_DONE);
11574 }
11575 #endif
11576 vnode_put(vp);
11577 file_drop(uap->fd);
11578 *retval = 0;
11579 return error;
11580 }
11581
11582 /*
11583 * Remove an extended attribute.
11584 * XXX Code duplication here.
11585 */
11586 int
11587 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
11588 {
11589 vnode_t vp;
11590 struct nameidata nd;
11591 char attrname[XATTR_MAXNAMELEN + 1];
11592 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11593 vfs_context_t ctx = vfs_context_current();
11594 size_t namelen;
11595 u_int32_t nameiflags;
11596 int error;
11597
11598 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11599 return EINVAL;
11600 }
11601
11602 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11603 if (error != 0) {
11604 return error;
11605 }
11606 if (xattr_protected(attrname)) {
11607 return EPERM;
11608 }
11609 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11610 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
11611 if ((error = namei(&nd))) {
11612 return error;
11613 }
11614 vp = nd.ni_vp;
11615 nameidone(&nd);
11616
11617 error = vn_removexattr(vp, attrname, uap->options, ctx);
11618 #if CONFIG_FSE
11619 if (error == 0) {
11620 add_fsevent(FSE_XATTR_REMOVED, ctx,
11621 FSE_ARG_VNODE, vp,
11622 FSE_ARG_DONE);
11623 }
11624 #endif
11625 vnode_put(vp);
11626 *retval = 0;
11627 return error;
11628 }
11629
11630 /*
11631 * Remove an extended attribute.
11632 * XXX Code duplication here.
11633 */
11634 int
11635 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
11636 {
11637 vnode_t vp;
11638 char attrname[XATTR_MAXNAMELEN + 1];
11639 size_t namelen;
11640 int error;
11641 #if CONFIG_FSE
11642 vfs_context_t ctx = vfs_context_current();
11643 #endif
11644
11645 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11646 return EINVAL;
11647 }
11648
11649 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11650 if (error != 0) {
11651 return error;
11652 }
11653 if (xattr_protected(attrname)) {
11654 return EPERM;
11655 }
11656 if ((error = file_vnode(uap->fd, &vp))) {
11657 return error;
11658 }
11659 if ((error = vnode_getwithref(vp))) {
11660 file_drop(uap->fd);
11661 return error;
11662 }
11663
11664 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
11665 #if CONFIG_FSE
11666 if (error == 0) {
11667 add_fsevent(FSE_XATTR_REMOVED, ctx,
11668 FSE_ARG_VNODE, vp,
11669 FSE_ARG_DONE);
11670 }
11671 #endif
11672 vnode_put(vp);
11673 file_drop(uap->fd);
11674 *retval = 0;
11675 return error;
11676 }
11677
11678 /*
11679 * Retrieve the list of extended attribute names.
11680 * XXX Code duplication here.
11681 */
11682 int
11683 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
11684 {
11685 vnode_t vp;
11686 struct nameidata nd;
11687 vfs_context_t ctx = vfs_context_current();
11688 uio_t auio = NULL;
11689 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11690 size_t attrsize = 0;
11691 u_int32_t nameiflags;
11692 int error;
11693 char uio_buf[UIO_SIZEOF(1)];
11694
11695 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11696 return EINVAL;
11697 }
11698
11699 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11700 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
11701 if ((error = namei(&nd))) {
11702 return error;
11703 }
11704 vp = nd.ni_vp;
11705 nameidone(&nd);
11706 if (uap->namebuf != 0 && uap->bufsize > 0) {
11707 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
11708 &uio_buf[0], sizeof(uio_buf));
11709 uio_addiov(auio, uap->namebuf, uap->bufsize);
11710 }
11711
11712 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
11713
11714 vnode_put(vp);
11715 if (auio) {
11716 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11717 } else {
11718 *retval = (user_ssize_t)attrsize;
11719 }
11720 return error;
11721 }
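/*
 * Illustrative sketch (not part of this file): from userspace the name
 * list comes back as a sequence of NUL-terminated strings; a caller
 * typically sizes the buffer with a first call and then walks it.
 * Assumes the public listxattr(2) interface; error handling is elided.
 *
 *   #include <sys/xattr.h>
 *   #include <stdio.h>
 *   #include <stdlib.h>
 *   #include <string.h>
 *
 *   static void
 *   print_xattr_names(const char *path)
 *   {
 *       ssize_t len = listxattr(path, NULL, 0, 0);
 *       if (len <= 0) {
 *           return;
 *       }
 *       char *buf = malloc(len);
 *       if (buf == NULL || (len = listxattr(path, buf, len, 0)) <= 0) {
 *           free(buf);
 *           return;
 *       }
 *       for (char *name = buf; name < buf + len; name += strlen(name) + 1) {
 *           printf("%s\n", name);
 *       }
 *       free(buf);
 *   }
 */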
11722
11723 /*
11724 * Retrieve the list of extended attribute names.
11725 * XXX Code duplication here.
11726 */
11727 int
11728 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11729 {
11730 vnode_t vp;
11731 uio_t auio = NULL;
11732 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11733 size_t attrsize = 0;
11734 int error;
11735 char uio_buf[UIO_SIZEOF(1)];
11736
11737 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11738 return EINVAL;
11739 }
11740
11741 if ((error = file_vnode(uap->fd, &vp))) {
11742 return error;
11743 }
11744 if ((error = vnode_getwithref(vp))) {
11745 file_drop(uap->fd);
11746 return error;
11747 }
11748 if (uap->namebuf != 0 && uap->bufsize > 0) {
11749 auio = uio_createwithbuffer(1, 0, spacetype,
11750 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11751 uio_addiov(auio, uap->namebuf, uap->bufsize);
11752 }
11753
11754 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11755
11756 vnode_put(vp);
11757 file_drop(uap->fd);
11758 if (auio) {
11759 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11760 } else {
11761 *retval = (user_ssize_t)attrsize;
11762 }
11763 return error;
11764 }
11765
11766 static int
11767 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
11768 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
11769 {
11770 int error;
11771 struct mount *mp = NULL;
11772 vnode_t vp;
11773 int length;
11774 int bpflags;
11775 /* maximum number of times to retry build_path */
11776 unsigned int retries = 0x10;
11777
11778 if (bufsize > PAGE_SIZE) {
11779 return EINVAL;
11780 }
11781
11782 if (buf == NULL) {
11783 return ENOMEM;
11784 }
11785
11786 retry:
11787 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11788 error = ENOTSUP; /* unexpected failure */
11789 return ENOTSUP;
11790 }
11791
11792 unionget:
11793 if (objid == 2) {
11794 struct vfs_attr vfsattr;
11795 int use_vfs_root = TRUE;
11796
11797 VFSATTR_INIT(&vfsattr);
11798 VFSATTR_WANTED(&vfsattr, f_capabilities);
11799 if (!(options & FSOPT_ISREALFSID) &&
11800 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
11801 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
11802 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
11803 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
11804 use_vfs_root = FALSE;
11805 }
11806 }
11807
11808 if (use_vfs_root) {
11809 error = VFS_ROOT(mp, &vp, ctx);
11810 } else {
11811 error = VFS_VGET(mp, objid, &vp, ctx);
11812 }
11813 } else {
11814 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11815 }
11816
11817 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11818 /*
11819 * If the fileid isn't found and we're in a union
11820 * mount volume, then see if the fileid is in the
11821 * mounted-on volume.
11822 */
11823 struct mount *tmp = mp;
11824 mp = vnode_mount(tmp->mnt_vnodecovered);
11825 vfs_unbusy(tmp);
11826 if (vfs_busy(mp, LK_NOWAIT) == 0) {
11827 goto unionget;
11828 }
11829 } else {
11830 vfs_unbusy(mp);
11831 }
11832
11833 if (error) {
11834 return error;
11835 }
11836
11837 #if CONFIG_MACF
11838 error = mac_vnode_check_fsgetpath(ctx, vp);
11839 if (error) {
11840 vnode_put(vp);
11841 return error;
11842 }
11843 #endif
11844
11845 /* Obtain the absolute path to this vnode. */
11846 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11847 if (options & FSOPT_NOFIRMLINKPATH) {
11848 bpflags |= BUILDPATH_NO_FIRMLINK;
11849 }
11850 bpflags |= BUILDPATH_CHECK_MOVED;
11851 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11852 vnode_put(vp);
11853
11854 if (error) {
11855 /* there was a race building the path, try a few more times */
11856 if (error == EAGAIN) {
11857 --retries;
11858 if (retries > 0) {
11859 goto retry;
11860 }
11861
11862 error = ENOENT;
11863 }
11864 goto out;
11865 }
11866
11867 AUDIT_ARG(text, buf);
11868
11869 if (kdebug_enable) {
11870 long dbg_parms[NUMPARMS];
11871 int dbg_namelen;
11872
11873 dbg_namelen = (int)sizeof(dbg_parms);
11874
11875 if (length < dbg_namelen) {
11876 memcpy((char *)dbg_parms, buf, length);
11877 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11878
11879 dbg_namelen = length;
11880 } else {
11881 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11882 }
11883
11884 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11885 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11886 }
11887
11888 *pathlen = (user_ssize_t)length; /* may be superseded by error */
11889
11890 out:
11891 return error;
11892 }
11893
11894 /*
11895 * Obtain the full pathname of a file system object by id.
11896 */
11897 static int
11898 fsgetpath_extended(user_addr_t buf, int bufsize, user_addr_t user_fsid, uint64_t objid,
11899 uint32_t options, user_ssize_t *retval)
11900 {
11901 vfs_context_t ctx = vfs_context_current();
11902 fsid_t fsid;
11903 char *realpath;
11904 int length;
11905 int error;
11906
11907 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
11908 return EINVAL;
11909 }
11910
11911 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
11912 return error;
11913 }
11914 AUDIT_ARG(value32, fsid.val[0]);
11915 AUDIT_ARG(value64, objid);
11916 /* Restrict output buffer size for now. */
11917
11918 if (bufsize > PAGE_SIZE || bufsize <= 0) {
11919 return EINVAL;
11920 }
11921 MALLOC(realpath, char *, bufsize, M_TEMP, M_WAITOK | M_ZERO);
11922 if (realpath == NULL) {
11923 return ENOMEM;
11924 }
11925
11926 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
11927 options, &length);
11928
11929 if (error) {
11930 goto out;
11931 }
11932
11933 error = copyout((caddr_t)realpath, buf, length);
11934
11935 *retval = (user_ssize_t)length; /* may be superseded by error */
11936 out:
11937 if (realpath) {
11938 FREE(realpath, M_TEMP);
11939 }
11940 return error;
11941 }
11942
11943 int
11944 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
11945 {
11946 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
11947 0, retval);
11948 }
11949
11950 int
11951 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
11952 {
11953 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
11954 uap->options, retval);
11955 }
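/*
 * Illustrative sketch (not part of this file): resolving an object back to
 * a path from userspace with the public fsgetpath(2) wrapper.  The fsid is
 * typically taken from statfs(2)'s f_fsid and the object id from a file
 * id such as stat(2)'s st_ino.  The header location is an assumption;
 * error handling is elided.
 *
 *   #include <sys/fsgetpath.h>
 *   #include <sys/mount.h>
 *
 *   static ssize_t
 *   path_for_object(fsid_t fsid, uint64_t obj_id, char *buf, size_t buflen)
 *   {
 *       // Returns the path length on success, or -1 with errno set.
 *       return fsgetpath(buf, buflen, &fsid, obj_id);
 *   }
 */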
11956
11957 /*
11958 * Common routine to handle various flavors of statfs data heading out
11959 * to user space.
11960 *
11961 * Returns: 0 Success
11962 * EFAULT
11963 */
11964 static int
11965 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
11966 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
11967 boolean_t partial_copy)
11968 {
11969 int error;
11970 int my_size, copy_size;
11971
11972 if (is_64_bit) {
11973 struct user64_statfs sfs;
11974 my_size = copy_size = sizeof(sfs);
11975 bzero(&sfs, my_size);
11976 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11977 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11978 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11979 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
11980 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
11981 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
11982 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
11983 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
11984 sfs.f_files = (user64_long_t)sfsp->f_files;
11985 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
11986 sfs.f_fsid = sfsp->f_fsid;
11987 sfs.f_owner = sfsp->f_owner;
11988 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11989 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11990 } else {
11991 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11992 }
11993 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11994 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11995
11996 if (partial_copy) {
11997 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11998 }
11999 error = copyout((caddr_t)&sfs, bufp, copy_size);
12000 } else {
12001 struct user32_statfs sfs;
12002
12003 my_size = copy_size = sizeof(sfs);
12004 bzero(&sfs, my_size);
12005
12006 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12007 sfs.f_type = mp->mnt_vtable->vfc_typenum;
12008 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12009
12010 /*
12011 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
12012 * have to fudge the numbers here in that case. We inflate the blocksize in order
12013 * to reflect the filesystem size as best we can.
12014 */
12015 if ((sfsp->f_blocks > INT_MAX)
12016 /* Hack for 4061702. I think the real fix is for Carbon to
12017 * look for some volume capability and not depend on hidden
12018 * semantics agreed between a FS and carbon.
12019 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
12020 * for Carbon to set bNoVolumeSizes volume attribute.
12021 * Without this the webdavfs files cannot be copied onto
12022 * disk as they look huge. This change should not affect
12023 * XSAN, as they should not be setting these to -1.
12024 */
12025 && (sfsp->f_blocks != 0xffffffffffffffffULL)
12026 && (sfsp->f_bfree != 0xffffffffffffffffULL)
12027 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
12028 int shift;
12029
12030 /*
12031 * Work out how far we have to shift the block count down to make it fit.
12032 * Note that it's possible to have to shift so far that the resulting
12033 * blocksize would be unreportably large. At that point, we will clip
12034 * any values that don't fit.
12035 *
12036 * For safety's sake, we also ensure that f_iosize is never reported as
12037 * being smaller than f_bsize.
12038 */
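/*
 * Worked example (illustrative): a volume with 2^35 512-byte blocks
 * (16 TiB) has f_blocks > INT_MAX.  The loop below settles on shift = 5,
 * so the 32-bit caller sees f_blocks = 2^30 and f_bsize = 512 << 5 = 16384,
 * which together still describe the same 16 TiB of space.
 */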
12039 for (shift = 0; shift < 32; shift++) {
12040 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
12041 break;
12042 }
12043 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
12044 break;
12045 }
12046 }
12047 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
12048 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
12049 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
12050 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
12051 #undef __SHIFT_OR_CLIP
12052 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
12053 sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
12054 } else {
12055 /* filesystem is small enough to be reported honestly */
12056 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
12057 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
12058 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
12059 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
12060 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
12061 }
12062 sfs.f_files = (user32_long_t)sfsp->f_files;
12063 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
12064 sfs.f_fsid = sfsp->f_fsid;
12065 sfs.f_owner = sfsp->f_owner;
12066 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12067 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12068 } else {
12069 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12070 }
12071 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12072 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12073
12074 if (partial_copy) {
12075 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12076 }
12077 error = copyout((caddr_t)&sfs, bufp, copy_size);
12078 }
12079
12080 if (sizep != NULL) {
12081 *sizep = my_size;
12082 }
12083 return error;
12084 }
12085
12086 /*
12087 * copy stat structure into user_stat structure.
12088 */
12089 void
12090 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
12091 {
12092 bzero(usbp, sizeof(*usbp));
12093
12094 usbp->st_dev = sbp->st_dev;
12095 usbp->st_ino = sbp->st_ino;
12096 usbp->st_mode = sbp->st_mode;
12097 usbp->st_nlink = sbp->st_nlink;
12098 usbp->st_uid = sbp->st_uid;
12099 usbp->st_gid = sbp->st_gid;
12100 usbp->st_rdev = sbp->st_rdev;
12101 #ifndef _POSIX_C_SOURCE
12102 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12103 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12104 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12105 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12106 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12107 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12108 #else
12109 usbp->st_atime = sbp->st_atime;
12110 usbp->st_atimensec = sbp->st_atimensec;
12111 usbp->st_mtime = sbp->st_mtime;
12112 usbp->st_mtimensec = sbp->st_mtimensec;
12113 usbp->st_ctime = sbp->st_ctime;
12114 usbp->st_ctimensec = sbp->st_ctimensec;
12115 #endif
12116 usbp->st_size = sbp->st_size;
12117 usbp->st_blocks = sbp->st_blocks;
12118 usbp->st_blksize = sbp->st_blksize;
12119 usbp->st_flags = sbp->st_flags;
12120 usbp->st_gen = sbp->st_gen;
12121 usbp->st_lspare = sbp->st_lspare;
12122 usbp->st_qspare[0] = sbp->st_qspare[0];
12123 usbp->st_qspare[1] = sbp->st_qspare[1];
12124 }
12125
12126 void
12127 munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
12128 {
12129 bzero(usbp, sizeof(*usbp));
12130
12131 usbp->st_dev = sbp->st_dev;
12132 usbp->st_ino = sbp->st_ino;
12133 usbp->st_mode = sbp->st_mode;
12134 usbp->st_nlink = sbp->st_nlink;
12135 usbp->st_uid = sbp->st_uid;
12136 usbp->st_gid = sbp->st_gid;
12137 usbp->st_rdev = sbp->st_rdev;
12138 #ifndef _POSIX_C_SOURCE
12139 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12140 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12141 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12142 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12143 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12144 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12145 #else
12146 usbp->st_atime = sbp->st_atime;
12147 usbp->st_atimensec = sbp->st_atimensec;
12148 usbp->st_mtime = sbp->st_mtime;
12149 usbp->st_mtimensec = sbp->st_mtimensec;
12150 usbp->st_ctime = sbp->st_ctime;
12151 usbp->st_ctimensec = sbp->st_ctimensec;
12152 #endif
12153 usbp->st_size = sbp->st_size;
12154 usbp->st_blocks = sbp->st_blocks;
12155 usbp->st_blksize = sbp->st_blksize;
12156 usbp->st_flags = sbp->st_flags;
12157 usbp->st_gen = sbp->st_gen;
12158 usbp->st_lspare = sbp->st_lspare;
12159 usbp->st_qspare[0] = sbp->st_qspare[0];
12160 usbp->st_qspare[1] = sbp->st_qspare[1];
12161 }
12162
12163 /*
12164 * copy stat64 structure into user_stat64 structure.
12165 */
12166 void
12167 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
12168 {
12169 bzero(usbp, sizeof(*usbp));
12170
12171 usbp->st_dev = sbp->st_dev;
12172 usbp->st_ino = sbp->st_ino;
12173 usbp->st_mode = sbp->st_mode;
12174 usbp->st_nlink = sbp->st_nlink;
12175 usbp->st_uid = sbp->st_uid;
12176 usbp->st_gid = sbp->st_gid;
12177 usbp->st_rdev = sbp->st_rdev;
12178 #ifndef _POSIX_C_SOURCE
12179 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12180 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12181 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12182 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12183 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12184 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12185 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
12186 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
12187 #else
12188 usbp->st_atime = sbp->st_atime;
12189 usbp->st_atimensec = sbp->st_atimensec;
12190 usbp->st_mtime = sbp->st_mtime;
12191 usbp->st_mtimensec = sbp->st_mtimensec;
12192 usbp->st_ctime = sbp->st_ctime;
12193 usbp->st_ctimensec = sbp->st_ctimensec;
12194 usbp->st_birthtime = sbp->st_birthtime;
12195 usbp->st_birthtimensec = sbp->st_birthtimensec;
12196 #endif
12197 usbp->st_size = sbp->st_size;
12198 usbp->st_blocks = sbp->st_blocks;
12199 usbp->st_blksize = sbp->st_blksize;
12200 usbp->st_flags = sbp->st_flags;
12201 usbp->st_gen = sbp->st_gen;
12202 usbp->st_lspare = sbp->st_lspare;
12203 usbp->st_qspare[0] = sbp->st_qspare[0];
12204 usbp->st_qspare[1] = sbp->st_qspare[1];
12205 }
12206
12207 void
12208 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
12209 {
12210 bzero(usbp, sizeof(*usbp));
12211
12212 usbp->st_dev = sbp->st_dev;
12213 usbp->st_ino = sbp->st_ino;
12214 usbp->st_mode = sbp->st_mode;
12215 usbp->st_nlink = sbp->st_nlink;
12216 usbp->st_uid = sbp->st_uid;
12217 usbp->st_gid = sbp->st_gid;
12218 usbp->st_rdev = sbp->st_rdev;
12219 #ifndef _POSIX_C_SOURCE
12220 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12221 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12222 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12223 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12224 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12225 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12226 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
12227 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
12228 #else
12229 usbp->st_atime = sbp->st_atime;
12230 usbp->st_atimensec = sbp->st_atimensec;
12231 usbp->st_mtime = sbp->st_mtime;
12232 usbp->st_mtimensec = sbp->st_mtimensec;
12233 usbp->st_ctime = sbp->st_ctime;
12234 usbp->st_ctimensec = sbp->st_ctimensec;
12235 usbp->st_birthtime = sbp->st_birthtime;
12236 usbp->st_birthtimensec = sbp->st_birthtimensec;
12237 #endif
12238 usbp->st_size = sbp->st_size;
12239 usbp->st_blocks = sbp->st_blocks;
12240 usbp->st_blksize = sbp->st_blksize;
12241 usbp->st_flags = sbp->st_flags;
12242 usbp->st_gen = sbp->st_gen;
12243 usbp->st_lspare = sbp->st_lspare;
12244 usbp->st_qspare[0] = sbp->st_qspare[0];
12245 usbp->st_qspare[1] = sbp->st_qspare[1];
12246 }
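/*
 * Illustrative sketch (not part of the original file): how the munge helpers
 * above are typically paired with copyout() in a stat-family handler.  The
 * helper name copyout_stat64_for_process() is hypothetical; the real call
 * sites appear earlier in this file.
 */
static int
copyout_stat64_for_process(struct stat64 *sbp, user_addr_t ubuf, proc_t p)
{
	if (proc_is64bit(p)) {
		struct user64_stat64 user64_sb;

		munge_user64_stat64(sbp, &user64_sb);
		return copyout(&user64_sb, ubuf, sizeof(user64_sb));
	} else {
		struct user32_stat64 user32_sb;

		munge_user32_stat64(sbp, &user32_sb);
		return copyout(&user32_sb, ubuf, sizeof(user32_sb));
	}
}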
12247
12248 /*
12249 * Purge the buffer cache to simulate cold starts
12250 */
12251 static int
12252 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
12253 {
12254 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
12255
12256 return VNODE_RETURNED;
12257 }
12258
12259 static int
12260 vfs_purge_callback(mount_t mp, __unused void * arg)
12261 {
12262 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
12263
12264 return VFS_RETURNED;
12265 }
12266
12267 int
12268 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
12269 {
12270 if (!kauth_cred_issuser(kauth_cred_get())) {
12271 return EPERM;
12272 }
12273
12274 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
12275
12276 return 0;
12277 }
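/*
 * Illustrative sketch (hypothetical, not part of the original file): the same
 * vfs_iterate()/vnode_iterate() callback pattern used by vfs_purge() above,
 * here simply counting vnodes across all mounts.
 */
static int
vnode_count_callback(__unused struct vnode *vp, void *cargs)
{
	(*(uint64_t *)cargs)++;

	return VNODE_RETURNED;
}

static int
vfs_count_callback(mount_t mp, void *arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_count_callback, arg);

	return VFS_RETURNED;
}

/* e.g.: uint64_t nvnodes = 0; vfs_iterate(0, vfs_count_callback, &nvnodes); */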
12278
12279 /*
12280 * Gets the vnode associated with the (unnamed) snapshot directory
12281 * for a filesystem. The snapshot directory vnode is returned with
12282 * an iocount on it.
12283 */
12284 int
12285 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
12286 {
12287 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
12288 }
12289
12290 /*
12291 * Get the snapshot vnode.
12292 *
12293 * If successful, the call returns with an iocount on *rvpp and *sdvpp, and
12294 * the caller needs to call nameidone() on ndp.
12295 *
12296 * If the snapshot vnode exists, it is returned in ndp->ni_vp.
12297 *
12298 * If it returns with an error, *rvpp and *sdvpp are NULL and nameidone() is
12299 * not needed.
12300 */
12301 static int
12302 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
12303 user_addr_t name, struct nameidata *ndp, int32_t op,
12304 #if !CONFIG_TRIGGERS
12305 __unused
12306 #endif
12307 enum path_operation pathop,
12308 vfs_context_t ctx)
12309 {
12310 int error, i;
12311 caddr_t name_buf;
12312 size_t name_len;
12313 struct vfs_attr vfa;
12314
12315 *sdvpp = NULLVP;
12316 *rvpp = NULLVP;
12317
12318 error = vnode_getfromfd(ctx, dirfd, rvpp);
12319 if (error) {
12320 return error;
12321 }
12322
12323 if (!vnode_isvroot(*rvpp)) {
12324 error = EINVAL;
12325 goto out;
12326 }
12327
12328 /* Make sure the filesystem supports snapshots */
12329 VFSATTR_INIT(&vfa);
12330 VFSATTR_WANTED(&vfa, f_capabilities);
12331 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
12332 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
12333 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
12334 VOL_CAP_INT_SNAPSHOT)) ||
12335 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
12336 VOL_CAP_INT_SNAPSHOT))) {
12337 error = ENOTSUP;
12338 goto out;
12339 }
12340
12341 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
12342 if (error) {
12343 goto out;
12344 }
12345
12346 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12347 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12348 if (error) {
12349 goto out1;
12350 }
12351
12352 /*
12353 * Some sanity checks: the name can't be empty, ".", "..", or contain slashes.
12354 * (the length returned by copyinstr includes the terminating NUL)
12355 */
12356 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
12357 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
12358 error = EINVAL;
12359 goto out1;
12360 }
12361 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
12362 ;
12363 }
12364 if (i < (int)name_len) {
12365 error = EINVAL;
12366 goto out1;
12367 }
12368
12369 #if CONFIG_MACF
12370 if (op == CREATE) {
12371 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
12372 name_buf);
12373 } else if (op == DELETE) {
12374 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
12375 name_buf);
12376 }
12377 if (error) {
12378 goto out1;
12379 }
12380 #endif
12381
12382 /* Check if the snapshot already exists ... */
12383 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
12384 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
12385 ndp->ni_dvp = *sdvpp;
12386
12387 error = namei(ndp);
12388 out1:
12389 FREE(name_buf, M_TEMP);
12390 out:
12391 if (error) {
12392 if (*sdvpp) {
12393 vnode_put(*sdvpp);
12394 *sdvpp = NULLVP;
12395 }
12396 if (*rvpp) {
12397 vnode_put(*rvpp);
12398 *rvpp = NULLVP;
12399 }
12400 }
12401 return error;
12402 }
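/*
 * Hypothetical helper (illustration only, not part of the original file): the
 * snapshot-name sanity check performed inline above, factored out.  name_len
 * is the length returned by copyinstr() and includes the terminating NUL.
 */
static int
snapshot_name_is_valid(const char *name_buf, size_t name_len)
{
	size_t i;

	/* reject "", "." and ".." */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		return 0;
	}
	/* reject names containing '/' */
	for (i = 0; i < name_len; i++) {
		if (name_buf[i] == '/') {
			return 0;
		}
	}
	return 1;
}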
12403
12404 /*
12405 * Create a filesystem snapshot (on filesystems that support it)
12406 *
12407 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL):
12408 * we get to the (unnamed) snapshot directory vnode and create the vnode
12409 * for the snapshot in it.
12410 *
12411 * Restrictions:
12412 *
12413 * a) The passed-in snapshot name cannot contain slashes.
12414 * b) The name can't be "." or "..".
12415 *
12416 * Since this requires superuser privileges, vnode_authorize calls are not
12417 * made.
12418 */
12419 static int
12420 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
12421 vfs_context_t ctx)
12422 {
12423 vnode_t rvp, snapdvp;
12424 int error;
12425 struct nameidata namend;
12426
12427 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
12428 OP_LINK, ctx);
12429 if (error) {
12430 return error;
12431 }
12432
12433 if (namend.ni_vp) {
12434 vnode_put(namend.ni_vp);
12435 error = EEXIST;
12436 } else {
12437 struct vnode_attr va;
12438 vnode_t vp = NULLVP;
12439
12440 VATTR_INIT(&va);
12441 VATTR_SET(&va, va_type, VREG);
12442 VATTR_SET(&va, va_mode, 0);
12443
12444 error = vn_create(snapdvp, &vp, &namend, &va,
12445 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
12446 if (!error && vp) {
12447 vnode_put(vp);
12448 }
12449 }
12450
12451 nameidone(&namend);
12452 vnode_put(snapdvp);
12453 vnode_put(rvp);
12454 return error;
12455 }
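/*
 * Userspace usage sketch (not part of this kernel file): creating a snapshot
 * of the volume containing "/", assuming the fs_snapshot_create() wrapper
 * declared in <sys/snapshot.h>.  The caller must hold the snapshot privilege
 * checked in fs_snapshot() below.
 */
#if 0	/* illustrative userspace code */
#include <fcntl.h>
#include <stdio.h>
#include <sys/snapshot.h>
#include <unistd.h>

int
main(void)
{
	int dirfd = open("/", O_RDONLY);

	if (dirfd < 0 || fs_snapshot_create(dirfd, "com.example.mysnap", 0) != 0) {
		perror("fs_snapshot_create");
		return 1;
	}
	close(dirfd);
	return 0;
}
#endif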
12456
12457 /*
12458 * Delete a filesystem snapshot
12459 *
12460 * Get the vnodes for the unnamed snapshot directory and the snapshot, then
12461 * delete the snapshot.
12462 */
12463 static int
12464 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
12465 vfs_context_t ctx)
12466 {
12467 vnode_t rvp, snapdvp;
12468 int error;
12469 struct nameidata namend;
12470
12471 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
12472 OP_UNLINK, ctx);
12473 if (error) {
12474 goto out;
12475 }
12476
12477 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
12478 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
12479
12480 vnode_put(namend.ni_vp);
12481 nameidone(&namend);
12482 vnode_put(snapdvp);
12483 vnode_put(rvp);
12484 out:
12485 return error;
12486 }
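/*
 * Userspace usage sketch (not part of this kernel file): deleting a snapshot
 * by name, assuming the fs_snapshot_delete() wrapper declared in
 * <sys/snapshot.h>.
 */
#if 0	/* illustrative userspace code */
#include <fcntl.h>
#include <sys/snapshot.h>
#include <unistd.h>

static int
delete_snapshot(const char *volume, const char *snapname)
{
	int dirfd = open(volume, O_RDONLY);
	int error;

	if (dirfd < 0) {
		return -1;
	}
	error = fs_snapshot_delete(dirfd, snapname, 0);
	close(dirfd);
	return error;
}
#endif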
12487
12488 /*
12489 * Revert a filesystem to a snapshot
12490 *
12491 * Marks the filesystem to revert to the given snapshot on next mount.
12492 */
12493 static int
12494 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
12495 vfs_context_t ctx)
12496 {
12497 int error;
12498 vnode_t rvp;
12499 mount_t mp;
12500 struct fs_snapshot_revert_args revert_data;
12501 struct componentname cnp;
12502 caddr_t name_buf;
12503 size_t name_len;
12504
12505 error = vnode_getfromfd(ctx, dirfd, &rvp);
12506 if (error) {
12507 return error;
12508 }
12509 mp = vnode_mount(rvp);
12510
12511 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12512 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12513 if (error) {
12514 FREE(name_buf, M_TEMP);
12515 vnode_put(rvp);
12516 return error;
12517 }
12518
12519 #if CONFIG_MACF
12520 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
12521 if (error) {
12522 FREE(name_buf, M_TEMP);
12523 vnode_put(rvp);
12524 return error;
12525 }
12526 #endif
12527
12528 /*
12529 * Grab mount_iterref so that we can release the vnode,
12530 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
12531 */
12532 error = mount_iterref(mp, 0);
12533 vnode_put(rvp);
12534 if (error) {
12535 FREE(name_buf, M_TEMP);
12536 return error;
12537 }
12538
12539 memset(&cnp, 0, sizeof(cnp));
12540 cnp.cn_pnbuf = (char *)name_buf;
12541 cnp.cn_nameiop = LOOKUP;
12542 cnp.cn_flags = ISLASTCN | HASBUF;
12543 cnp.cn_pnlen = MAXPATHLEN;
12544 cnp.cn_nameptr = cnp.cn_pnbuf;
12545 cnp.cn_namelen = (int)name_len;
12546 revert_data.sr_cnp = &cnp;
12547
12548 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
12549 mount_iterdrop(mp);
12550 FREE(name_buf, M_TEMP);
12551
12552 if (error) {
12553 /* If there was any error, try again using VNOP_IOCTL */
12554
12555 vnode_t snapdvp;
12556 struct nameidata namend;
12557
12558 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
12559 OP_LOOKUP, ctx);
12560 if (error) {
12561 return error;
12562 }
12563
12564
12565 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
12566 0, ctx);
12567
12568 vnode_put(namend.ni_vp);
12569 nameidone(&namend);
12570 vnode_put(snapdvp);
12571 vnode_put(rvp);
12572 }
12573
12574 return error;
12575 }
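/*
 * Filesystem-side sketch (hypothetical, not part of the original file): how a
 * filesystem's vfs_ioctl entry point might consume the fs_snapshot_revert_args
 * built above.  example_fs_vfs_ioctl() and the "record revert" step are
 * illustrative only.
 */
static int
example_fs_vfs_ioctl(mount_t mp, u_long command, caddr_t data,
    __unused int flags, __unused vfs_context_t ctx)
{
	if (command == VFSIOC_REVERT_SNAPSHOT) {
		struct fs_snapshot_revert_args *args =
		    (struct fs_snapshot_revert_args *)data;
		struct componentname *cnp = args->sr_cnp;

		/*
		 * cnp->cn_nameptr / cnp->cn_namelen identify the snapshot;
		 * a real filesystem would record it so the next mount of mp
		 * reverts to that snapshot.
		 */
		(void)mp;
		(void)cnp;
		return 0;
	}
	return ENOTSUP;
}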
12576
12577 /*
12578 * Rename a filesystem snapshot
12579 *
12580 * Get the vnodes for the unnamed snapshot directory and the snapshot, then
12581 * rename the snapshot. This is a very specialized (and simple) case of
12582 * rename(2) (which has to deal with a lot more complications). It differs
12583 * slightly from rename(2) in that EEXIST is returned if the new name exists.
12584 */
12585 static int
12586 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
12587 __unused uint32_t flags, vfs_context_t ctx)
12588 {
12589 vnode_t rvp, snapdvp;
12590 int error, i;
12591 caddr_t newname_buf;
12592 size_t name_len;
12593 vnode_t fvp;
12594 struct nameidata *fromnd, *tond;
12595 /* carving out a chunk for structs that are too big to be on the stack. */
12596 struct {
12597 struct nameidata from_node;
12598 struct nameidata to_node;
12599 } * __rename_data;
12600
12601 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
12602 fromnd = &__rename_data->from_node;
12603 tond = &__rename_data->to_node;
12604
12605 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
12606 OP_UNLINK, ctx);
12607 if (error) {
12608 goto out;
12609 }
12610 fvp = fromnd->ni_vp;
12611
12612 MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12613 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
12614 if (error) {
12615 goto out1;
12616 }
12617
12618 /*
12619 * Some sanity checks: the new name can't be empty, ".", "..", or contain
12620 * slashes.
12621 * (the length returned by copyinstr includes the terminating NUL)
12622 *
12623 * The FS rename VNOP is supposed to handle this, but we catch it
12624 * here ourselves as well.
12625 */
12626 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
12627 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
12628 error = EINVAL;
12629 goto out1;
12630 }
12631 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
12632 ;
12633 }
12634 if (i < (int)name_len) {
12635 error = EINVAL;
12636 goto out1;
12637 }
12638
12639 #if CONFIG_MACF
12640 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
12641 newname_buf);
12642 if (error) {
12643 goto out1;
12644 }
12645 #endif
12646
12647 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
12648 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
12649 tond->ni_dvp = snapdvp;
12650
12651 error = namei(tond);
12652 if (error) {
12653 goto out2;
12654 } else if (tond->ni_vp) {
12655 /*
12656 * snapshot rename behaves differently from rename(2): if the
12657 * new name exists, EEXIST is returned.
12658 */
12659 vnode_put(tond->ni_vp);
12660 error = EEXIST;
12661 goto out2;
12662 }
12663
12664 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
12665 &tond->ni_cnd, ctx);
12666
12667 out2:
12668 nameidone(tond);
12669 out1:
12670 FREE(newname_buf, M_TEMP);
12671 vnode_put(fvp);
12672 vnode_put(snapdvp);
12673 vnode_put(rvp);
12674 nameidone(fromnd);
12675 out:
12676 FREE(__rename_data, M_TEMP);
12677 return error;
12678 }
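/*
 * Userspace usage sketch (not part of this kernel file): renaming a snapshot,
 * assuming the fs_snapshot_rename() wrapper declared in <sys/snapshot.h>.  As
 * noted above, EEXIST is returned if the new name already exists.
 */
#if 0	/* illustrative userspace code */
#include <fcntl.h>
#include <sys/snapshot.h>
#include <unistd.h>

static int
rename_snapshot(const char *volume, const char *oldname, const char *newname)
{
	int dirfd = open(volume, O_RDONLY);
	int error;

	if (dirfd < 0) {
		return -1;
	}
	error = fs_snapshot_rename(dirfd, oldname, newname, 0);
	close(dirfd);
	return error;
}
#endif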
12679
12680 /*
12681 * Mount a filesystem snapshot
12682 *
12683 * Get the vnodes for the unnamed snapshot directory and the snapshot, then
12684 * mount the snapshot.
12685 */
12686 static int
12687 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
12688 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
12689 {
12690 vnode_t rvp, snapdvp, snapvp, vp, pvp;
12691 int error;
12692 struct nameidata *snapndp, *dirndp;
12693 /* carving out a chunk for structs that are too big to be on the stack. */
12694 struct {
12695 struct nameidata snapnd;
12696 struct nameidata dirnd;
12697 } * __snapshot_mount_data;
12698
12699 MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
12700 M_TEMP, M_WAITOK);
12701 snapndp = &__snapshot_mount_data->snapnd;
12702 dirndp = &__snapshot_mount_data->dirnd;
12703
12704 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
12705 OP_LOOKUP, ctx);
12706 if (error) {
12707 goto out;
12708 }
12709
12710 snapvp = snapndp->ni_vp;
12711 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
12712 error = EIO;
12713 goto out1;
12714 }
12715
12716 /* Get the vnode to be covered */
12717 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
12718 UIO_USERSPACE, directory, ctx);
12719 error = namei(dirndp);
12720 if (error) {
12721 goto out1;
12722 }
12723
12724 vp = dirndp->ni_vp;
12725 pvp = dirndp->ni_dvp;
12726
12727 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
12728 error = EINVAL;
12729 } else {
12730 mount_t mp = vnode_mount(rvp);
12731 struct fs_snapshot_mount_args smnt_data;
12732
12733 smnt_data.sm_mp = mp;
12734 smnt_data.sm_cnp = &snapndp->ni_cnd;
12735 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
12736 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
12737 KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
12738 }
12739
12740 vnode_put(vp);
12741 vnode_put(pvp);
12742 nameidone(dirndp);
12743 out1:
12744 vnode_put(snapvp);
12745 vnode_put(snapdvp);
12746 vnode_put(rvp);
12747 nameidone(snapndp);
12748 out:
12749 FREE(__snapshot_mount_data, M_TEMP);
12750 return error;
12751 }
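/*
 * Userspace usage sketch (not part of this kernel file): mounting a snapshot
 * at a directory, assuming the fs_snapshot_mount() wrapper declared in
 * <sys/snapshot.h>.  The argument order shown (mount-point directory before
 * snapshot name) is an assumption; verify it against the header before
 * relying on it.
 */
#if 0	/* illustrative userspace code */
#include <fcntl.h>
#include <sys/snapshot.h>
#include <unistd.h>

static int
mount_snapshot(const char *volume, const char *snapname, const char *mntpoint)
{
	int dirfd = open(volume, O_RDONLY);
	int error;

	if (dirfd < 0) {
		return -1;
	}
	error = fs_snapshot_mount(dirfd, mntpoint, snapname, 0);
	close(dirfd);
	return error;
}
#endif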
12752
12753 /*
12754 * Root from a snapshot of the filesystem
12755 *
12756 * Marks the filesystem to root from the given snapshot on next boot.
12757 */
12758 static int
12759 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
12760 vfs_context_t ctx)
12761 {
12762 int error;
12763 vnode_t rvp;
12764 mount_t mp;
12765 struct fs_snapshot_root_args root_data;
12766 struct componentname cnp;
12767 caddr_t name_buf;
12768 size_t name_len;
12769
12770 error = vnode_getfromfd(ctx, dirfd, &rvp);
12771 if (error) {
12772 return error;
12773 }
12774 mp = vnode_mount(rvp);
12775
12776 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12777 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12778 if (error) {
12779 FREE(name_buf, M_TEMP);
12780 vnode_put(rvp);
12781 return error;
12782 }
12783
12784 // XXX MAC checks ?
12785
12786 /*
12787 * Grab mount_iterref so that we can release the vnode,
12788 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
12789 */
12790 error = mount_iterref(mp, 0);
12791 vnode_put(rvp);
12792 if (error) {
12793 FREE(name_buf, M_TEMP);
12794 return error;
12795 }
12796
12797 memset(&cnp, 0, sizeof(cnp));
12798 cnp.cn_pnbuf = (char *)name_buf;
12799 cnp.cn_nameiop = LOOKUP;
12800 cnp.cn_flags = ISLASTCN | HASBUF;
12801 cnp.cn_pnlen = MAXPATHLEN;
12802 cnp.cn_nameptr = cnp.cn_pnbuf;
12803 cnp.cn_namelen = (int)name_len;
12804 root_data.sr_cnp = &cnp;
12805
12806 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
12807
12808 mount_iterdrop(mp);
12809 FREE(name_buf, M_TEMP);
12810
12811 return error;
12812 }
12813
12814 /*
12815 * FS snapshot operations dispatcher
12816 */
12817 int
12818 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
12819 __unused int32_t *retval)
12820 {
12821 int error;
12822 vfs_context_t ctx = vfs_context_current();
12823
12824 AUDIT_ARG(fd, uap->dirfd);
12825 AUDIT_ARG(value32, uap->op);
12826
12827 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
12828 if (error) {
12829 return error;
12830 }
12831
12832 /*
12833 * Enforce user authorization for snapshot modification operations
12834 */
12835 if ((uap->op != SNAPSHOT_OP_MOUNT) &&
12836 (uap->op != SNAPSHOT_OP_ROOT)) {
12837 vnode_t dvp = NULLVP;
12838 vnode_t devvp = NULLVP;
12839 mount_t mp;
12840
12841 error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
12842 if (error) {
12843 return error;
12844 }
12845 mp = vnode_mount(dvp);
12846 devvp = mp->mnt_devvp;
12847
12848 /* get an iocount on devvp */
12849 if (devvp == NULLVP) {
12850 error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
12851 /* for mounts which aren't backed by block devices */
12852 if (error == ENOENT) {
12853 error = ENXIO;
12854 }
12855 } else {
12856 error = vnode_getwithref(devvp);
12857 }
12858
12859 if (error) {
12860 vnode_put(dvp);
12861 return error;
12862 }
12863
12864 if ((vfs_context_issuser(ctx) == 0) &&
12865 (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
12866 error = EPERM;
12867 }
12868 vnode_put(dvp);
12869 vnode_put(devvp);
12870
12871 if (error) {
12872 return error;
12873 }
12874 }
12875
12876 switch (uap->op) {
12877 case SNAPSHOT_OP_CREATE:
12878 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
12879 break;
12880 case SNAPSHOT_OP_DELETE:
12881 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
12882 break;
12883 case SNAPSHOT_OP_RENAME:
12884 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
12885 uap->flags, ctx);
12886 break;
12887 case SNAPSHOT_OP_MOUNT:
12888 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
12889 uap->data, uap->flags, ctx);
12890 break;
12891 case SNAPSHOT_OP_REVERT:
12892 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
12893 break;
12894 #if CONFIG_MNT_ROOTSNAP
12895 case SNAPSHOT_OP_ROOT:
12896 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
12897 break;
12898 #endif /* CONFIG_MNT_ROOTSNAP */
12899 default:
12900 error = ENOSYS;
12901 }
12902
12903 return error;
12904 }