apple/xnu: bsd/vfs/vfs_syscalls.c (commit e1497887b2e011700690c12c14e2ca85fdb80001)
1 /*
2 * Copyright (c) 1995-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <pexpert/pexpert.h>
127 #include <IOKit/IOBSD.h>
128
129 // deps for MIG call
130 #include <kern/host.h>
131 #include <kern/ipc_misc.h>
132 #include <mach/host_priv.h>
133 #include <mach/vfs_nspace.h>
134 #include <os/log.h>
135
136 #if ROUTEFS
137 #include <miscfs/routefs/routefs.h>
138 #endif /* ROUTEFS */
139
140 #if CONFIG_MACF
141 #include <security/mac.h>
142 #include <security/mac_framework.h>
143 #endif
144
145 #if CONFIG_FSE
146 #define GET_PATH(x) \
147 (x) = get_pathbuff();
148 #define RELEASE_PATH(x) \
149 release_pathbuff(x);
150 #else
151 #define GET_PATH(x) \
152 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
153 #define RELEASE_PATH(x) \
154 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
155 #endif /* CONFIG_FSE */
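/*
 * GET_PATH/RELEASE_PATH are always used as a pair: both variants hand back
 * a MAXPATHLEN-sized scratch buffer (via get_pathbuff()/release_pathbuff()
 * when CONFIG_FSE is set, from the M_NAMEI zone otherwise), and the buffer
 * contents are not zero-filled.
 */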
156
157 #ifndef HFS_GET_BOOT_INFO
158 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
159 #endif
160
161 #ifndef HFS_SET_BOOT_INFO
162 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
163 #endif
164
165 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
166 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
167 #endif
168
169 extern void disk_conditioner_unmount(mount_t mp);
170
171 /* struct for checkdirs iteration */
172 struct cdirargs {
173 vnode_t olddp;
174 vnode_t newdp;
175 };
176 /* callback for checkdirs iteration */
177 static int checkdirs_callback(proc_t p, void * arg);
178
179 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
180 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
181 void enablequotas(struct mount *mp, vfs_context_t ctx);
182 static int getfsstat_callback(mount_t mp, void * arg);
183 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
184 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
185 static int sync_callback(mount_t, void *);
186 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
187 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
188 boolean_t partial_copy);
189 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
190 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
191 struct componentname *cnp, user_addr_t fsmountargs,
192 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
193 vfs_context_t ctx);
194 void vfs_notify_mount(vnode_t pdvp);
195
196 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
197
198 struct fd_vn_data * fg_vn_data_alloc(void);
199
200 /*
201 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
202 * Concurrent lookups (or lookups by ids) on hard links can cause the
203 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
204 * does) to return ENOENT as the path cannot be returned from the name cache
205  * alone. We have no option but to retry and hope to get one namei->reverse-path
206  * generation done without an intervening lookup or lookup-by-id on the hard-link
207  * item. This is only an issue for MAC hooks that cannot re-enter the filesystem,
208  * which currently are the MAC hooks for rename, unlink and rmdir.
209 */
210 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
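/*
 * Illustrative sketch (not from this file) of how a caller bounds its
 * retries around such an authorization hook; the helper name
 * authorize_once() is hypothetical:
 *
 *	int retries = 0;
 *	do {
 *		error = authorize_once(vp, cnp, ctx);
 *	} while (error == ENOENT &&
 *	    ++retries < MAX_AUTHORIZE_ENOENT_RETRIES);
 */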
211
212 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
213 int unlink_flags);
214
215 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
216
217 #ifdef CONFIG_IMGSRC_ACCESS
218 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
219 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
220 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
221 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
222 static void mount_end_update(mount_t mp);
223 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
224 #endif /* CONFIG_IMGSRC_ACCESS */
225
226 #if CONFIG_LOCKERBOOT
227 int mount_locker_protoboot(const char *fsname, const char *mntpoint,
228 const char *pbdevpath);
229 #endif
230
231 //snapshot functions
232 #if CONFIG_MNT_ROOTSNAP
233 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
234 #else
235 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
236 #endif
237
238 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
239
240 __private_extern__
241 int sync_internal(void);
242
243 __private_extern__
244 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
245
246 extern lck_grp_t *fd_vn_lck_grp;
247 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
248 extern lck_attr_t *fd_vn_lck_attr;
249
250 /*
251 * incremented each time a mount or unmount operation occurs
252 * used to invalidate the cached value of the rootvp in the
253 * mount structure utilized by cache_lookup_path
254 */
255 uint32_t mount_generation = 0;
256
257 /* counts number of mount and unmount operations */
258 unsigned int vfs_nummntops = 0;
259
260 extern const struct fileops vnops;
261 #if CONFIG_APPLEDOUBLE
262 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
263 #endif /* CONFIG_APPLEDOUBLE */
264
265 /*
266 * Virtual File System System Calls
267 */
268
269 #if NFSCLIENT || DEVFS || ROUTEFS
270 /*
271  * Private in-kernel mounting SPI (NFS only, not exported)
272 */
273 __private_extern__
274 boolean_t
275 vfs_iskernelmount(mount_t mp)
276 {
277 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
278 }
279
280 __private_extern__
281 int
282 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
283 void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
284 {
285 struct nameidata nd;
286 boolean_t did_namei;
287 int error;
288
289 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
290 UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
291
292 /*
293 * Get the vnode to be covered if it's not supplied
294 */
295 if (vp == NULLVP) {
296 error = namei(&nd);
297 if (error) {
298 if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_DATAVOL)) {
299 printf("failed to locate mount-on path: %s ", path);
300 }
301 return error;
302 }
303 vp = nd.ni_vp;
304 pvp = nd.ni_dvp;
305 did_namei = TRUE;
306 } else {
307 char *pnbuf = CAST_DOWN(char *, path);
308
309 nd.ni_cnd.cn_pnbuf = pnbuf;
310 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
311 did_namei = FALSE;
312 }
313
314 error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
315 syscall_flags, kern_flags, NULL, TRUE, ctx);
316
317 if (did_namei) {
318 vnode_put(vp);
319 vnode_put(pvp);
320 nameidone(&nd);
321 }
322
323 return error;
324 }
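/*
 * Illustrative only: an in-kernel client would call this SPI roughly as
 *
 *	error = kernel_mount("devfs", NULLVP, NULLVP, "/dev", NULL, 0,
 *	    MNT_DONTBROWSE, KERNEL_MOUNT_NOAUTH, vfs_context_kernel());
 *
 * The flag combination shown is an assumption, not taken from a real
 * call site.
 */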
325 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
326
327 /*
328 * Mount a file system.
329 */
330 /* ARGSUSED */
331 int
332 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
333 {
334 struct __mac_mount_args muap;
335
336 muap.type = uap->type;
337 muap.path = uap->path;
338 muap.flags = uap->flags;
339 muap.data = uap->data;
340 muap.mac_p = USER_ADDR_NULL;
341 return __mac_mount(p, &muap, retval);
342 }
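/*
 * Illustrative only: user space reaches this entry point through the
 * mount(2) wrapper, e.g.
 *
 *	if (mount("hfs", "/Volumes/Example", MNT_RDONLY, &args) == -1)
 *		perror("mount");
 *
 * where `args` is the filesystem-specific mount arguments structure; its
 * layout is an assumption here and differs per filesystem.
 */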
343
344 int
345 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
346 {
347 struct componentname cn;
348 vfs_context_t ctx = vfs_context_current();
349 size_t dummy = 0;
350 int error;
351 int flags = uap->flags;
352 char fstypename[MFSNAMELEN];
353 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
354 vnode_t pvp;
355 vnode_t vp;
356
357 AUDIT_ARG(fd, uap->fd);
358 AUDIT_ARG(fflags, flags);
359 /* fstypename will get audited by mount_common */
360
361 /* Sanity check the flags */
362 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
363 return ENOTSUP;
364 }
365
366 if (flags & MNT_UNION) {
367 return EPERM;
368 }
369
370 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
371 if (error) {
372 return error;
373 }
374
375 if ((error = file_vnode(uap->fd, &vp)) != 0) {
376 return error;
377 }
378
379 if ((error = vnode_getwithref(vp)) != 0) {
380 file_drop(uap->fd);
381 return error;
382 }
383
384 pvp = vnode_getparent(vp);
385 if (pvp == NULL) {
386 vnode_put(vp);
387 file_drop(uap->fd);
388 return EINVAL;
389 }
390
391 memset(&cn, 0, sizeof(struct componentname));
392 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
393 cn.cn_pnlen = MAXPATHLEN;
394
395 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
396 FREE(cn.cn_pnbuf, M_TEMP);
397 vnode_put(pvp);
398 vnode_put(vp);
399 file_drop(uap->fd);
400 return error;
401 }
402
403 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
404
405 FREE(cn.cn_pnbuf, M_TEMP);
406 vnode_put(pvp);
407 vnode_put(vp);
408 file_drop(uap->fd);
409
410 return error;
411 }
412
413 void
414 vfs_notify_mount(vnode_t pdvp)
415 {
416 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
417 lock_vnode_and_post(pdvp, NOTE_WRITE);
418 }
419
420 /*
421 * __mac_mount:
422 * Mount a file system taking into account MAC label behavior.
423 * See mount(2) man page for more information
424 *
425 * Parameters: p Process requesting the mount
426 * uap User argument descriptor (see below)
427 * retval (ignored)
428 *
429 * Indirect: uap->type Filesystem type
430 * uap->path Path to mount
431 * uap->data Mount arguments
432 * uap->mac_p MAC info
433 * uap->flags Mount flags
434 *
435 *
436 * Returns: 0 Success
437 * !0 Not success
438 */
439 boolean_t root_fs_upgrade_try = FALSE;
440
441 int
442 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
443 {
444 vnode_t pvp = NULL;
445 vnode_t vp = NULL;
446 int need_nameidone = 0;
447 vfs_context_t ctx = vfs_context_current();
448 char fstypename[MFSNAMELEN];
449 struct nameidata nd;
450 size_t dummy = 0;
451 char *labelstr = NULL;
452 int flags = uap->flags;
453 int error;
454 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
455 boolean_t is_64bit = IS_64BIT_PROCESS(p);
456 #else
457 #pragma unused(p)
458 #endif
459 /*
460 * Get the fs type name from user space
461 */
462 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
463 if (error) {
464 return error;
465 }
466
467 /*
468 * Get the vnode to be covered
469 */
470 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
471 UIO_USERSPACE, uap->path, ctx);
472 error = namei(&nd);
473 if (error) {
474 goto out;
475 }
476 need_nameidone = 1;
477 vp = nd.ni_vp;
478 pvp = nd.ni_dvp;
479
480 #ifdef CONFIG_IMGSRC_ACCESS
481 /* Mounting image source cannot be batched with other operations */
482 if (flags == MNT_IMGSRC_BY_INDEX) {
483 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
484 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
485 goto out;
486 }
487 #endif /* CONFIG_IMGSRC_ACCESS */
488
489 #if CONFIG_MACF
490 /*
491 * Get the label string (if any) from user space
492 */
493 if (uap->mac_p != USER_ADDR_NULL) {
494 struct user_mac mac;
495 size_t ulen = 0;
496
497 if (is_64bit) {
498 struct user64_mac mac64;
499 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
500 mac.m_buflen = mac64.m_buflen;
501 mac.m_string = mac64.m_string;
502 } else {
503 struct user32_mac mac32;
504 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
505 mac.m_buflen = mac32.m_buflen;
506 mac.m_string = mac32.m_string;
507 }
508 if (error) {
509 goto out;
510 }
511 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
512 (mac.m_buflen < 2)) {
513 error = EINVAL;
514 goto out;
515 }
516 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
517 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
518 if (error) {
519 goto out;
520 }
521 AUDIT_ARG(mac_string, labelstr);
522 }
523 #endif /* CONFIG_MACF */
524
525 AUDIT_ARG(fflags, flags);
526
527 #if SECURE_KERNEL
528 if (flags & MNT_UNION) {
529 /* No union mounts on release kernels */
530 error = EPERM;
531 goto out;
532 }
533 #endif
534
535 if ((vp->v_flag & VROOT) &&
536 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
537 if (!(flags & MNT_UNION)) {
538 flags |= MNT_UPDATE;
539 } else {
540 /*
541  * For a union mount on '/', treat it as a fresh
542  * mount instead of an update.
543  * Otherwise, union mounting on '/' used to panic the
544  * system, since mnt_vnodecovered was found to
545  * be NULL for '/', which is required for unionlookup
546  * after it gets ENOENT on a union mount.
547 */
548 flags = (flags & ~(MNT_UPDATE));
549 }
550
551 #if SECURE_KERNEL
552 if ((flags & MNT_RDONLY) == 0) {
553 /* Release kernels are not allowed to mount "/" as rw */
554 error = EPERM;
555 goto out;
556 }
557 #endif
558 /*
559 * See 7392553 for more details on why this check exists.
560 * Suffice to say: If this check is ON and something tries
561 * to mount the rootFS RW, we'll turn off the codesign
562 * bitmap optimization.
563 */
564 #if CHECK_CS_VALIDATION_BITMAP
565 if ((flags & MNT_RDONLY) == 0) {
566 root_fs_upgrade_try = TRUE;
567 }
568 #endif
569 }
570
571 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
572 labelstr, FALSE, ctx);
573
574 out:
575
576 #if CONFIG_MACF
577 if (labelstr) {
578 FREE(labelstr, M_MACTEMP);
579 }
580 #endif /* CONFIG_MACF */
581
582 if (vp) {
583 vnode_put(vp);
584 }
585 if (pvp) {
586 vnode_put(pvp);
587 }
588 if (need_nameidone) {
589 nameidone(&nd);
590 }
591
592 return error;
593 }
594
595 /*
596 * common mount implementation (final stage of mounting)
597 *
598 * Arguments:
599  * fstypename	file system type (i.e. its vfs name)
600  * pvp		parent of covered vnode
601  * vp			covered vnode
602  * cnp			component name (i.e. path) of covered vnode
603 * flags generic mount flags
604 * fsmountargs file system specific data
605 * labelstr optional MAC label
606 * kernelmount TRUE for mounts initiated from inside the kernel
607 * ctx caller's context
608 */
609 static int
610 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
611 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
612 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
613 {
614 #if !CONFIG_MACF
615 #pragma unused(labelstr)
616 #endif
617 struct vnode *devvp = NULLVP;
618 struct vnode *device_vnode = NULLVP;
619 #if CONFIG_MACF
620 struct vnode *rvp;
621 #endif
622 struct mount *mp;
623 struct vfstable *vfsp = (struct vfstable *)0;
624 struct proc *p = vfs_context_proc(ctx);
625 int error, flag = 0;
626 user_addr_t devpath = USER_ADDR_NULL;
627 int ronly = 0;
628 int mntalloc = 0;
629 boolean_t vfsp_ref = FALSE;
630 boolean_t is_rwlock_locked = FALSE;
631 boolean_t did_rele = FALSE;
632 boolean_t have_usecount = FALSE;
633
634 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM
635 /* Check for mutually-exclusive flag bits */
636 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL));
637 int bitcount = 0;
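/*
 * Count the set mount-by-role bits with the x &= (x - 1) trick, which
 * clears the lowest set bit on each pass; a count greater than one means
 * the caller asked for more than one role at once.
 */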
638 while (checkflags != 0) {
639 checkflags &= (checkflags - 1);
640 bitcount++;
641 }
642
643 if (bitcount > 1) {
644 //not allowed to request multiple mount-by-role flags
645 error = EINVAL;
646 goto out1;
647 }
648 #endif
649
650 /*
651 * Process an update for an existing mount
652 */
653 if (flags & MNT_UPDATE) {
654 if ((vp->v_flag & VROOT) == 0) {
655 error = EINVAL;
656 goto out1;
657 }
658 mp = vp->v_mount;
659
660 /* unmount in progress; return error */
661 mount_lock_spin(mp);
662 if (mp->mnt_lflag & MNT_LUNMOUNT) {
663 mount_unlock(mp);
664 error = EBUSY;
665 goto out1;
666 }
667 mount_unlock(mp);
668 lck_rw_lock_exclusive(&mp->mnt_rwlock);
669 is_rwlock_locked = TRUE;
670 /*
671 * We only allow the filesystem to be reloaded if it
672 * is currently mounted read-only.
673 */
674 if ((flags & MNT_RELOAD) &&
675 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
676 error = ENOTSUP;
677 goto out1;
678 }
679
680 /*
681 * If content protection is enabled, update mounts are not
682 * allowed to turn it off.
683 */
684 if ((mp->mnt_flag & MNT_CPROTECT) &&
685 ((flags & MNT_CPROTECT) == 0)) {
686 error = EINVAL;
687 goto out1;
688 }
689
690 /*
691  * MNT_REMOVABLE can't be turned off either, but returning an error
692  * for that would be an unexpected failure, so we just silently
693  * add it if it is not passed in.
694 */
695 if ((mp->mnt_flag & MNT_REMOVABLE) &&
696 ((flags & MNT_REMOVABLE) == 0)) {
697 flags |= MNT_REMOVABLE;
698 }
699
700 #ifdef CONFIG_IMGSRC_ACCESS
701 /* Can't downgrade the backer of the root FS */
702 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
703 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
704 error = ENOTSUP;
705 goto out1;
706 }
707 #endif /* CONFIG_IMGSRC_ACCESS */
708
709 /*
710 * Only root, or the user that did the original mount is
711 * permitted to update it.
712 */
713 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
714 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
715 goto out1;
716 }
717 #if CONFIG_MACF
718 error = mac_mount_check_remount(ctx, mp);
719 if (error != 0) {
720 goto out1;
721 }
722 #endif
723 /*
724 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
725 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
726 */
727 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
728 flags |= MNT_NOSUID | MNT_NODEV;
729 if (mp->mnt_flag & MNT_NOEXEC) {
730 flags |= MNT_NOEXEC;
731 }
732 }
733 flag = mp->mnt_flag;
734
735
736
737 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
738
739 vfsp = mp->mnt_vtable;
740 goto update;
741 } // MNT_UPDATE
742
743 /*
744 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
745 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
746 */
747 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
748 flags |= MNT_NOSUID | MNT_NODEV;
749 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
750 flags |= MNT_NOEXEC;
751 }
752 }
753
754 /* XXXAUDIT: Should we capture the type on the error path as well? */
755 AUDIT_ARG(text, fstypename);
756 mount_list_lock();
757 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
758 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
759 vfsp->vfc_refcount++;
760 vfsp_ref = TRUE;
761 break;
762 }
763 }
764 mount_list_unlock();
765 if (vfsp == NULL) {
766 error = ENODEV;
767 goto out1;
768 }
769
770 /*
771 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
772 * except in ROSV configs.
773 */
774 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
775 ((internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)) == 0)) {
776 error = EINVAL; /* unsupported request */
777 goto out1;
778 }
779
780 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
781 if (error != 0) {
782 goto out1;
783 }
784
785 /*
786 * Allocate and initialize the filesystem (mount_t)
787 */
788 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
789 M_MOUNT, M_WAITOK);
790 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
791 mntalloc = 1;
792
793 /* Initialize the default IO constraints */
794 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
795 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
796 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
797 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
798 mp->mnt_devblocksize = DEV_BSIZE;
799 mp->mnt_alignmentmask = PAGE_MASK;
800 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
801 mp->mnt_ioscale = 1;
802 mp->mnt_ioflags = 0;
803 mp->mnt_realrootvp = NULLVP;
804 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
805
806 TAILQ_INIT(&mp->mnt_vnodelist);
807 TAILQ_INIT(&mp->mnt_workerqueue);
808 TAILQ_INIT(&mp->mnt_newvnodes);
809 mount_lock_init(mp);
810 lck_rw_lock_exclusive(&mp->mnt_rwlock);
811 is_rwlock_locked = TRUE;
812 mp->mnt_op = vfsp->vfc_vfsops;
813 mp->mnt_vtable = vfsp;
814 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
815 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
816 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
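/*
 * The do { } while (0) below is only a scope for pathlen: prefer the
 * kernel-resolved path for f_mntonname, and fall back to the
 * caller-supplied componentname buffer if vn_getpath_ext() fails.
 */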
817 do {
818 int pathlen = MAXPATHLEN;
819
820 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
821 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
822 }
823 } while (0);
824 mp->mnt_vnodecovered = vp;
825 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
826 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
827 mp->mnt_devbsdunit = 0;
828
829 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
830 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
831
832 #if NFSCLIENT || DEVFS || ROUTEFS
833 if (kernelmount) {
834 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
835 }
836 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
837 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
838 }
839 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
840
841 update:
842
843 /*
844 * Set the mount level flags.
845 */
846 if (flags & MNT_RDONLY) {
847 mp->mnt_flag |= MNT_RDONLY;
848 } else if (mp->mnt_flag & MNT_RDONLY) {
849 // disallow read/write upgrades of file systems that
850 // had the TYPENAME_OVERRIDE feature set.
851 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
852 error = EPERM;
853 goto out1;
854 }
855 mp->mnt_kern_flag |= MNTK_WANTRDWR;
856 }
857 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
858 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
859 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
860 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
861 MNT_QUARANTINE | MNT_CPROTECT);
862
863 #if SECURE_KERNEL
864 #if !CONFIG_MNT_SUID
865 /*
866 * On release builds of iOS based platforms, always enforce NOSUID on
867 * all mounts. We do this here because we can catch update mounts as well as
868 * non-update mounts in this case.
869 */
870 mp->mnt_flag |= (MNT_NOSUID);
871 #endif
872 #endif
873
874 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
875 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
876 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
877 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
878 MNT_QUARANTINE | MNT_CPROTECT);
879
880 #if CONFIG_MACF
881 if (flags & MNT_MULTILABEL) {
882 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
883 error = EINVAL;
884 goto out1;
885 }
886 mp->mnt_flag |= MNT_MULTILABEL;
887 }
888 #endif
889 /*
890 * Process device path for local file systems if requested
891 */
892 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
893 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL))) {
894 //snapshot, vm, datavolume mounts are special
895 if (vfs_context_is64bit(ctx)) {
896 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
897 goto out1;
898 }
899 fsmountargs += sizeof(devpath);
900 } else {
901 user32_addr_t tmp;
902 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
903 goto out1;
904 }
905 /* munge into LP64 addr */
906 devpath = CAST_USER_ADDR_T(tmp);
907 fsmountargs += sizeof(tmp);
908 }
909
910 /* Lookup device and authorize access to it */
911 if ((devpath)) {
912 struct nameidata nd;
913
914 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
915 if ((error = namei(&nd))) {
916 goto out1;
917 }
918
919 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
920 devvp = nd.ni_vp;
921
922 nameidone(&nd);
923
924 if (devvp->v_type != VBLK) {
925 error = ENOTBLK;
926 goto out2;
927 }
928 if (major(devvp->v_rdev) >= nblkdev) {
929 error = ENXIO;
930 goto out2;
931 }
932 /*
933 * If mount by non-root, then verify that user has necessary
934 * permissions on the device.
935 */
936 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
937 mode_t accessmode = KAUTH_VNODE_READ_DATA;
938
939 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
940 accessmode |= KAUTH_VNODE_WRITE_DATA;
941 }
942 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
943 goto out2;
944 }
945 }
946 }
947 /* On first mount, preflight and open device */
948 if (devpath && ((flags & MNT_UPDATE) == 0)) {
949 if ((error = vnode_ref(devvp))) {
950 goto out2;
951 }
952 /*
953 * Disallow multiple mounts of the same device.
954 * Disallow mounting of a device that is currently in use
955 * (except for root, which might share swap device for miniroot).
956 * Flush out any old buffers remaining from a previous use.
957 */
958 if ((error = vfs_mountedon(devvp))) {
959 goto out3;
960 }
961
962 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
963 error = EBUSY;
964 goto out3;
965 }
966 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
967 error = ENOTBLK;
968 goto out3;
969 }
970 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
971 goto out3;
972 }
973
974 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
975 #if CONFIG_MACF
976 error = mac_vnode_check_open(ctx,
977 devvp,
978 ronly ? FREAD : FREAD | FWRITE);
979 if (error) {
980 goto out3;
981 }
982 #endif /* MAC */
983 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
984 goto out3;
985 }
986
987 mp->mnt_devvp = devvp;
988 device_vnode = devvp;
989 } else if ((mp->mnt_flag & MNT_RDONLY) &&
990 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
991 (device_vnode = mp->mnt_devvp)) {
992 dev_t dev;
993 int maj;
994 /*
995 * If upgrade to read-write by non-root, then verify
996 * that user has necessary permissions on the device.
997 */
998 vnode_getalways(device_vnode);
999
1000 if (suser(vfs_context_ucred(ctx), NULL) &&
1001 (error = vnode_authorize(device_vnode, NULL,
1002 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1003 ctx)) != 0) {
1004 vnode_put(device_vnode);
1005 goto out2;
1006 }
1007
1008 /* Tell the device that we're upgrading */
1009 dev = (dev_t)device_vnode->v_rdev;
1010 maj = major(dev);
1011
1012 if ((u_int)maj >= (u_int)nblkdev) {
1013 panic("Volume mounted on a device with invalid major number.");
1014 }
1015
1016 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1017 vnode_put(device_vnode);
1018 device_vnode = NULLVP;
1019 if (error != 0) {
1020 goto out2;
1021 }
1022 }
1023 } // localargs && !(snapshot | data | vm)
1024
1025 #if CONFIG_MACF
1026 if ((flags & MNT_UPDATE) == 0) {
1027 mac_mount_label_init(mp);
1028 mac_mount_label_associate(ctx, mp);
1029 }
1030 if (labelstr) {
1031 if ((flags & MNT_UPDATE) != 0) {
1032 error = mac_mount_check_label_update(ctx, mp);
1033 if (error != 0) {
1034 goto out3;
1035 }
1036 }
1037 }
1038 #endif
1039 /*
1040 * Mount the filesystem. We already asserted that internal_flags
1041 * cannot have more than one mount-by-role bit set.
1042 */
1043 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1044 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1045 (caddr_t)fsmountargs, 0, ctx);
1046 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1047 #if CONFIG_ROSV_STARTUP
1048 struct mount *origin_mp = (struct mount*)fsmountargs;
1049 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1050 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1051 if (error) {
1052 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1053 } else {
1054 /* Mark volume associated with system volume */
1055 mp->mnt_kern_flag |= MNTK_SYSTEM;
1056
1057 /* Attempt to acquire the mnt_devvp and set it up */
1058 struct vnode *mp_devvp = NULL;
1059 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1060 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1061 0, &mp_devvp, vfs_context_kernel());
1062 if (!lerr) {
1063 mp->mnt_devvp = mp_devvp;
1064 //vnode_lookup took an iocount, need to drop it.
1065 vnode_put(mp_devvp);
1066 // now set `device_vnode` to the devvp that was acquired.
1067 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1068 // note that though the iocount above was dropped, the mount acquires
1069 // an implicit reference against the device.
1070 device_vnode = mp_devvp;
1071 }
1072 }
1073 }
1074 #else
1075 error = EINVAL;
1076 #endif
1077 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1078 #if CONFIG_MOUNT_VM
1079 struct mount *origin_mp = (struct mount*)fsmountargs;
1080 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1081 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1082 if (error) {
1083 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1084 } else {
1085 /* Mark volume associated with system volume and a swap mount */
1086 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1087 /* Attempt to acquire the mnt_devvp and set it up */
1088 struct vnode *mp_devvp = NULL;
1089 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1090 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1091 0, &mp_devvp, vfs_context_kernel());
1092 if (!lerr) {
1093 mp->mnt_devvp = mp_devvp;
1094 //vnode_lookup took an iocount, need to drop it.
1095 vnode_put(mp_devvp);
1096
1097 // now set `device_vnode` to the devvp that was acquired.
1098 // note that though the iocount above was dropped, the mount acquires
1099 // an implicit reference against the device.
1100 device_vnode = mp_devvp;
1101 }
1102 }
1103 }
1104 #else
1105 error = EINVAL;
1106 #endif
1107 } else {
1108 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1109 }
1110
1111 if (flags & MNT_UPDATE) {
1112 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1113 mp->mnt_flag &= ~MNT_RDONLY;
1114 }
1115 mp->mnt_flag &= ~
1116 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1117 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1118 if (error) {
1119 mp->mnt_flag = flag; /* restore flag value */
1120 }
1121 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1122 lck_rw_done(&mp->mnt_rwlock);
1123 is_rwlock_locked = FALSE;
1124 if (!error) {
1125 enablequotas(mp, ctx);
1126 }
1127 goto exit;
1128 }
1129
1130 /*
1131 * Put the new filesystem on the mount list after root.
1132 */
1133 if (error == 0) {
1134 struct vfs_attr vfsattr;
1135 #if CONFIG_MACF
1136 error = mac_mount_check_mount_late(ctx, mp);
1137 if (error != 0) {
1138 goto out3;
1139 }
1140
1141 if (vfs_flags(mp) & MNT_MULTILABEL) {
1142 error = VFS_ROOT(mp, &rvp, ctx);
1143 if (error) {
1144 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1145 goto out3;
1146 }
1147 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1148 /*
1149 * drop reference provided by VFS_ROOT
1150 */
1151 vnode_put(rvp);
1152
1153 if (error) {
1154 goto out3;
1155 }
1156 }
1157 #endif /* MAC */
1158
1159 vnode_lock_spin(vp);
1160 CLR(vp->v_flag, VMOUNT);
1161 vp->v_mountedhere = mp;
1162 vnode_unlock(vp);
1163
1164 /*
1165 * taking the name_cache_lock exclusively will
1166	 * ensure that everyone is out of the fast path who
1167 * might be trying to use a now stale copy of
1168 * vp->v_mountedhere->mnt_realrootvp
1169 * bumping mount_generation causes the cached values
1170 * to be invalidated
1171 */
1172 name_cache_lock();
1173 mount_generation++;
1174 name_cache_unlock();
1175
1176 error = vnode_ref(vp);
1177 if (error != 0) {
1178 goto out4;
1179 }
1180
1181 have_usecount = TRUE;
1182
1183 error = checkdirs(vp, ctx);
1184 if (error != 0) {
1185 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1186 goto out4;
1187 }
1188 /*
1189		 * there is no cleanup code here, so the return value is cast to void;
1190		 * we need to revisit this
1191 */
1192 (void)VFS_START(mp, 0, ctx);
1193
1194 if (mount_list_add(mp) != 0) {
1195 /*
1196 * The system is shutting down trying to umount
1197 * everything, so fail with a plausible errno.
1198 */
1199 error = EBUSY;
1200 goto out4;
1201 }
1202 lck_rw_done(&mp->mnt_rwlock);
1203 is_rwlock_locked = FALSE;
1204
1205 /* Check if this mounted file system supports EAs or named streams. */
1206 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1207 VFSATTR_INIT(&vfsattr);
1208 VFSATTR_WANTED(&vfsattr, f_capabilities);
1209 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1210 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1211 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1212 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1213 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1214 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1215 }
1216 #if NAMEDSTREAMS
1217 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1218 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1219 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1220 }
1221 #endif
1222 /* Check if this file system supports path from id lookups. */
1223 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1224 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1225 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1226 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1227 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1228 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1229 }
1230
1231 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1232 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1233 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1234 }
1235 }
1236 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1237 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1238 }
1239 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1240 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1241 }
1242 /* increment the operations count */
1243 OSAddAtomic(1, &vfs_nummntops);
1244 enablequotas(mp, ctx);
1245
1246 if (device_vnode) {
1247 device_vnode->v_specflags |= SI_MOUNTEDON;
1248
1249 /*
1250 * cache the IO attributes for the underlying physical media...
1251 * an error return indicates the underlying driver doesn't
1252 * support all the queries necessary... however, reasonable
1253 * defaults will have been set, so no reason to bail or care
1254 */
1255 vfs_init_io_attributes(device_vnode, mp);
1256 }
1257
1258 /* Now that mount is setup, notify the listeners */
1259 vfs_notify_mount(pvp);
1260 IOBSDMountChange(mp, kIOMountChangeMount);
1261 } else {
1262 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1263 if (mp->mnt_vnodelist.tqh_first != NULL) {
1264 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1265 mp->mnt_vtable->vfc_name, error);
1266 }
1267
1268 vnode_lock_spin(vp);
1269 CLR(vp->v_flag, VMOUNT);
1270 vnode_unlock(vp);
1271 mount_list_lock();
1272 mp->mnt_vtable->vfc_refcount--;
1273 mount_list_unlock();
1274
1275 if (device_vnode) {
1276 vnode_rele(device_vnode);
1277 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1278 }
1279 lck_rw_done(&mp->mnt_rwlock);
1280 is_rwlock_locked = FALSE;
1281
1282 /*
1283 * if we get here, we have a mount structure that needs to be freed,
1284 * but since the coveredvp hasn't yet been updated to point at it,
1285 * no need to worry about other threads holding a crossref on this mp
1286 * so it's ok to just free it
1287 */
1288 mount_lock_destroy(mp);
1289 #if CONFIG_MACF
1290 mac_mount_label_destroy(mp);
1291 #endif
1292 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1293 }
1294 exit:
1295 /*
1296 * drop I/O count on the device vp if there was one
1297 */
1298 if (devpath && devvp) {
1299 vnode_put(devvp);
1300 }
1301
1302 return error;
1303
1304 /* Error condition exits */
1305 out4:
1306 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1307
1308 /*
1309 * If the mount has been placed on the covered vp,
1310 * it may have been discovered by now, so we have
1311 * to treat this just like an unmount
1312 */
1313 mount_lock_spin(mp);
1314 mp->mnt_lflag |= MNT_LDEAD;
1315 mount_unlock(mp);
1316
1317 if (device_vnode != NULLVP) {
1318 vnode_rele(device_vnode);
1319 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1320 ctx);
1321 did_rele = TRUE;
1322 }
1323
1324 vnode_lock_spin(vp);
1325
1326 mp->mnt_crossref++;
1327 vp->v_mountedhere = (mount_t) 0;
1328
1329 vnode_unlock(vp);
1330
1331 if (have_usecount) {
1332 vnode_rele(vp);
1333 }
1334 out3:
1335 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1336 vnode_rele(devvp);
1337 }
1338 out2:
1339 if (devpath && devvp) {
1340 vnode_put(devvp);
1341 }
1342 out1:
1343 /* Release mnt_rwlock only when it was taken */
1344 if (is_rwlock_locked == TRUE) {
1345 lck_rw_done(&mp->mnt_rwlock);
1346 }
1347
1348 if (mntalloc) {
1349 if (mp->mnt_crossref) {
1350 mount_dropcrossref(mp, vp, 0);
1351 } else {
1352 mount_lock_destroy(mp);
1353 #if CONFIG_MACF
1354 mac_mount_label_destroy(mp);
1355 #endif
1356 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1357 }
1358 }
1359 if (vfsp_ref) {
1360 mount_list_lock();
1361 vfsp->vfc_refcount--;
1362 mount_list_unlock();
1363 }
1364
1365 return error;
1366 }
1367
1368 /*
1369 * Flush in-core data, check for competing mount attempts,
1370 * and set VMOUNT
1371 */
1372 int
1373 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1374 {
1375 #if !CONFIG_MACF
1376 #pragma unused(cnp,fsname)
1377 #endif
1378 struct vnode_attr va;
1379 int error;
1380
1381 if (!skip_auth) {
1382 /*
1383 * If the user is not root, ensure that they own the directory
1384 * onto which we are attempting to mount.
1385 */
1386 VATTR_INIT(&va);
1387 VATTR_WANTED(&va, va_uid);
1388 if ((error = vnode_getattr(vp, &va, ctx)) ||
1389 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1390 (!vfs_context_issuser(ctx)))) {
1391 error = EPERM;
1392 goto out;
1393 }
1394 }
1395
1396 if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
1397 goto out;
1398 }
1399
1400 if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
1401 goto out;
1402 }
1403
1404 if (vp->v_type != VDIR) {
1405 error = ENOTDIR;
1406 goto out;
1407 }
1408
1409 if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1410 error = EBUSY;
1411 goto out;
1412 }
1413
1414 #if CONFIG_MACF
1415 error = mac_mount_check_mount(ctx, vp,
1416 cnp, fsname);
1417 if (error != 0) {
1418 goto out;
1419 }
1420 #endif
1421
1422 vnode_lock_spin(vp);
1423 SET(vp->v_flag, VMOUNT);
1424 vnode_unlock(vp);
1425
1426 out:
1427 return error;
1428 }
1429
1430 #if CONFIG_IMGSRC_ACCESS
1431
1432 #define DEBUG_IMGSRC 0
1433
1434 #if DEBUG_IMGSRC
1435 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1436 #else
1437 #define IMGSRC_DEBUG(args...) do { } while(0)
1438 #endif
1439
1440 static int
1441 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1442 {
1443 struct nameidata nd;
1444 vnode_t vp, realdevvp;
1445 mode_t accessmode;
1446 int error;
1447 enum uio_seg uio = UIO_USERSPACE;
1448
1449 if (ctx == vfs_context_kernel()) {
1450 uio = UIO_SYSSPACE;
1451 }
1452
1453 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1454 if ((error = namei(&nd))) {
1455 IMGSRC_DEBUG("namei() failed with %d\n", error);
1456 return error;
1457 }
1458
1459 vp = nd.ni_vp;
1460
1461 if (!vnode_isblk(vp)) {
1462 IMGSRC_DEBUG("Not block device.\n");
1463 error = ENOTBLK;
1464 goto out;
1465 }
1466
1467 realdevvp = mp->mnt_devvp;
1468 if (realdevvp == NULLVP) {
1469 IMGSRC_DEBUG("No device backs the mount.\n");
1470 error = ENXIO;
1471 goto out;
1472 }
1473
1474 error = vnode_getwithref(realdevvp);
1475 if (error != 0) {
1476		IMGSRC_DEBUG("Couldn't get iocount on device.\n");
1477 goto out;
1478 }
1479
1480 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1481 IMGSRC_DEBUG("Wrong dev_t.\n");
1482 error = ENXIO;
1483 goto out1;
1484 }
1485
1486 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1487
1488 /*
1489 * If mount by non-root, then verify that user has necessary
1490 * permissions on the device.
1491 */
1492 if (!vfs_context_issuser(ctx)) {
1493 accessmode = KAUTH_VNODE_READ_DATA;
1494 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1495 accessmode |= KAUTH_VNODE_WRITE_DATA;
1496 }
1497 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1498 IMGSRC_DEBUG("Access denied.\n");
1499 goto out1;
1500 }
1501 }
1502
1503 *devvpp = vp;
1504
1505 out1:
1506 vnode_put(realdevvp);
1507
1508 out:
1509 nameidone(&nd);
1510
1511 if (error) {
1512 vnode_put(vp);
1513 }
1514
1515 return error;
1516 }
1517
1518 /*
1519 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1520 * and call checkdirs()
1521 */
1522 static int
1523 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1524 {
1525 int error;
1526
1527 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1528
1529 IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
1530 mp->mnt_vtable->vfc_name, vnode_getname(vp));
1531
1532 vnode_lock_spin(vp);
1533 CLR(vp->v_flag, VMOUNT);
1534 vp->v_mountedhere = mp;
1535 vnode_unlock(vp);
1536
1537 /*
1538 * taking the name_cache_lock exclusively will
1539	 * ensure that everyone is out of the fast path who
1540 * might be trying to use a now stale copy of
1541 * vp->v_mountedhere->mnt_realrootvp
1542 * bumping mount_generation causes the cached values
1543 * to be invalidated
1544 */
1545 name_cache_lock();
1546 mount_generation++;
1547 name_cache_unlock();
1548
1549 error = vnode_ref(vp);
1550 if (error != 0) {
1551 goto out;
1552 }
1553
1554 error = checkdirs(vp, ctx);
1555 if (error != 0) {
1556 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1557 vnode_rele(vp);
1558 goto out;
1559 }
1560
1561 out:
1562 if (error != 0) {
1563 mp->mnt_vnodecovered = NULLVP;
1564 }
1565 return error;
1566 }
1567
1568 static void
1569 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1570 {
1571 vnode_rele(vp);
1572 vnode_lock_spin(vp);
1573 vp->v_mountedhere = (mount_t)NULL;
1574 vnode_unlock(vp);
1575
1576 mp->mnt_vnodecovered = NULLVP;
1577 }
1578
1579 static int
1580 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1581 {
1582 int error;
1583
1584	/* unmount in progress; return error */
1585 mount_lock_spin(mp);
1586 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1587 mount_unlock(mp);
1588 return EBUSY;
1589 }
1590 mount_unlock(mp);
1591 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1592
1593 /*
1594 * We only allow the filesystem to be reloaded if it
1595 * is currently mounted read-only.
1596 */
1597 if ((flags & MNT_RELOAD) &&
1598 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1599 error = ENOTSUP;
1600 goto out;
1601 }
1602
1603 /*
1604 * Only root, or the user that did the original mount is
1605 * permitted to update it.
1606 */
1607 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1608 (!vfs_context_issuser(ctx))) {
1609 error = EPERM;
1610 goto out;
1611 }
1612 #if CONFIG_MACF
1613 error = mac_mount_check_remount(ctx, mp);
1614 if (error != 0) {
1615 goto out;
1616 }
1617 #endif
1618
1619 out:
1620 if (error) {
1621 lck_rw_done(&mp->mnt_rwlock);
1622 }
1623
1624 return error;
1625 }
1626
1627 static void
1628 mount_end_update(mount_t mp)
1629 {
1630 lck_rw_done(&mp->mnt_rwlock);
1631 }
1632
1633 static int
1634 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1635 {
1636 vnode_t vp;
1637
1638 if (height >= MAX_IMAGEBOOT_NESTING) {
1639 return EINVAL;
1640 }
1641
1642 vp = imgsrc_rootvnodes[height];
1643 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1644 *rvpp = vp;
1645 return 0;
1646 } else {
1647 return ENOENT;
1648 }
1649 }
1650
1651 static int
1652 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1653 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1654 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1655 {
1656 int error;
1657 mount_t mp;
1658 boolean_t placed = FALSE;
1659 struct vfstable *vfsp;
1660 user_addr_t devpath;
1661 char *old_mntonname;
1662 vnode_t rvp;
1663 vnode_t devvp;
1664 uint32_t height;
1665 uint32_t flags;
1666
1667 /* If we didn't imageboot, nothing to move */
1668 if (imgsrc_rootvnodes[0] == NULLVP) {
1669 return EINVAL;
1670 }
1671
1672 /* Only root can do this */
1673 if (!vfs_context_issuser(ctx)) {
1674 return EPERM;
1675 }
1676
1677 IMGSRC_DEBUG("looking for root vnode.\n");
1678
1679 /*
1680 * Get root vnode of filesystem we're moving.
1681 */
1682 if (by_index) {
1683 if (is64bit) {
1684 struct user64_mnt_imgsrc_args mia64;
1685 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1686 if (error != 0) {
1687 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1688 return error;
1689 }
1690
1691 height = mia64.mi_height;
1692 flags = mia64.mi_flags;
1693 devpath = mia64.mi_devpath;
1694 } else {
1695 struct user32_mnt_imgsrc_args mia32;
1696 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1697 if (error != 0) {
1698 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1699 return error;
1700 }
1701
1702 height = mia32.mi_height;
1703 flags = mia32.mi_flags;
1704 devpath = mia32.mi_devpath;
1705 }
1706 } else {
1707 /*
1708 * For binary compatibility--assumes one level of nesting.
1709 */
1710 if (is64bit) {
1711 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1712 return error;
1713 }
1714 } else {
1715 user32_addr_t tmp;
1716 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1717 return error;
1718 }
1719
1720 /* munge into LP64 addr */
1721 devpath = CAST_USER_ADDR_T(tmp);
1722 }
1723
1724 height = 0;
1725 flags = 0;
1726 }
1727
1728 if (flags != 0) {
1729 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1730 return EINVAL;
1731 }
1732
1733 error = get_imgsrc_rootvnode(height, &rvp);
1734 if (error != 0) {
1735 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1736 return error;
1737 }
1738
1739 IMGSRC_DEBUG("got old root vnode\n");
1740
1741 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1742
1743 /* Can only move once */
1744 mp = vnode_mount(rvp);
1745 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1746 IMGSRC_DEBUG("Already moved.\n");
1747 error = EBUSY;
1748 goto out0;
1749 }
1750
1751 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1752	IMGSRC_DEBUG("Starting update.\n");
1753
1754 /* Get exclusive rwlock on mount, authorize update on mp */
1755 error = mount_begin_update(mp, ctx, 0);
1756 if (error != 0) {
1757		IMGSRC_DEBUG("Starting update failed with %d\n", error);
1758 goto out0;
1759 }
1760
1761 /*
1762 * It can only be moved once. Flag is set under the rwlock,
1763 * so we're now safe to proceed.
1764 */
1765 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1766 IMGSRC_DEBUG("Already moved [2]\n");
1767 goto out1;
1768 }
1769
1770 IMGSRC_DEBUG("Preparing coveredvp.\n");
1771
1772 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1773 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1774 if (error != 0) {
1775 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1776 goto out1;
1777 }
1778
1779 IMGSRC_DEBUG("Covered vp OK.\n");
1780
1781	/* Sanity check the name the caller has provided */
1782 vfsp = mp->mnt_vtable;
1783 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1784 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1785 vfsp->vfc_name, fsname);
1786 error = EINVAL;
1787 goto out2;
1788 }
1789
1790 /* Check the device vnode and update mount-from name, for local filesystems */
1791 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1792 IMGSRC_DEBUG("Local, doing device validation.\n");
1793
1794 if (devpath != USER_ADDR_NULL) {
1795 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1796 if (error) {
1797 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1798 goto out2;
1799 }
1800
1801 vnode_put(devvp);
1802 }
1803 }
1804
1805 /*
1806 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1807 * and increment the name cache's mount generation
1808 */
1809
1810 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1811 error = place_mount_and_checkdirs(mp, vp, ctx);
1812 if (error != 0) {
1813 goto out2;
1814 }
1815
1816 placed = TRUE;
1817
1818 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1819 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1820
1821 /* Forbid future moves */
1822 mount_lock(mp);
1823 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1824 mount_unlock(mp);
1825
1826 /* Finally, add to mount list, completely ready to go */
1827 if (mount_list_add(mp) != 0) {
1828 /*
1829 * The system is shutting down trying to umount
1830 * everything, so fail with a plausible errno.
1831 */
1832 error = EBUSY;
1833 goto out3;
1834 }
1835
1836 mount_end_update(mp);
1837 vnode_put(rvp);
1838 FREE(old_mntonname, M_TEMP);
1839
1840 vfs_notify_mount(pvp);
1841
1842 return 0;
1843 out3:
1844 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1845
1846 mount_lock(mp);
1847 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1848 mount_unlock(mp);
1849
1850 out2:
1851 /*
1852 * Placing the mp on the vnode clears VMOUNT,
1853 * so cleanup is different after that point
1854 */
1855 if (placed) {
1856 /* Rele the vp, clear VMOUNT and v_mountedhere */
1857 undo_place_on_covered_vp(mp, vp);
1858 } else {
1859 vnode_lock_spin(vp);
1860 CLR(vp->v_flag, VMOUNT);
1861 vnode_unlock(vp);
1862 }
1863 out1:
1864 mount_end_update(mp);
1865
1866 out0:
1867 vnode_put(rvp);
1868 FREE(old_mntonname, M_TEMP);
1869 return error;
1870 }
1871
1872 #if CONFIG_LOCKERBOOT
1873 __private_extern__
1874 int
1875 mount_locker_protoboot(const char *fsname, const char *mntpoint,
1876 const char *pbdevpath)
1877 {
1878 int error = -1;
1879 struct nameidata nd;
1880 boolean_t cleanup_nd = FALSE;
1881 vfs_context_t ctx = vfs_context_kernel();
1882 boolean_t is64 = TRUE;
1883 boolean_t by_index = TRUE;
1884 struct user64_mnt_imgsrc_args mia64 = {
1885 .mi_height = 0,
1886 .mi_flags = 0,
1887 .mi_devpath = CAST_USER_ADDR_T(pbdevpath),
1888 };
1889 user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);
1890
1891 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
1892 UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
1893 error = namei(&nd);
1894 if (error) {
1895 IMGSRC_DEBUG("namei: %d\n", error);
1896 goto out;
1897 }
1898
1899 cleanup_nd = TRUE;
1900 error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
1901 &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);
1902
1903 out:
1904 if (cleanup_nd) {
1905 int stashed = error;
1906
1907 error = vnode_put(nd.ni_vp);
1908 if (error) {
1909 panic("vnode_put() returned non-zero: %d", error);
1910 }
1911
1912 if (nd.ni_dvp) {
1913 error = vnode_put(nd.ni_dvp);
1914 if (error) {
1915 panic("vnode_put() returned non-zero: %d", error);
1916 }
1917 }
1918 nameidone(&nd);
1919
1920 error = stashed;
1921 }
1922 return error;
1923 }
1924 #endif /* CONFIG_LOCKERBOOT */
1925 #endif /* CONFIG_IMGSRC_ACCESS */
1926
1927 void
1928 enablequotas(struct mount *mp, vfs_context_t ctx)
1929 {
1930 struct nameidata qnd;
1931 int type;
1932 char qfpath[MAXPATHLEN];
1933 const char *qfname = QUOTAFILENAME;
1934 const char *qfopsname = QUOTAOPSNAME;
1935 const char *qfextension[] = INITQFNAMES;
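/*
 * With the standard <sys/quota.h> definitions the paths built below look
 * like "<mntonname>/.quota.ops.user" (the trigger file probed first) and
 * "<mntonname>/.quota.user" (the file handed to VFS_QUOTACTL); the exact
 * spellings come from QUOTAOPSNAME, QUOTAFILENAME and INITQFNAMES.
 */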
1936
1937	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
1938 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
1939 return;
1940 }
1941 /*
1942 * Enable filesystem disk quotas if necessary.
1943 * We ignore errors as this should not interfere with final mount
1944 */
1945 for (type = 0; type < MAXQUOTAS; type++) {
1946 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1947 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1948 CAST_USER_ADDR_T(qfpath), ctx);
1949 if (namei(&qnd) != 0) {
1950 continue; /* option file to trigger quotas is not present */
1951 }
1952 vnode_put(qnd.ni_vp);
1953 nameidone(&qnd);
1954 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1955
1956 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1957 }
1958 return;
1959 }
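/*
 * Illustrative sketch (not compiled): what the probe above expands to for
 * the user quota type, assuming the usual <sys/quota.h> definitions of
 * QUOTAOPSNAME, QUOTAFILENAME and INITQFNAMES; the exact strings come from
 * that header, not from this file.
 *
 *   trigger file checked with namei(); quotas stay off if it is absent:
 *       <f_mntonname>/.quota.ops.user
 *   quota data file handed down to the filesystem:
 *       (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0,
 *           "<f_mntonname>/.quota.user", ctx);
 */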
1960
1961
1962 static int
1963 checkdirs_callback(proc_t p, void * arg)
1964 {
1965 struct cdirargs * cdrp = (struct cdirargs *)arg;
1966 vnode_t olddp = cdrp->olddp;
1967 vnode_t newdp = cdrp->newdp;
1968 struct filedesc *fdp;
1969 vnode_t new_cvp = newdp;
1970 vnode_t new_rvp = newdp;
1971 vnode_t old_cvp = NULL;
1972 vnode_t old_rvp = NULL;
1973
1974 /*
1975 * XXX Also needs to iterate each thread in the process to see if it
1976 * XXX is using a per-thread current working directory, and, if so,
1977 * XXX update that as well.
1978 */
1979
1980 /*
1981 * First, with the proc_fdlock held, check to see if we will need
1982 * to do any work. If not, we will get out fast.
1983 */
1984 proc_fdlock(p);
1985 fdp = p->p_fd;
1986 if (fdp == NULL ||
1987 (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp)) {
1988 proc_fdunlock(p);
1989 return PROC_RETURNED;
1990 }
1991 proc_fdunlock(p);
1992
1993 /*
1994 * Ok, we will have to do some work. Always take two refs
1995 * because we might need that many. We'll dispose of whatever
1996 * we ended up not using.
1997 */
1998 if (vnode_ref(newdp) != 0) {
1999 return PROC_RETURNED;
2000 }
2001 if (vnode_ref(newdp) != 0) {
2002 vnode_rele(newdp);
2003 return PROC_RETURNED;
2004 }
2005
2006 /*
2007 * Now do the work. Note: we dropped the proc_fdlock, so we
2008 * have to do all of the checks again.
2009 */
2010 proc_fdlock(p);
2011 fdp = p->p_fd;
2012 if (fdp != NULL) {
2013 if (fdp->fd_cdir == olddp) {
2014 old_cvp = olddp;
2015 fdp->fd_cdir = newdp;
2016 new_cvp = NULL;
2017 }
2018 if (fdp->fd_rdir == olddp) {
2019 old_rvp = olddp;
2020 fdp->fd_rdir = newdp;
2021 new_rvp = NULL;
2022 }
2023 }
2024 proc_fdunlock(p);
2025
2026 /*
2027 * Dispose of any references that are no longer needed.
2028 */
2029 if (old_cvp != NULL) {
2030 vnode_rele(old_cvp);
2031 }
2032 if (old_rvp != NULL) {
2033 vnode_rele(old_rvp);
2034 }
2035 if (new_cvp != NULL) {
2036 vnode_rele(new_cvp);
2037 }
2038 if (new_rvp != NULL) {
2039 vnode_rele(new_rvp);
2040 }
2041
2042 return PROC_RETURNED;
2043 }
2044
2045
2046
2047 /*
2048 * Scan all active processes to see if any of them have a current
2049 * or root directory onto which the new filesystem has just been
2050 * mounted. If so, replace them with the new mount point.
2051 */
2052 static int
2053 checkdirs(vnode_t olddp, vfs_context_t ctx)
2054 {
2055 vnode_t newdp;
2056 vnode_t tvp;
2057 int err;
2058 struct cdirargs cdr;
2059
2060 if (olddp->v_usecount == 1) {
2061 return 0;
2062 }
2063 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
2064
2065 if (err != 0) {
2066 #if DIAGNOSTIC
2067 panic("mount: lost mount: error %d", err);
2068 #endif
2069 return err;
2070 }
2071
2072 cdr.olddp = olddp;
2073 cdr.newdp = newdp;
2074 /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
2075 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
2076
2077 if (rootvnode == olddp) {
2078 vnode_ref(newdp);
2079 tvp = rootvnode;
2080 rootvnode = newdp;
2081 vnode_rele(tvp);
2082 }
2083
2084 vnode_put(newdp);
2085 return 0;
2086 }
2087
2088 /*
2089 * Unmount a file system.
2090 *
2091 * Note: unmount takes a path to the vnode mounted on as argument,
2092 * not the special file (as before).
2093 */
2094 /* ARGSUSED */
2095 int
2096 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2097 {
2098 vnode_t vp;
2099 struct mount *mp;
2100 int error;
2101 struct nameidata nd;
2102 vfs_context_t ctx = vfs_context_current();
2103
2104 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2105 UIO_USERSPACE, uap->path, ctx);
2106 error = namei(&nd);
2107 if (error) {
2108 return error;
2109 }
2110 vp = nd.ni_vp;
2111 mp = vp->v_mount;
2112 nameidone(&nd);
2113
2114 #if CONFIG_MACF
2115 error = mac_mount_check_umount(ctx, mp);
2116 if (error != 0) {
2117 vnode_put(vp);
2118 return error;
2119 }
2120 #endif
2121 /*
2122 * Must be the root of the filesystem
2123 */
2124 if ((vp->v_flag & VROOT) == 0) {
2125 vnode_put(vp);
2126 return EINVAL;
2127 }
2128 mount_ref(mp, 0);
2129 vnode_put(vp);
2130 /* safedounmount consumes the mount ref */
2131 return safedounmount(mp, uap->flags, ctx);
2132 }
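/*
 * Userspace usage sketch (hypothetical mount point): per the note above,
 * the argument is the directory the filesystem is mounted on, not the
 * device special file.
 *
 *   #include <sys/mount.h>
 *
 *   if (unmount("/Volumes/External", 0) == -1) {
 *       // EBUSY is the common failure; MNT_FORCE requests a forced unmount
 *       (void) unmount("/Volumes/External", MNT_FORCE);
 *   }
 */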
2133
2134 int
2135 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2136 {
2137 mount_t mp;
2138
2139 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2140 if (mp == (mount_t)0) {
2141 return ENOENT;
2142 }
2143 mount_ref(mp, 0);
2144 mount_iterdrop(mp);
2145 /* safedounmount consumes the mount ref */
2146 return safedounmount(mp, flags, ctx);
2147 }
2148
2149
2150 /*
2151 * The mount struct comes with a mount ref which will be consumed.
2152 * Do the actual file system unmount, preventing some common foot-shooting.
2153 */
2154 int
2155 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
2156 {
2157 int error;
2158 proc_t p = vfs_context_proc(ctx);
2159
2160 /*
2161 * If the file system is not responding, MNT_NOBLOCK is set,
2162 * and this is not a forced unmount, then return EBUSY.
2163 */
2164 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
2165 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
2166 error = EBUSY;
2167 goto out;
2168 }
2169
2170 /*
2171 * Skip authorization if the mount is tagged as permissive and
2172 * this is not a forced-unmount attempt.
2173 */
2174 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
2175 /*
2176 * Only root, or the user that did the original mount is
2177 * permitted to unmount this filesystem.
2178 */
2179 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
2180 (error = suser(kauth_cred_get(), &p->p_acflag))) {
2181 goto out;
2182 }
2183 }
2184 /*
2185 * Don't allow unmounting the root file system (or the associated VM or DATA mounts).
2186 */
2187 if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
2188 error = EBUSY; /* the root (or associated volumes) is always busy */
2189 goto out;
2190 }
2191
2192 #ifdef CONFIG_IMGSRC_ACCESS
2193 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
2194 error = EBUSY;
2195 goto out;
2196 }
2197 #endif /* CONFIG_IMGSRC_ACCESS */
2198
2199 return dounmount(mp, flags, 1, ctx);
2200
2201 out:
2202 mount_drop(mp, 0);
2203 return error;
2204 }
2205
2206 /*
2207 * Do the actual file system unmount.
2208 */
2209 int
2210 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
2211 {
2212 vnode_t coveredvp = (vnode_t)0;
2213 int error;
2214 int needwakeup = 0;
2215 int forcedunmount = 0;
2216 int lflags = 0;
2217 struct vnode *devvp = NULLVP;
2218 #if CONFIG_TRIGGERS
2219 proc_t p = vfs_context_proc(ctx);
2220 int did_vflush = 0;
2221 int pflags_save = 0;
2222 #endif /* CONFIG_TRIGGERS */
2223
2224 #if CONFIG_FSE
2225 if (!(flags & MNT_FORCE)) {
2226 fsevent_unmount(mp, ctx); /* has to come first! */
2227 }
2228 #endif
2229
2230 mount_lock(mp);
2231
2232 /*
2233 * If an unmount is already in progress, just return EBUSY.
2234 * Even a forced unmount cannot override.
2235 */
2236 if (mp->mnt_lflag & MNT_LUNMOUNT) {
2237 if (withref != 0) {
2238 mount_drop(mp, 1);
2239 }
2240 mount_unlock(mp);
2241 return EBUSY;
2242 }
2243
2244 if (flags & MNT_FORCE) {
2245 forcedunmount = 1;
2246 mp->mnt_lflag |= MNT_LFORCE;
2247 }
2248
2249 #if CONFIG_TRIGGERS
2250 if (flags & MNT_NOBLOCK && p != kernproc) {
2251 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2252 }
2253 #endif
2254
2255 mp->mnt_kern_flag |= MNTK_UNMOUNT;
2256 mp->mnt_lflag |= MNT_LUNMOUNT;
2257 mp->mnt_flag &= ~MNT_ASYNC;
2258 /*
2259 * Anyone currently in the fast path who
2260 * trips over the cached rootvp will be
2261 * dumped out and forced into the slow path
2262 * to regenerate a new cached value.
2263 */
2264 mp->mnt_realrootvp = NULLVP;
2265 mount_unlock(mp);
2266
2267 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2268 /*
2269 * Force unmount any mounts in this filesystem.
2270 * If any unmounts fail - just leave them dangling.
2271 * Avoids recursion.
2272 */
2273 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2274 }
2275
2276 /*
2277 * Taking the name_cache_lock exclusively will
2278 * ensure that everyone who might be trying to use
2279 * a now-stale copy of
2280 * vp->v_mountedhere->mnt_realrootvp
2281 * is out of the fast path; bumping mount_generation
2282 * causes the cached values to be invalidated.
2283 */
2284 name_cache_lock();
2285 mount_generation++;
2286 name_cache_unlock();
2287
2288
2289 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2290 if (withref != 0) {
2291 mount_drop(mp, 0);
2292 }
2293 error = 0;
2294 if (forcedunmount == 0) {
2295 ubc_umount(mp); /* release cached vnodes */
2296 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2297 error = VFS_SYNC(mp, MNT_WAIT, ctx);
2298 if (error) {
2299 mount_lock(mp);
2300 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2301 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2302 mp->mnt_lflag &= ~MNT_LFORCE;
2303 goto out;
2304 }
2305 }
2306 }
2307
2308 IOBSDMountChange(mp, kIOMountChangeUnmount);
2309
2310 #if CONFIG_TRIGGERS
2311 vfs_nested_trigger_unmounts(mp, flags, ctx);
2312 did_vflush = 1;
2313 #endif
2314 if (forcedunmount) {
2315 lflags |= FORCECLOSE;
2316 }
2317 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
2318 if ((forcedunmount == 0) && error) {
2319 mount_lock(mp);
2320 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2321 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2322 mp->mnt_lflag &= ~MNT_LFORCE;
2323 goto out;
2324 }
2325
2326 /* make sure no one is in the mount iterations or lookups */
2327 mount_iterdrain(mp);
2328
2329 error = VFS_UNMOUNT(mp, flags, ctx);
2330 if (error) {
2331 mount_iterreset(mp);
2332 mount_lock(mp);
2333 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2334 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2335 mp->mnt_lflag &= ~MNT_LFORCE;
2336 goto out;
2337 }
2338
2339 /* increment the operations count */
2340 if (!error) {
2341 OSAddAtomic(1, &vfs_nummntops);
2342 }
2343
2344 if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2345 /* hold an io reference and drop the usecount before close */
2346 devvp = mp->mnt_devvp;
2347 vnode_getalways(devvp);
2348 vnode_rele(devvp);
2349 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
2350 ctx);
2351 vnode_clearmountedon(devvp);
2352 vnode_put(devvp);
2353 }
2354 lck_rw_done(&mp->mnt_rwlock);
2355 mount_list_remove(mp);
2356 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2357
2358 /* mark the mount point hook in the vp but do not drop the ref yet */
2359 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2360 /*
2361 * The covered vnode needs special handling. Trying to get an
2362 * iocount must not block here as this may lead to deadlocks
2363 * if the Filesystem to which the covered vnode belongs is
2364 * undergoing forced unmounts. Since we hold a usecount, the
2365 * vnode cannot be reused (it can, however, still be terminated)
2366 */
2367 vnode_getalways(coveredvp);
2368 vnode_lock_spin(coveredvp);
2369
2370 mp->mnt_crossref++;
2371 coveredvp->v_mountedhere = (struct mount *)0;
2372 CLR(coveredvp->v_flag, VMOUNT);
2373
2374 vnode_unlock(coveredvp);
2375 vnode_put(coveredvp);
2376 }
2377
2378 mount_list_lock();
2379 mp->mnt_vtable->vfc_refcount--;
2380 mount_list_unlock();
2381
2382 cache_purgevfs(mp); /* remove cache entries for this file sys */
2383 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2384 mount_lock(mp);
2385 mp->mnt_lflag |= MNT_LDEAD;
2386
2387 if (mp->mnt_lflag & MNT_LWAIT) {
2388 /*
2389 * Do the wakeup here
2390 * in case we block in mount_refdrain,
2391 * which will drop the mount lock
2392 * and allow anyone blocked in vfs_busy
2393 * to wake up and see the LDEAD state.
2394 */
2395 mp->mnt_lflag &= ~MNT_LWAIT;
2396 wakeup((caddr_t)mp);
2397 }
2398 mount_refdrain(mp);
2399
2400 /* free disk_conditioner_info structure for this mount */
2401 disk_conditioner_unmount(mp);
2402
2403 out:
2404 if (mp->mnt_lflag & MNT_LWAIT) {
2405 mp->mnt_lflag &= ~MNT_LWAIT;
2406 needwakeup = 1;
2407 }
2408
2409 #if CONFIG_TRIGGERS
2410 if (flags & MNT_NOBLOCK && p != kernproc) {
2411 // Restore P_NOREMOTEHANG bit to its previous value
2412 if ((pflags_save & P_NOREMOTEHANG) == 0) {
2413 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2414 }
2415 }
2416
2417 /*
2418 * Callback and context are set together under the mount lock, and
2419 * never cleared, so we're safe to examine them here, drop the lock,
2420 * and call out.
2421 */
2422 if (mp->mnt_triggercallback != NULL) {
2423 mount_unlock(mp);
2424 if (error == 0) {
2425 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2426 } else if (did_vflush) {
2427 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2428 }
2429 } else {
2430 mount_unlock(mp);
2431 }
2432 #else
2433 mount_unlock(mp);
2434 #endif /* CONFIG_TRIGGERS */
2435
2436 lck_rw_done(&mp->mnt_rwlock);
2437
2438 if (needwakeup) {
2439 wakeup((caddr_t)mp);
2440 }
2441
2442 if (!error) {
2443 if ((coveredvp != NULLVP)) {
2444 vnode_t pvp = NULLVP;
2445
2446 /*
2447 * The covered vnode needs special handling. Trying to
2448 * get an iocount must not block here as this may lead
2449 * to deadlocks if the Filesystem to which the covered
2450 * vnode belongs is undergoing forced unmounts. Since we
2451 * hold a usecount, the vnode cannot be reused
2452 * (it can, however, still be terminated).
2453 */
2454 vnode_getalways(coveredvp);
2455
2456 mount_dropcrossref(mp, coveredvp, 0);
2457 /*
2458 * We'll _try_ to detect if this really needs to be
2459 * done. The coveredvp can only be in termination (or
2460 * terminated) if the coveredvp's mount point is in a
2461 * forced unmount (or has been) since we still hold the
2462 * ref.
2463 */
2464 if (!vnode_isrecycled(coveredvp)) {
2465 pvp = vnode_getparent(coveredvp);
2466 #if CONFIG_TRIGGERS
2467 if (coveredvp->v_resolve) {
2468 vnode_trigger_rearm(coveredvp, ctx);
2469 }
2470 #endif
2471 }
2472
2473 vnode_rele(coveredvp);
2474 vnode_put(coveredvp);
2475 coveredvp = NULLVP;
2476
2477 if (pvp) {
2478 lock_vnode_and_post(pvp, NOTE_WRITE);
2479 vnode_put(pvp);
2480 }
2481 } else if (mp->mnt_flag & MNT_ROOTFS) {
2482 mount_lock_destroy(mp);
2483 #if CONFIG_MACF
2484 mac_mount_label_destroy(mp);
2485 #endif
2486 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2487 } else {
2488 panic("dounmount: no coveredvp");
2489 }
2490 }
2491 return error;
2492 }
2493
2494 /*
2495 * Unmount any mounts in this filesystem.
2496 */
2497 void
2498 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2499 {
2500 mount_t smp;
2501 fsid_t *fsids, fsid;
2502 int fsids_sz;
2503 int count = 0, i, m = 0;
2504 vnode_t vp;
2505
2506 mount_list_lock();
2507
2508 // Get an array to hold the submounts' fsids.
2509 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2510 count++;
2511 fsids_sz = count * sizeof(fsid_t);
2512 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2513 if (fsids == NULL) {
2514 mount_list_unlock();
2515 goto out;
2516 }
2517 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2518
2519 /*
2520 * Fill the array with submount fsids.
2521 * Since mounts are always added to the tail of the mount list, the
2522 * list is always in mount order.
2523 * For each mount check if the mounted-on vnode belongs to a
2524 * mount that's already added to our array of mounts to be unmounted.
2525 */
2526 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2527 vp = smp->mnt_vnodecovered;
2528 if (vp == NULL) {
2529 continue;
2530 }
2531 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2532 for (i = 0; i <= m; i++) {
2533 if (fsids[i].val[0] == fsid.val[0] &&
2534 fsids[i].val[1] == fsid.val[1]) {
2535 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2536 break;
2537 }
2538 }
2539 }
2540 mount_list_unlock();
2541
2542 // Unmount the submounts in reverse order. Ignore errors.
2543 for (i = m; i > 0; i--) {
2544 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2545 if (smp) {
2546 mount_ref(smp, 0);
2547 mount_iterdrop(smp);
2548 (void) dounmount(smp, flags, 1, ctx);
2549 }
2550 }
2551 out:
2552 if (fsids) {
2553 FREE(fsids, M_TEMP);
2554 }
2555 }
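/*
 * Worked example (hypothetical mount points): with /A mounted, then /A/B,
 * then /A/B/C, the scan above collects fsids in mount order
 * { /A, /A/B, /A/B/C }, because each covered vnode sits on a mount that is
 * already in the array. The second loop then unmounts /A/B/C and /A/B in
 * reverse order, leaving /A itself for the caller (dounmount).
 */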
2556
2557 void
2558 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2559 {
2560 vnode_lock(dp);
2561 mp->mnt_crossref--;
2562
2563 if (mp->mnt_crossref < 0) {
2564 panic("mount cross refs -ve");
2565 }
2566
2567 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2568 if (need_put) {
2569 vnode_put_locked(dp);
2570 }
2571 vnode_unlock(dp);
2572
2573 mount_lock_destroy(mp);
2574 #if CONFIG_MACF
2575 mac_mount_label_destroy(mp);
2576 #endif
2577 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
2578 return;
2579 }
2580 if (need_put) {
2581 vnode_put_locked(dp);
2582 }
2583 vnode_unlock(dp);
2584 }
2585
2586
2587 /*
2588 * Sync each mounted filesystem.
2589 */
2590 #if DIAGNOSTIC
2591 int syncprt = 0;
2592 #endif
2593
2594 int print_vmpage_stat = 0;
2595
2596 /*
2597 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2598 * mounted read-write with the passed waitfor value.
2599 *
2600 * Parameters: mp mount-point descriptor per mounted file-system instance.
2601 * arg user argument (please see below)
2602 *
2603 * The user argument is a pointer to a 32-bit unsigned integer which specifies
2604 * the waitfor value to pass to VFS_SYNC(). If the user argument is
2605 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2606 * waitfor value.
2607 *
2608 * Returns: VFS_RETURNED
2609 */
2610 static int
2611 sync_callback(mount_t mp, void *arg)
2612 {
2613 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2614 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2615 unsigned waitfor = MNT_NOWAIT;
2616
2617 if (arg) {
2618 waitfor = *(uint32_t*)arg;
2619 }
2620
2621 /* Sanity check for flags - these are the only valid combinations for the flag bits */
2622 if (waitfor != MNT_WAIT &&
2623 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2624 waitfor != MNT_NOWAIT &&
2625 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2626 waitfor != MNT_DWAIT &&
2627 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2628 panic("Passed inappropriate waitfor %u to "
2629 "sync_callback()", waitfor);
2630 }
2631
2632 mp->mnt_flag &= ~MNT_ASYNC;
2633 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2634 if (asyncflag) {
2635 mp->mnt_flag |= MNT_ASYNC;
2636 }
2637 }
2638
2639 return VFS_RETURNED;
2640 }
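/*
 * Caller-side sketch: how the waitfor contract documented above is
 * exercised. Passing NULL (as sync(2) does below) defaults to MNT_NOWAIT;
 * a synchronous sweep would pass one of the combinations accepted by the
 * sanity check.
 *
 *   uint32_t waitfor = MNT_WAIT;
 *
 *   vfs_iterate(LK_NOWAIT, sync_callback, &waitfor);
 */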
2641
2642 /* ARGSUSED */
2643 int
2644 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2645 {
2646 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2647
2648 if (print_vmpage_stat) {
2649 vm_countdirtypages();
2650 }
2651
2652 #if DIAGNOSTIC
2653 if (syncprt) {
2654 vfs_bufstats();
2655 }
2656 #endif /* DIAGNOSTIC */
2657 return 0;
2658 }
2659
2660 typedef enum {
2661 SYNC_ALL = 0,
2662 SYNC_ONLY_RELIABLE_MEDIA = 1,
2663 SYNC_ONLY_UNRELIABLE_MEDIA = 2
2664 } sync_type_t;
2665
2666 static int
2667 sync_internal_callback(mount_t mp, void *arg)
2668 {
2669 if (arg) {
2670 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2671 (mp->mnt_flag & MNT_LOCAL);
2672 sync_type_t sync_type = *((sync_type_t *)arg);
2673
2674 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2675 return VFS_RETURNED;
2676 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2677 return VFS_RETURNED;
2678 }
2679 }
2680
2681 (void)sync_callback(mp, NULL);
2682
2683 return VFS_RETURNED;
2684 }
2685
2686 int sync_thread_state = 0;
2687 int sync_timeout_seconds = 5;
2688
2689 #define SYNC_THREAD_RUN 0x0001
2690 #define SYNC_THREAD_RUNNING 0x0002
2691
2692 static void
2693 sync_thread(__unused void *arg, __unused wait_result_t wr)
2694 {
2695 sync_type_t sync_type;
2696
2697 lck_mtx_lock(sync_mtx_lck);
2698 while (sync_thread_state & SYNC_THREAD_RUN) {
2699 sync_thread_state &= ~SYNC_THREAD_RUN;
2700 lck_mtx_unlock(sync_mtx_lck);
2701
2702 sync_type = SYNC_ONLY_RELIABLE_MEDIA;
2703 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2704 sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
2705 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2706
2707 lck_mtx_lock(sync_mtx_lck);
2708 }
2709 /*
2710 * This wakeup _has_ to be issued before the lock is released otherwise
2711 * we may end up waking up a thread in sync_internal which is
2712 * expecting a wakeup from a thread it just created and not from this
2713 * thread which is about to exit.
2714 */
2715 wakeup(&sync_thread_state);
2716 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2717 lck_mtx_unlock(sync_mtx_lck);
2718
2719 if (print_vmpage_stat) {
2720 vm_countdirtypages();
2721 }
2722
2723 #if DIAGNOSTIC
2724 if (syncprt) {
2725 vfs_bufstats();
2726 }
2727 #endif /* DIAGNOSTIC */
2728 }
2729
2730 struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2731
2732 /*
2733 * An in-kernel sync for power management to call.
2734 * This function always returns within sync_timeout_seconds.
2735 */
2736 __private_extern__ int
2737 sync_internal(void)
2738 {
2739 thread_t thd;
2740 int error;
2741 int thread_created = FALSE;
2742 struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};
2743
2744 lck_mtx_lock(sync_mtx_lck);
2745 sync_thread_state |= SYNC_THREAD_RUN;
2746 if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
2747 int kr;
2748
2749 sync_thread_state |= SYNC_THREAD_RUNNING;
2750 kr = kernel_thread_start(sync_thread, NULL, &thd);
2751 if (kr != KERN_SUCCESS) {
2752 sync_thread_state &= ~SYNC_THREAD_RUNNING;
2753 lck_mtx_unlock(sync_mtx_lck);
2754 printf("sync_thread failed\n");
2755 return 0;
2756 }
2757 thread_created = TRUE;
2758 }
2759
2760 error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
2761 (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2762 if (error) {
2763 struct timeval now;
2764
2765 microtime(&now);
2766 if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
2767 printf("sync timed out: %d sec\n", sync_timeout_seconds);
2768 sync_timeout_last_print.tv_sec = now.tv_sec;
2769 }
2770 }
2771
2772 if (thread_created) {
2773 thread_deallocate(thd);
2774 }
2775
2776 return 0;
2777 } /* end of sync_internal call */
2778
2779 /*
2780 * Change filesystem quotas.
2781 */
2782 #if QUOTA
2783 int
2784 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2785 {
2786 struct mount *mp;
2787 int error, quota_cmd, quota_status = 0;
2788 caddr_t datap;
2789 size_t fnamelen;
2790 struct nameidata nd;
2791 vfs_context_t ctx = vfs_context_current();
2792 struct dqblk my_dqblk = {};
2793
2794 AUDIT_ARG(uid, uap->uid);
2795 AUDIT_ARG(cmd, uap->cmd);
2796 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2797 uap->path, ctx);
2798 error = namei(&nd);
2799 if (error) {
2800 return error;
2801 }
2802 mp = nd.ni_vp->v_mount;
2803 vnode_put(nd.ni_vp);
2804 nameidone(&nd);
2805
2806 /* copyin any data we will need for downstream code */
2807 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2808
2809 switch (quota_cmd) {
2810 case Q_QUOTAON:
2811 /* uap->arg specifies a file from which to take the quotas */
2812 fnamelen = MAXPATHLEN;
2813 datap = kalloc(MAXPATHLEN);
2814 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2815 break;
2816 case Q_GETQUOTA:
2817 /* uap->arg is a pointer to a dqblk structure. */
2818 datap = (caddr_t) &my_dqblk;
2819 break;
2820 case Q_SETQUOTA:
2821 case Q_SETUSE:
2822 /* uap->arg is a pointer to a dqblk structure. */
2823 datap = (caddr_t) &my_dqblk;
2824 if (proc_is64bit(p)) {
2825 struct user_dqblk my_dqblk64;
2826 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
2827 if (error == 0) {
2828 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2829 }
2830 } else {
2831 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
2832 }
2833 break;
2834 case Q_QUOTASTAT:
2835 /* uap->arg is a pointer to an integer */
2836 datap = (caddr_t) &quota_status;
2837 break;
2838 default:
2839 datap = NULL;
2840 break;
2841 } /* switch */
2842
2843 if (error == 0) {
2844 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2845 }
2846
2847 switch (quota_cmd) {
2848 case Q_QUOTAON:
2849 if (datap != NULL) {
2850 kfree(datap, MAXPATHLEN);
2851 }
2852 break;
2853 case Q_GETQUOTA:
2854 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2855 if (error == 0) {
2856 if (proc_is64bit(p)) {
2857 struct user_dqblk my_dqblk64;
2858
2859 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
2860 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2861 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
2862 } else {
2863 error = copyout(datap, uap->arg, sizeof(struct dqblk));
2864 }
2865 }
2866 break;
2867 case Q_QUOTASTAT:
2868 /* uap->arg is a pointer to an integer */
2869 if (error == 0) {
2870 error = copyout(datap, uap->arg, sizeof(quota_status));
2871 }
2872 break;
2873 default:
2874 break;
2875 } /* switch */
2876
2877 return error;
2878 }
2879 #else
2880 int
2881 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2882 {
2883 return EOPNOTSUPP;
2884 }
2885 #endif /* QUOTA */
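/*
 * Userspace usage sketch (hypothetical path and id), mirroring the
 * per-command argument handling above: Q_QUOTAON takes a path to the quota
 * file, while Q_GETQUOTA takes a pointer to a struct dqblk that is copied
 * back out on success.
 *
 *   #include <sys/quota.h>
 *   #include <unistd.h>
 *
 *   struct dqblk dq;
 *
 *   if (quotactl("/Volumes/Data", QCMD(Q_GETQUOTA, USRQUOTA),
 *       getuid(), (caddr_t)&dq) == 0) {
 *       // dq now holds the current usage and limits for that uid
 *   }
 */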
2886
2887 /*
2888 * Get filesystem statistics.
2889 *
2890 * Returns: 0 Success
2891 * namei:???
2892 * vfs_update_vfsstat:???
2893 * munge_statfs:EFAULT
2894 */
2895 /* ARGSUSED */
2896 int
2897 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2898 {
2899 struct mount *mp;
2900 struct vfsstatfs *sp;
2901 int error;
2902 struct nameidata nd;
2903 vfs_context_t ctx = vfs_context_current();
2904 vnode_t vp;
2905
2906 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2907 UIO_USERSPACE, uap->path, ctx);
2908 error = namei(&nd);
2909 if (error != 0) {
2910 return error;
2911 }
2912 vp = nd.ni_vp;
2913 mp = vp->v_mount;
2914 sp = &mp->mnt_vfsstat;
2915 nameidone(&nd);
2916
2917 #if CONFIG_MACF
2918 error = mac_mount_check_stat(ctx, mp);
2919 if (error != 0) {
2920 vnode_put(vp);
2921 return error;
2922 }
2923 #endif
2924
2925 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2926 if (error != 0) {
2927 vnode_put(vp);
2928 return error;
2929 }
2930
2931 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2932 vnode_put(vp);
2933 return error;
2934 }
2935
2936 /*
2937 * Get filesystem statistics.
2938 */
2939 /* ARGSUSED */
2940 int
2941 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2942 {
2943 vnode_t vp;
2944 struct mount *mp;
2945 struct vfsstatfs *sp;
2946 int error;
2947
2948 AUDIT_ARG(fd, uap->fd);
2949
2950 if ((error = file_vnode(uap->fd, &vp))) {
2951 return error;
2952 }
2953
2954 error = vnode_getwithref(vp);
2955 if (error) {
2956 file_drop(uap->fd);
2957 return error;
2958 }
2959
2960 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2961
2962 mp = vp->v_mount;
2963 if (!mp) {
2964 error = EBADF;
2965 goto out;
2966 }
2967
2968 #if CONFIG_MACF
2969 error = mac_mount_check_stat(vfs_context_current(), mp);
2970 if (error != 0) {
2971 goto out;
2972 }
2973 #endif
2974
2975 sp = &mp->mnt_vfsstat;
2976 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2977 goto out;
2978 }
2979
2980 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2981
2982 out:
2983 file_drop(uap->fd);
2984 vnode_put(vp);
2985
2986 return error;
2987 }
2988
2989 void
2990 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
2991 {
2992 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
2993
2994 bzero(sfs, sizeof(*sfs));
2995
2996 sfs->f_bsize = vsfs->f_bsize;
2997 sfs->f_iosize = (int32_t)vsfs->f_iosize;
2998 sfs->f_blocks = vsfs->f_blocks;
2999 sfs->f_bfree = vsfs->f_bfree;
3000 sfs->f_bavail = vsfs->f_bavail;
3001 sfs->f_files = vsfs->f_files;
3002 sfs->f_ffree = vsfs->f_ffree;
3003 sfs->f_fsid = vsfs->f_fsid;
3004 sfs->f_owner = vsfs->f_owner;
3005 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3006 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3007 sfs->f_fssubtype = vsfs->f_fssubtype;
3008 sfs->f_flags_ext = ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS)) ? MNT_EXT_ROOT_DATA_VOL : 0;
3009 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3010 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3011 } else {
3012 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3013 }
3014 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3015 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3016 }
3017
3018 /*
3019 * Get file system statistics in 64-bit mode
3020 */
3021 int
3022 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
3023 {
3024 struct mount *mp;
3025 int error;
3026 struct nameidata nd;
3027 struct statfs64 sfs;
3028 vfs_context_t ctxp = vfs_context_current();
3029 vnode_t vp;
3030
3031 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3032 UIO_USERSPACE, uap->path, ctxp);
3033 error = namei(&nd);
3034 if (error != 0) {
3035 return error;
3036 }
3037 vp = nd.ni_vp;
3038 mp = vp->v_mount;
3039 nameidone(&nd);
3040
3041 #if CONFIG_MACF
3042 error = mac_mount_check_stat(ctxp, mp);
3043 if (error != 0) {
3044 vnode_put(vp);
3045 return error;
3046 }
3047 #endif
3048
3049 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
3050 if (error != 0) {
3051 vnode_put(vp);
3052 return error;
3053 }
3054
3055 vfs_get_statfs64(mp, &sfs);
3056 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3057 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3058 /* This process does not want to see a separate data volume mountpoint */
3059 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3060 }
3061 error = copyout(&sfs, uap->buf, sizeof(sfs));
3062 vnode_put(vp);
3063
3064 return error;
3065 }
3066
3067 /*
3068 * Get file system statistics in 64-bit mode
3069 */
3070 int
3071 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
3072 {
3073 struct vnode *vp;
3074 struct mount *mp;
3075 struct statfs64 sfs;
3076 int error;
3077
3078 AUDIT_ARG(fd, uap->fd);
3079
3080 if ((error = file_vnode(uap->fd, &vp))) {
3081 return error;
3082 }
3083
3084 error = vnode_getwithref(vp);
3085 if (error) {
3086 file_drop(uap->fd);
3087 return error;
3088 }
3089
3090 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
3091
3092 mp = vp->v_mount;
3093 if (!mp) {
3094 error = EBADF;
3095 goto out;
3096 }
3097
3098 #if CONFIG_MACF
3099 error = mac_mount_check_stat(vfs_context_current(), mp);
3100 if (error != 0) {
3101 goto out;
3102 }
3103 #endif
3104
3105 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
3106 goto out;
3107 }
3108
3109 vfs_get_statfs64(mp, &sfs);
3110 if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
3111 (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
3112 /* This process does not want to see a separate data volume mountpoint */
3113 strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
3114 }
3115 error = copyout(&sfs, uap->buf, sizeof(sfs));
3116
3117 out:
3118 file_drop(uap->fd);
3119 vnode_put(vp);
3120
3121 return error;
3122 }
3123
3124 struct getfsstat_struct {
3125 user_addr_t sfsp;
3126 user_addr_t *mp;
3127 int count;
3128 int maxcount;
3129 int flags;
3130 int error;
3131 };
3132
3133
3134 static int
3135 getfsstat_callback(mount_t mp, void * arg)
3136 {
3137 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3138 struct vfsstatfs *sp;
3139 int error, my_size;
3140 vfs_context_t ctx = vfs_context_current();
3141
3142 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3143 #if CONFIG_MACF
3144 error = mac_mount_check_stat(ctx, mp);
3145 if (error != 0) {
3146 fstp->error = error;
3147 return VFS_RETURNED_DONE;
3148 }
3149 #endif
3150 sp = &mp->mnt_vfsstat;
3151 /*
3152 * If MNT_NOWAIT is specified, do not refresh the
3153 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
3154 */
3155 if ((mp->mnt_lflag & MNT_LDEAD) ||
3156 (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3157 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3158 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
3159 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3160 return VFS_RETURNED;
3161 }
3162
3163 /*
3164 * Need to handle LP64 version of struct statfs
3165 */
3166 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
3167 if (error) {
3168 fstp->error = error;
3169 return VFS_RETURNED_DONE;
3170 }
3171 fstp->sfsp += my_size;
3172
3173 if (fstp->mp) {
3174 #if CONFIG_MACF
3175 error = mac_mount_label_get(mp, *fstp->mp);
3176 if (error) {
3177 fstp->error = error;
3178 return VFS_RETURNED_DONE;
3179 }
3180 #endif
3181 fstp->mp++;
3182 }
3183 }
3184 fstp->count++;
3185 return VFS_RETURNED;
3186 }
3187
3188 /*
3189 * Get statistics on all filesystems.
3190 */
3191 int
3192 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3193 {
3194 struct __mac_getfsstat_args muap;
3195
3196 muap.buf = uap->buf;
3197 muap.bufsize = uap->bufsize;
3198 muap.mac = USER_ADDR_NULL;
3199 muap.macsize = 0;
3200 muap.flags = uap->flags;
3201
3202 return __mac_getfsstat(p, &muap, retval);
3203 }
3204
3205 /*
3206 * __mac_getfsstat: Get MAC-related file system statistics
3207 *
3208 * Parameters: p (ignored)
3209 * uap User argument descriptor (see below)
3210 * retval Count of file system statistics (N stats)
3211 *
3212 * Indirect: uap->bufsize Buffer size
3213 * uap->macsize MAC info size
3214 * uap->buf Buffer where information will be returned
3215 * uap->mac MAC info
3216 * uap->flags File system flags
3217 *
3218 *
3219 * Returns: 0 Success
3220 * !0 Not success
3221 *
3222 */
3223 int
3224 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
3225 {
3226 user_addr_t sfsp;
3227 user_addr_t *mp;
3228 size_t count, maxcount, bufsize, macsize;
3229 struct getfsstat_struct fst;
3230
3231 if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
3232 return EINVAL;
3233 }
3234
3235 bufsize = (size_t) uap->bufsize;
3236 macsize = (size_t) uap->macsize;
3237
3238 if (IS_64BIT_PROCESS(p)) {
3239 maxcount = bufsize / sizeof(struct user64_statfs);
3240 } else {
3241 maxcount = bufsize / sizeof(struct user32_statfs);
3242 }
3243 sfsp = uap->buf;
3244 count = 0;
3245
3246 mp = NULL;
3247
3248 #if CONFIG_MACF
3249 if (uap->mac != USER_ADDR_NULL) {
3250 u_int32_t *mp0;
3251 int error;
3252 unsigned int i;
3253
3254 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
3255 if (count != maxcount) {
3256 return EINVAL;
3257 }
3258
3259 /* Copy in the array */
3260 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
3261 if (mp0 == NULL) {
3262 return ENOMEM;
3263 }
3264
3265 error = copyin(uap->mac, mp0, macsize);
3266 if (error) {
3267 FREE(mp0, M_MACTEMP);
3268 return error;
3269 }
3270
3271 /* Normalize to an array of user_addr_t */
3272 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
3273 if (mp == NULL) {
3274 FREE(mp0, M_MACTEMP);
3275 return ENOMEM;
3276 }
3277
3278 for (i = 0; i < count; i++) {
3279 if (IS_64BIT_PROCESS(p)) {
3280 mp[i] = ((user_addr_t *)mp0)[i];
3281 } else {
3282 mp[i] = (user_addr_t)mp0[i];
3283 }
3284 }
3285 FREE(mp0, M_MACTEMP);
3286 }
3287 #endif
3288
3289
3290 fst.sfsp = sfsp;
3291 fst.mp = mp;
3292 fst.flags = uap->flags;
3293 fst.count = 0;
3294 fst.error = 0;
3295 fst.maxcount = maxcount;
3296
3297
3298 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);
3299
3300 if (mp) {
3301 FREE(mp, M_MACTEMP);
3302 }
3303
3304 if (fst.error) {
3305 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3306 return fst.error;
3307 }
3308
3309 if (fst.sfsp && fst.count > fst.maxcount) {
3310 *retval = fst.maxcount;
3311 } else {
3312 *retval = fst.count;
3313 }
3314 return 0;
3315 }
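/*
 * Userspace usage sketch: because the callback above only copies data out
 * when a buffer is supplied but always bumps the count, getfsstat(2) with a
 * NULL buffer returns the number of mounted filesystems, which callers
 * commonly use to size the real buffer.
 *
 *   #include <sys/mount.h>
 *   #include <stdlib.h>
 *
 *   int n = getfsstat(NULL, 0, MNT_NOWAIT);
 *   struct statfs *buf = (n > 0) ? calloc(n, sizeof(*buf)) : NULL;
 *
 *   if (buf != NULL) {
 *       n = getfsstat(buf, n * (int)sizeof(*buf), MNT_NOWAIT);
 *   }
 */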
3316
3317 static int
3318 getfsstat64_callback(mount_t mp, void * arg)
3319 {
3320 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3321 struct vfsstatfs *sp;
3322 struct statfs64 sfs;
3323 int error;
3324
3325 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3326 #if CONFIG_MACF
3327 error = mac_mount_check_stat(vfs_context_current(), mp);
3328 if (error != 0) {
3329 fstp->error = error;
3330 return VFS_RETURNED_DONE;
3331 }
3332 #endif
3333 sp = &mp->mnt_vfsstat;
3334 /*
3335 * If MNT_NOWAIT is specified, do not refresh the fsstat
3336 * cache. MNT_WAIT overrides MNT_NOWAIT.
3337 *
3338 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3339 * getfsstat, since the constants are out of the same
3340 * namespace.
3341 */
3342 if ((mp->mnt_lflag & MNT_LDEAD) ||
3343 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3344 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3345 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3346 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3347 return VFS_RETURNED;
3348 }
3349
3350 vfs_get_statfs64(mp, &sfs);
3351 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3352 if (error) {
3353 fstp->error = error;
3354 return VFS_RETURNED_DONE;
3355 }
3356 fstp->sfsp += sizeof(sfs);
3357 }
3358 fstp->count++;
3359 return VFS_RETURNED;
3360 }
3361
3362 /*
3363 * Get statistics on all file systems in 64 bit mode.
3364 */
3365 int
3366 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3367 {
3368 user_addr_t sfsp;
3369 int count, maxcount;
3370 struct getfsstat_struct fst;
3371
3372 maxcount = uap->bufsize / sizeof(struct statfs64);
3373
3374 sfsp = uap->buf;
3375 count = 0;
3376
3377 fst.sfsp = sfsp;
3378 fst.flags = uap->flags;
3379 fst.count = 0;
3380 fst.error = 0;
3381 fst.maxcount = maxcount;
3382
3383 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3384
3385 if (fst.error) {
3386 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3387 return fst.error;
3388 }
3389
3390 if (fst.sfsp && fst.count > fst.maxcount) {
3391 *retval = fst.maxcount;
3392 } else {
3393 *retval = fst.count;
3394 }
3395
3396 return 0;
3397 }
3398
3399 /*
3400 * Gets the vnode associated with the file descriptor passed
3401 * as input.
3402 *
3403 * INPUT
3404 * ctx - vfs context of caller
3405 * fd - file descriptor for which vnode is required.
3406 * vpp - Pointer to pointer to vnode to be returned.
3407 *
3408 * The vnode is returned with an iocount so any vnode obtained
3409 * by this call needs a vnode_put().
3410 *
3411 */
3412 int
3413 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3414 {
3415 int error;
3416 vnode_t vp;
3417 struct fileproc *fp;
3418 proc_t p = vfs_context_proc(ctx);
3419
3420 *vpp = NULLVP;
3421
3422 error = fp_getfvp(p, fd, &fp, &vp);
3423 if (error) {
3424 return error;
3425 }
3426
3427 error = vnode_getwithref(vp);
3428 if (error) {
3429 (void)fp_drop(p, fd, fp, 0);
3430 return error;
3431 }
3432
3433 (void)fp_drop(p, fd, fp, 0);
3434 *vpp = vp;
3435 return error;
3436 }
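/*
 * In-kernel caller sketch: the iocount taken here must always be paired
 * with a vnode_put(), per the comment above.
 *
 *   vnode_t dvp;
 *   int err = vnode_getfromfd(ctx, fd, &dvp);
 *
 *   if (err == 0) {
 *       // ... use dvp while the iocount is held ...
 *       vnode_put(dvp);
 *   }
 */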
3437
3438 /*
3439 * Wrapper function around namei to start lookup from a directory
3440 * specified by the file descriptor dirfd.
3441 *
3442 * In addition to all the errors returned by namei, this call can
3443 * return ENOTDIR if the file descriptor does not refer to a directory,
3444 * and EBADF if the file descriptor is not valid.
3445 */
3446 int
3447 nameiat(struct nameidata *ndp, int dirfd)
3448 {
3449 if ((dirfd != AT_FDCWD) &&
3450 !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
3451 !(ndp->ni_cnd.cn_flags & USEDVP)) {
3452 int error = 0;
3453 char c;
3454
3455 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3456 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3457 if (error) {
3458 return error;
3459 }
3460 } else {
3461 c = *((char *)(ndp->ni_dirp));
3462 }
3463
3464 if (c != '/') {
3465 vnode_t dvp_at;
3466
3467 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3468 &dvp_at);
3469 if (error) {
3470 return error;
3471 }
3472
3473 if (vnode_vtype(dvp_at) != VDIR) {
3474 vnode_put(dvp_at);
3475 return ENOTDIR;
3476 }
3477
3478 ndp->ni_dvp = dvp_at;
3479 ndp->ni_cnd.cn_flags |= USEDVP;
3480 error = namei(ndp);
3481 ndp->ni_cnd.cn_flags &= ~USEDVP;
3482 vnode_put(dvp_at);
3483 return error;
3484 }
3485 }
3486
3487 return namei(ndp);
3488 }
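/*
 * Caller sketch for an *at handler (uap field names hypothetical): relative
 * paths are resolved against dirfd, absolute paths fall through to plain
 * namei().
 *
 *   struct nameidata nd;
 *   vfs_context_t ctx = vfs_context_current();
 *
 *   NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1,
 *       UIO_USERSPACE, uap->path, ctx);
 *   error = nameiat(&nd, uap->fd);
 *   if (error == 0) {
 *       // ... use nd.ni_vp ...
 *       vnode_put(nd.ni_vp);
 *       nameidone(&nd);
 *   }
 */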
3489
3490 /*
3491 * Change current working directory to a given file descriptor.
3492 */
3493 /* ARGSUSED */
3494 static int
3495 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3496 {
3497 struct filedesc *fdp = p->p_fd;
3498 vnode_t vp;
3499 vnode_t tdp;
3500 vnode_t tvp;
3501 struct mount *mp;
3502 int error;
3503 vfs_context_t ctx = vfs_context_current();
3504
3505 AUDIT_ARG(fd, uap->fd);
3506 if (per_thread && uap->fd == -1) {
3507 /*
3508 * Switching back from per-thread to per-process CWD; verify that we
3509 * in fact have one before proceeding. The only success case
3510 * for this code path is to return 0 preemptively after zapping
3511 * the thread structure contents.
3512 */
3513 thread_t th = vfs_context_thread(ctx);
3514 if (th) {
3515 uthread_t uth = get_bsdthread_info(th);
3516 tvp = uth->uu_cdir;
3517 uth->uu_cdir = NULLVP;
3518 if (tvp != NULLVP) {
3519 vnode_rele(tvp);
3520 return 0;
3521 }
3522 }
3523 return EBADF;
3524 }
3525
3526 if ((error = file_vnode(uap->fd, &vp))) {
3527 return error;
3528 }
3529 if ((error = vnode_getwithref(vp))) {
3530 file_drop(uap->fd);
3531 return error;
3532 }
3533
3534 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3535
3536 if (vp->v_type != VDIR) {
3537 error = ENOTDIR;
3538 goto out;
3539 }
3540
3541 #if CONFIG_MACF
3542 error = mac_vnode_check_chdir(ctx, vp);
3543 if (error) {
3544 goto out;
3545 }
3546 #endif
3547 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3548 if (error) {
3549 goto out;
3550 }
3551
3552 while (!error && (mp = vp->v_mountedhere) != NULL) {
3553 if (vfs_busy(mp, LK_NOWAIT)) {
3554 error = EACCES;
3555 goto out;
3556 }
3557 error = VFS_ROOT(mp, &tdp, ctx);
3558 vfs_unbusy(mp);
3559 if (error) {
3560 break;
3561 }
3562 vnode_put(vp);
3563 vp = tdp;
3564 }
3565 if (error) {
3566 goto out;
3567 }
3568 if ((error = vnode_ref(vp))) {
3569 goto out;
3570 }
3571 vnode_put(vp);
3572
3573 if (per_thread) {
3574 thread_t th = vfs_context_thread(ctx);
3575 if (th) {
3576 uthread_t uth = get_bsdthread_info(th);
3577 tvp = uth->uu_cdir;
3578 uth->uu_cdir = vp;
3579 OSBitOrAtomic(P_THCWD, &p->p_flag);
3580 } else {
3581 vnode_rele(vp);
3582 return ENOENT;
3583 }
3584 } else {
3585 proc_fdlock(p);
3586 tvp = fdp->fd_cdir;
3587 fdp->fd_cdir = vp;
3588 proc_fdunlock(p);
3589 }
3590
3591 if (tvp) {
3592 vnode_rele(tvp);
3593 }
3594 file_drop(uap->fd);
3595
3596 return 0;
3597 out:
3598 vnode_put(vp);
3599 file_drop(uap->fd);
3600
3601 return error;
3602 }
3603
3604 int
3605 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3606 {
3607 return common_fchdir(p, uap, 0);
3608 }
3609
3610 int
3611 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3612 {
3613 return common_fchdir(p, (void *)uap, 1);
3614 }
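/*
 * Usage sketch (private SPI; behavior per the fd == -1 special case in
 * common_fchdir above): a per-thread working directory is installed with a
 * real directory fd and dropped again by passing -1, which reverts the
 * thread to the per-process CWD.
 *
 *   __pthread_fchdir(dirfd);   // this thread now resolves relative paths
 *                              // against dirfd
 *   __pthread_fchdir(-1);      // back to the per-process CWD
 */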
3615
3616
3617 /*
3618 * Change current working directory (".").
3619 *
3620 * Returns: 0 Success
3621 * change_dir:ENOTDIR
3622 * change_dir:???
3623 * vnode_ref:ENOENT No such file or directory
3624 */
3625 /* ARGSUSED */
3626 int
3627 chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
3628 {
3629 struct filedesc *fdp = p->p_fd;
3630 int error;
3631 vnode_t tvp;
3632
3633 error = change_dir(ndp, ctx);
3634 if (error) {
3635 return error;
3636 }
3637 if ((error = vnode_ref(ndp->ni_vp))) {
3638 vnode_put(ndp->ni_vp);
3639 return error;
3640 }
3641 /*
3642 * drop the iocount we picked up in change_dir
3643 */
3644 vnode_put(ndp->ni_vp);
3645
3646 if (per_thread) {
3647 thread_t th = vfs_context_thread(ctx);
3648 if (th) {
3649 uthread_t uth = get_bsdthread_info(th);
3650 tvp = uth->uu_cdir;
3651 uth->uu_cdir = ndp->ni_vp;
3652 OSBitOrAtomic(P_THCWD, &p->p_flag);
3653 } else {
3654 vnode_rele(ndp->ni_vp);
3655 return ENOENT;
3656 }
3657 } else {
3658 proc_fdlock(p);
3659 tvp = fdp->fd_cdir;
3660 fdp->fd_cdir = ndp->ni_vp;
3661 proc_fdunlock(p);
3662 }
3663
3664 if (tvp) {
3665 vnode_rele(tvp);
3666 }
3667
3668 return 0;
3669 }
3670
3671
3672 /*
3673 * Change current working directory (".").
3674 *
3675 * Returns: 0 Success
3676 * chdir_internal:ENOTDIR
3677 * chdir_internal:ENOENT No such file or directory
3678 * chdir_internal:???
3679 */
3680 /* ARGSUSED */
3681 static int
3682 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3683 {
3684 struct nameidata nd;
3685 vfs_context_t ctx = vfs_context_current();
3686
3687 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3688 UIO_USERSPACE, uap->path, ctx);
3689
3690 return chdir_internal(p, ctx, &nd, per_thread);
3691 }
3692
3693
3694 /*
3695 * chdir
3696 *
3697 * Change current working directory (".") for the entire process
3698 *
3699 * Parameters: p Process requesting the call
3700 * uap User argument descriptor (see below)
3701 * retval (ignored)
3702 *
3703 * Indirect parameters: uap->path Directory path
3704 *
3705 * Returns: 0 Success
3706 * common_chdir: ENOTDIR
3707 * common_chdir: ENOENT No such file or directory
3708 * common_chdir: ???
3709 *
3710 */
3711 int
3712 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3713 {
3714 return common_chdir(p, (void *)uap, 0);
3715 }
3716
3717 /*
3718 * __pthread_chdir
3719 *
3720 * Change current working directory (".") for a single thread
3721 *
3722 * Parameters: p Process requesting the call
3723 * uap User argument descriptor (see below)
3724 * retval (ignored)
3725 *
3726 * Indirect parameters: uap->path Directory path
3727 *
3728 * Returns: 0 Success
3729 * common_chdir: ENOTDIR
3730 * common_chdir: ENOENT No such file or directory
3731 * common_chdir: ???
3732 *
3733 */
3734 int
3735 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3736 {
3737 return common_chdir(p, (void *)uap, 1);
3738 }
3739
3740
3741 /*
3742 * Change notion of root (``/'') directory.
3743 */
3744 /* ARGSUSED */
3745 int
3746 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3747 {
3748 struct filedesc *fdp = p->p_fd;
3749 int error;
3750 struct nameidata nd;
3751 vnode_t tvp;
3752 vfs_context_t ctx = vfs_context_current();
3753
3754 if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
3755 return error;
3756 }
3757
3758 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3759 UIO_USERSPACE, uap->path, ctx);
3760 error = change_dir(&nd, ctx);
3761 if (error) {
3762 return error;
3763 }
3764
3765 #if CONFIG_MACF
3766 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3767 &nd.ni_cnd);
3768 if (error) {
3769 vnode_put(nd.ni_vp);
3770 return error;
3771 }
3772 #endif
3773
3774 if ((error = vnode_ref(nd.ni_vp))) {
3775 vnode_put(nd.ni_vp);
3776 return error;
3777 }
3778 vnode_put(nd.ni_vp);
3779
3780 proc_fdlock(p);
3781 tvp = fdp->fd_rdir;
3782 fdp->fd_rdir = nd.ni_vp;
3783 fdp->fd_flags |= FD_CHROOT;
3784 proc_fdunlock(p);
3785
3786 if (tvp != NULL) {
3787 vnode_rele(tvp);
3788 }
3789
3790 return 0;
3791 }
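/*
 * Userspace usage sketch (hypothetical jail path): only root passes the
 * suser() check above, and the conventional pattern is to chdir() into the
 * new root so that "." cannot refer to a directory outside it.
 *
 *   #include <unistd.h>
 *
 *   if (chroot("/private/var/jail") == 0) {
 *       (void) chdir("/");
 *   }
 */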
3792
3793 /*
3794 * Common routine for chroot and chdir.
3795 *
3796 * Returns: 0 Success
3797 * ENOTDIR Not a directory
3798 * namei:??? [anything namei can return]
3799 * vnode_authorize:??? [anything vnode_authorize can return]
3800 */
3801 static int
3802 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3803 {
3804 vnode_t vp;
3805 int error;
3806
3807 if ((error = namei(ndp))) {
3808 return error;
3809 }
3810 nameidone(ndp);
3811 vp = ndp->ni_vp;
3812
3813 if (vp->v_type != VDIR) {
3814 vnode_put(vp);
3815 return ENOTDIR;
3816 }
3817
3818 #if CONFIG_MACF
3819 error = mac_vnode_check_chdir(ctx, vp);
3820 if (error) {
3821 vnode_put(vp);
3822 return error;
3823 }
3824 #endif
3825
3826 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3827 if (error) {
3828 vnode_put(vp);
3829 return error;
3830 }
3831
3832 return error;
3833 }
3834
3835 /*
3836 * Allocate the per-fd vnode data (for directories) associated with the file glob.
3837 */
3838 struct fd_vn_data *
3839 fg_vn_data_alloc(void)
3840 {
3841 struct fd_vn_data *fvdata;
3842
3843 /* Allocate per fd vnode data */
3844 MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3845 M_FD_VN_DATA, M_WAITOK | M_ZERO);
3846 lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3847 return fvdata;
3848 }
3849
3850 /*
3851 * Free the vnode data (for directories) associated with the file glob.
3852 */
3853 void
3854 fg_vn_data_free(void *fgvndata)
3855 {
3856 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3857
3858 if (fvdata->fv_buf) {
3859 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3860 }
3861 lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3862 FREE(fvdata, M_FD_VN_DATA);
3863 }
3864
3865 /*
3866 * Check permissions, allocate an open file structure,
3867 * and call the device open routine if any.
3868 *
3869 * Returns: 0 Success
3870 * EINVAL
3871 * EINTR
3872 * falloc:ENFILE
3873 * falloc:EMFILE
3874 * falloc:ENOMEM
3875 * vn_open_auth:???
3876 * dupfdopen:???
3877 * VNOP_ADVLOCK:???
3878 * vnode_setsize:???
3879 *
3880 * XXX Need to implement uid, gid
3881 */
3882 int
3883 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3884 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3885 int32_t *retval)
3886 {
3887 proc_t p = vfs_context_proc(ctx);
3888 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3889 struct fileproc *fp;
3890 vnode_t vp;
3891 int flags, oflags;
3892 int type, indx, error;
3893 struct flock lf;
3894 struct vfs_context context;
3895
3896 oflags = uflags;
3897
3898 if ((oflags & O_ACCMODE) == O_ACCMODE) {
3899 return EINVAL;
3900 }
3901
3902 flags = FFLAGS(uflags);
3903 CLR(flags, FENCRYPTED);
3904 CLR(flags, FUNENCRYPTED);
3905
3906 AUDIT_ARG(fflags, oflags);
3907 AUDIT_ARG(mode, vap->va_mode);
3908
3909 if ((error = falloc_withalloc(p,
3910 &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3911 return error;
3912 }
3913 uu->uu_dupfd = -indx - 1;
3914
3915 if ((error = vn_open_auth(ndp, &flags, vap))) {
3916 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) { /* XXX from fdopen */
3917 if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3918 fp_drop(p, indx, NULL, 0);
3919 *retval = indx;
3920 return 0;
3921 }
3922 }
3923 if (error == ERESTART) {
3924 error = EINTR;
3925 }
3926 fp_free(p, indx, fp);
3927 return error;
3928 }
3929 uu->uu_dupfd = 0;
3930 vp = ndp->ni_vp;
3931
3932 fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3933 fp->f_fglob->fg_ops = &vnops;
3934 fp->f_fglob->fg_data = (caddr_t)vp;
3935
3936 if (flags & (O_EXLOCK | O_SHLOCK)) {
3937 lf.l_whence = SEEK_SET;
3938 lf.l_start = 0;
3939 lf.l_len = 0;
3940 if (flags & O_EXLOCK) {
3941 lf.l_type = F_WRLCK;
3942 } else {
3943 lf.l_type = F_RDLCK;
3944 }
3945 type = F_FLOCK;
3946 if ((flags & FNONBLOCK) == 0) {
3947 type |= F_WAIT;
3948 }
3949 #if CONFIG_MACF
3950 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3951 F_SETLK, &lf);
3952 if (error) {
3953 goto bad;
3954 }
3955 #endif
3956 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
3957 goto bad;
3958 }
3959 fp->f_fglob->fg_flag |= FHASLOCK;
3960 }
3961
3962 /* try to truncate by setting the size attribute */
3963 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
3964 goto bad;
3965 }
3966
3967 /*
3968 * For directories we hold some additional information in the fd.
3969 */
3970 if (vnode_vtype(vp) == VDIR) {
3971 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3972 } else {
3973 fp->f_fglob->fg_vn_data = NULL;
3974 }
3975
3976 vnode_put(vp);
3977
3978 /*
3979 * The first terminal open (without O_NOCTTY) by a session leader
3980 * results in it being set as the controlling terminal.
3981 */
3982 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3983 !(flags & O_NOCTTY)) {
3984 int tmp = 0;
3985
3986 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3987 (caddr_t)&tmp, ctx);
3988 }
3989
3990 proc_fdlock(p);
3991 if (flags & O_CLOEXEC) {
3992 *fdflags(p, indx) |= UF_EXCLOSE;
3993 }
3994 if (flags & O_CLOFORK) {
3995 *fdflags(p, indx) |= UF_FORKCLOSE;
3996 }
3997 procfdtbl_releasefd(p, indx, NULL);
3998
3999 #if CONFIG_SECLUDED_MEMORY
4000 if (secluded_for_filecache &&
4001 FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
4002 vnode_vtype(vp) == VREG) {
4003 memory_object_control_t moc;
4004
4005 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
4006
4007 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
4008 /* nothing to do... */
4009 } else if (fp->f_fglob->fg_flag & FWRITE) {
4010 /* writable -> no longer eligible for secluded pages */
4011 memory_object_mark_eligible_for_secluded(moc,
4012 FALSE);
4013 } else if (secluded_for_filecache == 1) {
4014 char pathname[32] = { 0, };
4015 size_t copied;
4016 /* XXX FBDP: better way to detect /Applications/ ? */
4017 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4018 (void)copyinstr(ndp->ni_dirp,
4019 pathname,
4020 sizeof(pathname),
4021 &copied);
4022 } else {
4023 copystr(CAST_DOWN(void *, ndp->ni_dirp),
4024 pathname,
4025 sizeof(pathname),
4026 &copied);
4027 }
4028 pathname[sizeof(pathname) - 1] = '\0';
4029 if (strncmp(pathname,
4030 "/Applications/",
4031 strlen("/Applications/")) == 0 &&
4032 strncmp(pathname,
4033 "/Applications/Camera.app/",
4034 strlen("/Applications/Camera.app/")) != 0) {
4035 /*
4036 * not writable
4037 * AND from "/Applications/"
4038 * AND not from "/Applications/Camera.app/"
4039 * ==> eligible for secluded
4040 */
4041 memory_object_mark_eligible_for_secluded(moc,
4042 TRUE);
4043 }
4044 } else if (secluded_for_filecache == 2) {
4045 #if __arm64__
4046 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
4047 #elif __arm__
4048 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
4049 #else
4050 /* not implemented... */
4051 #endif
4052 size_t len = strlen(vp->v_name);
4053 if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
4054 !strncmp(vp->v_name, "dyld", len) ||
4055 !strncmp(vp->v_name, "launchd", len) ||
4056 !strncmp(vp->v_name, "Camera", len) ||
4057 !strncmp(vp->v_name, "mediaserverd", len) ||
4058 !strncmp(vp->v_name, "SpringBoard", len) ||
4059 !strncmp(vp->v_name, "backboardd", len)) {
4060 /*
4061 * This file matters when launching Camera:
4062 * do not store its contents in the secluded
4063 * pool that will be drained on Camera launch.
4064 */
4065 memory_object_mark_eligible_for_secluded(moc,
4066 FALSE);
4067 }
4068 }
4069 }
4070 #endif /* CONFIG_SECLUDED_MEMORY */
4071
4072 fp_drop(p, indx, fp, 1);
4073 proc_fdunlock(p);
4074
4075 *retval = indx;
4076
4077 return 0;
4078 bad:
4079 context = *vfs_context_current();
4080 context.vc_ucred = fp->f_fglob->fg_cred;
4081
4082 if ((fp->f_fglob->fg_flag & FHASLOCK) &&
4083 (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
4084 lf.l_whence = SEEK_SET;
4085 lf.l_start = 0;
4086 lf.l_len = 0;
4087 lf.l_type = F_UNLCK;
4088
4089 (void)VNOP_ADVLOCK(
4090 vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
4091 }
4092
4093 vn_close(vp, fp->f_fglob->fg_flag, &context);
4094 vnode_put(vp);
4095 fp_free(p, indx, fp);
4096
4097 return error;
4098 }
4099
4100 /*
4101 * While most of the *at syscall handlers can call nameiat(), which
4102 * is a wrapper around namei, here the use of namei and the initialization
4103 * of nameidata are far removed and in different functions - namei
4104 * gets called in vn_open_auth for open1. So we just do here what
4105 * nameiat() does.
4106 */
4107 static int
4108 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
4109 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
4110 int dirfd)
4111 {
4112 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
4113 int error;
4114 char c;
4115
4116 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
4117 error = copyin(ndp->ni_dirp, &c, sizeof(char));
4118 if (error) {
4119 return error;
4120 }
4121 } else {
4122 c = *((char *)(ndp->ni_dirp));
4123 }
4124
4125 if (c != '/') {
4126 vnode_t dvp_at;
4127
4128 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
4129 &dvp_at);
4130 if (error) {
4131 return error;
4132 }
4133
4134 if (vnode_vtype(dvp_at) != VDIR) {
4135 vnode_put(dvp_at);
4136 return ENOTDIR;
4137 }
4138
4139 ndp->ni_dvp = dvp_at;
4140 ndp->ni_cnd.cn_flags |= USEDVP;
4141 error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
4142 retval);
4143 vnode_put(dvp_at);
4144 return error;
4145 }
4146 }
4147
4148 return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
4149 }
4150
4151 /*
4152 * open_extended: open a file given a path name, with an extended argument list (including extended security (ACL)).
4153 *
4154 * Parameters: p Process requesting the open
4155 * uap User argument descriptor (see below)
4156 * retval Pointer to an area to receive the
4157 * return value from the system call
4158 *
4159 * Indirect: uap->path Path to open (same as 'open')
4160 * uap->flags Flags to open (same as 'open')
4161 * uap->uid UID to set, if creating
4162 * uap->gid GID to set, if creating
4163 * uap->mode File mode, if creating (same as 'open')
4164 * uap->xsecurity ACL to set, if creating
4165 *
4166 * Returns: 0 Success
4167 * !0 errno value
4168 *
4169 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4170 *
4171 * XXX: We should enumerate the possible errno values here, and where
4172 * in the code they originated.
4173 */
4174 int
4175 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
4176 {
4177 struct filedesc *fdp = p->p_fd;
4178 int ciferror;
4179 kauth_filesec_t xsecdst;
4180 struct vnode_attr va;
4181 struct nameidata nd;
4182 int cmode;
4183
4184 AUDIT_ARG(owner, uap->uid, uap->gid);
4185
4186 xsecdst = NULL;
4187 if ((uap->xsecurity != USER_ADDR_NULL) &&
4188 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
4189 return ciferror;
4190 }
4191
4192 VATTR_INIT(&va);
4193 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4194 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4195 if (uap->uid != KAUTH_UID_NONE) {
4196 VATTR_SET(&va, va_uid, uap->uid);
4197 }
4198 if (uap->gid != KAUTH_GID_NONE) {
4199 VATTR_SET(&va, va_gid, uap->gid);
4200 }
4201 if (xsecdst != NULL) {
4202 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4203 }
4204
4205 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4206 uap->path, vfs_context_current());
4207
4208 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
4209 fileproc_alloc_init, NULL, retval);
4210 if (xsecdst != NULL) {
4211 kauth_filesec_free(xsecdst);
4212 }
4213
4214 return ciferror;
4215 }
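
/*
 * Worked example of the create-mode computation above (the same pattern
 * appears in open_dprotected_np() and openat_internal() below), a minimal
 * sketch assuming a process umask (fd_cmask) of 022 and a requested mode
 * of 0666:
 *
 *      cmode = ((0666 & ~022) & ALLPERMS) & ~S_ISTXT
 *            = (0644 & 07777) & ~01000
 *            = 0644
 *
 * The umask is applied, only permission/setid bits survive, and any
 * requested sticky bit is stripped; VATTR_SET(&va, va_mode,
 * cmode & ACCESSPERMS) then drops the setid bits as well.
 */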
4216
4217 /*
4218 * Data-protected open: atomically apply a protection class as part of open(2).
4219 *
4220 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4221 */
4222 int
4223 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4224 {
4225 int flags = uap->flags;
4226 int class = uap->class;
4227 int dpflags = uap->dpflags;
4228
4229 /*
4230 * Follow the same path as normal open(2)
4231 * Look up the item if it exists, and acquire the vnode.
4232 */
4233 struct filedesc *fdp = p->p_fd;
4234 struct vnode_attr va;
4235 struct nameidata nd;
4236 int cmode;
4237 int error;
4238
4239 VATTR_INIT(&va);
4240 /* Mask off all but regular access permissions */
4241 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4242 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4243
4244 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4245 uap->path, vfs_context_current());
4246
4247 /*
4248 * Initialize the extra fields in vnode_attr to pass down our
4249 * extra fields.
4250 * 1. target cprotect class.
4251 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4252 */
4253 if (flags & O_CREAT) {
4254 /* lower level kernel code validates that the class is valid before applying it. */
4255 if (class != PROTECTION_CLASS_DEFAULT) {
4256 /*
4257 * PROTECTION_CLASS_DEFAULT means the file keeps the class a plain
4258 * open(2) would give it, so only set the attribute when it differs.
4259 */
4260 VATTR_SET(&va, va_dataprotect_class, class);
4261 }
4262 }
4263
4264 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4265 if (flags & (O_RDWR | O_WRONLY)) {
4266 /* Not allowed to write raw encrypted bytes */
4267 return EINVAL;
4268 }
4269 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4270 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4271 }
4272 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4273 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4274 }
4275 }
4276
4277 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4278 fileproc_alloc_init, NULL, retval);
4279
4280 return error;
4281 }
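
/*
 * Minimal userspace sketch of the call this handler services; the
 * open_dprotected_np() prototype is SPI and is assumed here from Apple's
 * <sys/fcntl.h>, and the path, class, and mode values are hypothetical.
 *
 *      #include <sys/fcntl.h>
 *
 *      int fd = open_dprotected_np("/private/var/example.dat",
 *          O_CREAT | O_RDWR, 3, 0, 0600);      // 3: hypothetical class
 *
 * As checked above, O_DP_GETRAWENCRYPTED / O_DP_GETRAWUNENCRYPTED in
 * dpflags are only valid for read-only opens.
 */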
4282
4283 static int
4284 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4285 int fd, enum uio_seg segflg, int *retval)
4286 {
4287 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4288 struct vnode_attr va;
4289 struct nameidata nd;
4290 int cmode;
4291
4292 VATTR_INIT(&va);
4293 /* Mask off all but regular access permissions */
4294 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4295 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4296
4297 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4298 segflg, path, ctx);
4299
4300 return open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
4301 retval, fd);
4302 }
4303
4304 int
4305 open(proc_t p, struct open_args *uap, int32_t *retval)
4306 {
4307 __pthread_testcancel(1);
4308 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4309 }
4310
4311 int
4312 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4313 int32_t *retval)
4314 {
4315 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4316 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4317 }
4318
4319 int
4320 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4321 int32_t *retval)
4322 {
4323 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4324 uap->mode, uap->fd, UIO_USERSPACE, retval);
4325 }
4326
4327 int
4328 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4329 {
4330 __pthread_testcancel(1);
4331 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4332 }
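
/*
 * Minimal userspace sketch of the open(2)/openat(2) entry points handled
 * above; the directory and file names are hypothetical and error handling
 * is omitted.
 *
 *      #include <fcntl.h>
 *
 *      int dfd = open("/tmp", O_RDONLY | O_DIRECTORY);
 *      int fd1 = openat(dfd, "notes.txt", O_CREAT | O_WRONLY, 0644);
 *      int fd2 = openat(dfd, "/etc/hosts", O_RDONLY);  // absolute: dfd unused
 *
 * A relative path is resolved against dfd (see open1at() above); an
 * absolute path or AT_FDCWD makes openat(2) behave like open(2).
 */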
4333
4334 /*
4335 * openbyid_np: open a file given a file system id and a file system object id
4336 * the hfs file system object id is an fsobj_id_t {uint32, uint32};
4337 * for file systems that don't support object ids, it is a node id (uint64_t).
4338 *
4339 * Parameters: p Process requesting the open
4340 * uap User argument descriptor (see below)
4341 * retval Pointer to an area to receive the
4342 * return value from the system call
4343 *
4344 * Indirect: uap->path Path to open (same as 'open')
4345 *
4346 * uap->fsid id of target file system
4347 * uap->objid id of target file system object
4348 * uap->flags Flags to open (same as 'open')
4349 *
4350 * Returns: 0 Success
4351 * !0 errno value
4352 *
4353 *
4354 * XXX: We should enumerate the possible errno values here, and where
4355 * in the code they originated.
4356 */
4357 int
4358 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
4359 {
4360 fsid_t fsid;
4361 uint64_t objid;
4362 int error;
4363 char *buf = NULL;
4364 int buflen = MAXPATHLEN;
4365 int pathlen = 0;
4366 vfs_context_t ctx = vfs_context_current();
4367
4368 if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
4369 return error;
4370 }
4371
4372 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
4373 return error;
4374 }
4375
4376 /* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
4377 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
4378 return error;
4379 }
4380
4381 AUDIT_ARG(value32, fsid.val[0]);
4382 AUDIT_ARG(value64, objid);
4383
4384 /* resolve path from fsid, objid */
4385 do {
4386 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
4387 if (buf == NULL) {
4388 return ENOMEM;
4389 }
4390
4391 error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
4392 buf, FSOPT_ISREALFSID, &pathlen);
4393
4394 if (error) {
4395 FREE(buf, M_TEMP);
4396 buf = NULL;
4397 }
4398 } while (error == ENOSPC && (buflen += MAXPATHLEN));
4399
4400 if (error) {
4401 return error;
4402 }
4403
4404 buf[pathlen] = 0;
4405
4406 error = openat_internal(
4407 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
4408
4409 FREE(buf, M_TEMP);
4410
4411 return error;
4412 }
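
/*
 * Sketch of the same resolve-then-open pattern from userspace using the
 * public fsgetpath(2) wrapper; the header, availability, and the source
 * of fsid/obj_id (e.g. statfs(2) f_fsid plus a stored inode number) are
 * assumptions, and error handling is omitted.
 *
 *      #include <sys/fsgetpath.h>
 *      #include <fcntl.h>
 *
 *      char path[1024];
 *      ssize_t len = fsgetpath(path, sizeof(path), &fsid, obj_id);
 *      int fd = (len > 0) ? open(path, O_RDONLY) : -1;
 *
 * openbyid_np() above does the equivalent in one call, growing its buffer
 * by MAXPATHLEN and retrying while fsgetpath_internal() returns ENOSPC.
 */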
4413
4414
4415 /*
4416 * Create a special file.
4417 */
4418 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4419
4420 int
4421 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
4422 {
4423 struct vnode_attr va;
4424 vfs_context_t ctx = vfs_context_current();
4425 int error;
4426 struct nameidata nd;
4427 vnode_t vp, dvp;
4428
4429 VATTR_INIT(&va);
4430 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4431 VATTR_SET(&va, va_rdev, uap->dev);
4432
4433 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
4434 if ((uap->mode & S_IFMT) == S_IFIFO) {
4435 return mkfifo1(ctx, uap->path, &va);
4436 }
4437
4438 AUDIT_ARG(mode, uap->mode);
4439 AUDIT_ARG(value32, uap->dev);
4440
4441 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
4442 return error;
4443 }
4444 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
4445 UIO_USERSPACE, uap->path, ctx);
4446 error = namei(&nd);
4447 if (error) {
4448 return error;
4449 }
4450 dvp = nd.ni_dvp;
4451 vp = nd.ni_vp;
4452
4453 if (vp != NULL) {
4454 error = EEXIST;
4455 goto out;
4456 }
4457
4458 switch (uap->mode & S_IFMT) {
4459 case S_IFCHR:
4460 VATTR_SET(&va, va_type, VCHR);
4461 break;
4462 case S_IFBLK:
4463 VATTR_SET(&va, va_type, VBLK);
4464 break;
4465 default:
4466 error = EINVAL;
4467 goto out;
4468 }
4469
4470 #if CONFIG_MACF
4471 error = mac_vnode_check_create(ctx,
4472 nd.ni_dvp, &nd.ni_cnd, &va);
4473 if (error) {
4474 goto out;
4475 }
4476 #endif
4477
4478 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
4479 goto out;
4480 }
4481
4482 if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
4483 goto out;
4484 }
4485
4486 if (vp) {
4487 int update_flags = 0;
4488
4489 // Make sure the name & parent pointers are hooked up
4490 if (vp->v_name == NULL) {
4491 update_flags |= VNODE_UPDATE_NAME;
4492 }
4493 if (vp->v_parent == NULLVP) {
4494 update_flags |= VNODE_UPDATE_PARENT;
4495 }
4496
4497 if (update_flags) {
4498 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4499 }
4500
4501 #if CONFIG_FSE
4502 add_fsevent(FSE_CREATE_FILE, ctx,
4503 FSE_ARG_VNODE, vp,
4504 FSE_ARG_DONE);
4505 #endif
4506 }
4507
4508 out:
4509 /*
4510 * nameidone has to happen before we vnode_put(dvp)
4511 * since it may need to release the fs_nodelock on the dvp
4512 */
4513 nameidone(&nd);
4514
4515 if (vp) {
4516 vnode_put(vp);
4517 }
4518 vnode_put(dvp);
4519
4520 return error;
4521 }
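
/*
 * Minimal userspace sketch of the mknod(2) paths above: an S_IFIFO mode is
 * diverted to mkfifo1(), while S_IFCHR/S_IFBLK require superuser. The
 * paths and device numbers are hypothetical, and makedev() is assumed to
 * come from <sys/types.h>.
 *
 *      #include <sys/types.h>
 *      #include <sys/stat.h>
 *
 *      // same effect as mkfifo("/tmp/example.fifo", 0600)
 *      mknod("/tmp/example.fifo", S_IFIFO | 0600, 0);
 *
 *      // character special file; fails with EPERM for non-root callers
 *      mknod("/tmp/example.chr", S_IFCHR | 0600, makedev(1, 3));
 */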
4522
4523 /*
4524 * Create a named pipe.
4525 *
4526 * Returns: 0 Success
4527 * EEXIST
4528 * namei:???
4529 * vnode_authorize:???
4530 * vn_create:???
4531 */
4532 static int
4533 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4534 {
4535 vnode_t vp, dvp;
4536 int error;
4537 struct nameidata nd;
4538
4539 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4540 UIO_USERSPACE, upath, ctx);
4541 error = namei(&nd);
4542 if (error) {
4543 return error;
4544 }
4545 dvp = nd.ni_dvp;
4546 vp = nd.ni_vp;
4547
4548 /* check that this is a new file and authorize addition */
4549 if (vp != NULL) {
4550 error = EEXIST;
4551 goto out;
4552 }
4553 VATTR_SET(vap, va_type, VFIFO);
4554
4555 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
4556 goto out;
4557 }
4558
4559 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4560 out:
4561 /*
4562 * nameidone has to happen before we vnode_put(dvp)
4563 * since it may need to release the fs_nodelock on the dvp
4564 */
4565 nameidone(&nd);
4566
4567 if (vp) {
4568 vnode_put(vp);
4569 }
4570 vnode_put(dvp);
4571
4572 return error;
4573 }
4574
4575
4576 /*
4577 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4578 *
4579 * Parameters: p Process requesting the open
4580 * uap User argument descriptor (see below)
4581 * retval (Ignored)
4582 *
4583 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4584 * uap->uid UID to set
4585 * uap->gid GID to set
4586 * uap->mode File mode to set (same as 'mkfifo')
4587 * uap->xsecurity ACL to set, if creating
4588 *
4589 * Returns: 0 Success
4590 * !0 errno value
4591 *
4592 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4593 *
4594 * XXX: We should enumerate the possible errno values here, and where
4595 * in the code they originated.
4596 */
4597 int
4598 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4599 {
4600 int ciferror;
4601 kauth_filesec_t xsecdst;
4602 struct vnode_attr va;
4603
4604 AUDIT_ARG(owner, uap->uid, uap->gid);
4605
4606 xsecdst = KAUTH_FILESEC_NONE;
4607 if (uap->xsecurity != USER_ADDR_NULL) {
4608 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4609 return ciferror;
4610 }
4611 }
4612
4613 VATTR_INIT(&va);
4614 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4615 if (uap->uid != KAUTH_UID_NONE) {
4616 VATTR_SET(&va, va_uid, uap->uid);
4617 }
4618 if (uap->gid != KAUTH_GID_NONE) {
4619 VATTR_SET(&va, va_gid, uap->gid);
4620 }
4621 if (xsecdst != KAUTH_FILESEC_NONE) {
4622 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4623 }
4624
4625 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4626
4627 if (xsecdst != KAUTH_FILESEC_NONE) {
4628 kauth_filesec_free(xsecdst);
4629 }
4630 return ciferror;
4631 }
4632
4633 /* ARGSUSED */
4634 int
4635 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4636 {
4637 struct vnode_attr va;
4638
4639 VATTR_INIT(&va);
4640 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4641
4642 return mkfifo1(vfs_context_current(), uap->path, &va);
4643 }
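
/*
 * Minimal userspace sketch of mkfifo(2) followed by a non-blocking open of
 * the resulting FIFO; the path is hypothetical and error handling is
 * omitted.
 *
 *      #include <sys/stat.h>
 *      #include <fcntl.h>
 *
 *      mkfifo("/tmp/example.fifo", 0600);
 *      int rfd = open("/tmp/example.fifo", O_RDONLY | O_NONBLOCK);
 *
 * As with mknod() above, the requested mode is masked by the caller's
 * umask before mkfifo1() creates the VFIFO node.
 */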
4644
4645
4646 static char *
4647 my_strrchr(char *p, int ch)
4648 {
4649 char *save;
4650
4651 for (save = NULL;; ++p) {
4652 if (*p == ch) {
4653 save = p;
4654 }
4655 if (!*p) {
4656 return save;
4657 }
4658 }
4659 /* NOTREACHED */
4660 }
4661
4662 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4663 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4664 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4665
4666 int
4667 safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
4668 {
4669 int ret, len = _len;
4670
4671 *truncated_path = 0;
4672
4673 if (firmlink) {
4674 ret = vn_getpath(dvp, path, &len);
4675 } else {
4676 ret = vn_getpath_no_firmlink(dvp, path, &len);
4677 }
4678 if (ret == 0 && len < (MAXPATHLEN - 1)) {
4679 if (leafname) {
4680 path[len - 1] = '/';
4681 len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
4682 if (len > MAXPATHLEN) {
4683 char *ptr;
4684
4685 // the string got truncated!
4686 *truncated_path = 1;
4687 ptr = my_strrchr(path, '/');
4688 if (ptr) {
4689 *ptr = '\0'; // chop off the string at the last directory component
4690 }
4691 len = strlen(path) + 1;
4692 }
4693 }
4694 } else if (ret == 0) {
4695 *truncated_path = 1;
4696 } else if (ret != 0) {
4697 struct vnode *mydvp = dvp;
4698
4699 if (ret != ENOSPC) {
4700 printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4701 dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4702 }
4703 *truncated_path = 1;
4704
4705 do {
4706 if (mydvp->v_parent != NULL) {
4707 mydvp = mydvp->v_parent;
4708 } else if (mydvp->v_mount) {
4709 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4710 break;
4711 } else {
4712 // no parent and no mount point? only thing is to punt and say "/" changed
4713 strlcpy(path, "/", _len);
4714 len = 2;
4715 mydvp = NULL;
4716 }
4717
4718 if (mydvp == NULL) {
4719 break;
4720 }
4721
4722 len = _len;
4723 if (firmlink) {
4724 ret = vn_getpath(mydvp, path, &len);
4725 } else {
4726 ret = vn_getpath_no_firmlink(mydvp, path, &len);
4727 }
4728 } while (ret == ENOSPC);
4729 }
4730
4731 return len;
4732 }
4733
4734 int
4735 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4736 {
4737 return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
4738 }
4739
4740 int
4741 safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4742 {
4743 return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
4744 }
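
/*
 * Sketch of how callers later in this file use these helpers (see
 * linkat_internal() and unlinkat_internal() below); the local names are
 * illustrative only.
 *
 *      char *pathbuf;
 *      int truncated = 0, len;
 *
 *      GET_PATH(pathbuf);
 *      len = safe_getpath(dvp, cnp->cn_nameptr, pathbuf, MAXPATHLEN,
 *          &truncated);
 *      // len counts the terminating NUL; truncated is set when the
 *      // result was clipped back to a parent directory because the
 *      // full path did not fit.
 *      RELEASE_PATH(pathbuf);
 */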
4745
4746 /*
4747 * Make a hard file link.
4748 *
4749 * Returns: 0 Success
4750 * EPERM
4751 * EEXIST
4752 * EXDEV
4753 * namei:???
4754 * vnode_authorize:???
4755 * VNOP_LINK:???
4756 */
4757 /* ARGSUSED */
4758 static int
4759 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4760 user_addr_t link, int flag, enum uio_seg segflg)
4761 {
4762 vnode_t vp, pvp, dvp, lvp;
4763 struct nameidata nd;
4764 int follow;
4765 int error;
4766 #if CONFIG_FSE
4767 fse_info finfo;
4768 #endif
4769 int need_event, has_listeners, need_kpath2;
4770 char *target_path = NULL;
4771 int truncated = 0;
4772
4773 vp = dvp = lvp = NULLVP;
4774
4775 /* look up the object we are linking to */
4776 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4777 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4778 segflg, path, ctx);
4779
4780 error = nameiat(&nd, fd1);
4781 if (error) {
4782 return error;
4783 }
4784 vp = nd.ni_vp;
4785
4786 nameidone(&nd);
4787
4788 /*
4789 * Normally, linking to directories is not supported.
4790 * However, some file systems may have limited support.
4791 */
4792 if (vp->v_type == VDIR) {
4793 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
4794 error = EPERM; /* POSIX */
4795 goto out;
4796 }
4797
4798 /* Linking to a directory requires ownership. */
4799 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4800 struct vnode_attr dva;
4801
4802 VATTR_INIT(&dva);
4803 VATTR_WANTED(&dva, va_uid);
4804 if (vnode_getattr(vp, &dva, ctx) != 0 ||
4805 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4806 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4807 error = EACCES;
4808 goto out;
4809 }
4810 }
4811 }
4812
4813 /* lookup the target node */
4814 #if CONFIG_TRIGGERS
4815 nd.ni_op = OP_LINK;
4816 #endif
4817 nd.ni_cnd.cn_nameiop = CREATE;
4818 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4819 nd.ni_dirp = link;
4820 error = nameiat(&nd, fd2);
4821 if (error != 0) {
4822 goto out;
4823 }
4824 dvp = nd.ni_dvp;
4825 lvp = nd.ni_vp;
4826
4827 #if CONFIG_MACF
4828 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
4829 goto out2;
4830 }
4831 #endif
4832
4833 /* don't allow linking to anything that kauth doesn't want us to (eg. immutable items) */
4834 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
4835 goto out2;
4836 }
4837
4838 /* target node must not exist */
4839 if (lvp != NULLVP) {
4840 error = EEXIST;
4841 goto out2;
4842 }
4843 /* cannot link across mountpoints */
4844 if (vnode_mount(vp) != vnode_mount(dvp)) {
4845 error = EXDEV;
4846 goto out2;
4847 }
4848
4849 /* authorize creation of the target node */
4850 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
4851 goto out2;
4852 }
4853
4854 /* and finally make the link */
4855 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4856 if (error) {
4857 goto out2;
4858 }
4859
4860 #if CONFIG_MACF
4861 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4862 #endif
4863
4864 #if CONFIG_FSE
4865 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4866 #else
4867 need_event = 0;
4868 #endif
4869 has_listeners = kauth_authorize_fileop_has_listeners();
4870
4871 need_kpath2 = 0;
4872 #if CONFIG_AUDIT
4873 if (AUDIT_RECORD_EXISTS()) {
4874 need_kpath2 = 1;
4875 }
4876 #endif
4877
4878 if (need_event || has_listeners || need_kpath2) {
4879 char *link_to_path = NULL;
4880 int len, link_name_len;
4881
4882 /* build the path to the new link file */
4883 GET_PATH(target_path);
4884 if (target_path == NULL) {
4885 error = ENOMEM;
4886 goto out2;
4887 }
4888
4889 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4890
4891 AUDIT_ARG(kpath, target_path, ARG_KPATH2);
4892
4893 if (has_listeners) {
4894 /* build the path to file we are linking to */
4895 GET_PATH(link_to_path);
4896 if (link_to_path == NULL) {
4897 error = ENOMEM;
4898 goto out2;
4899 }
4900
4901 link_name_len = MAXPATHLEN;
4902 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4903 /*
4904 * Call out to allow 3rd party notification of rename.
4905 * Ignore result of kauth_authorize_fileop call.
4906 */
4907 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4908 (uintptr_t)link_to_path,
4909 (uintptr_t)target_path);
4910 }
4911 if (link_to_path != NULL) {
4912 RELEASE_PATH(link_to_path);
4913 }
4914 }
4915 #if CONFIG_FSE
4916 if (need_event) {
4917 /* construct fsevent */
4918 if (get_fse_info(vp, &finfo, ctx) == 0) {
4919 if (truncated) {
4920 finfo.mode |= FSE_TRUNCATED_PATH;
4921 }
4922
4923 // build the path to the destination of the link
4924 add_fsevent(FSE_CREATE_FILE, ctx,
4925 FSE_ARG_STRING, len, target_path,
4926 FSE_ARG_FINFO, &finfo,
4927 FSE_ARG_DONE);
4928 }
4929
4930 pvp = vp->v_parent;
4931 // need an iocount on pvp in this case
4932 if (pvp && pvp != dvp) {
4933 error = vnode_get(pvp);
4934 if (error) {
4935 pvp = NULLVP;
4936 error = 0;
4937 }
4938 }
4939 if (pvp) {
4940 add_fsevent(FSE_STAT_CHANGED, ctx,
4941 FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
4942 }
4943 if (pvp && pvp != dvp) {
4944 vnode_put(pvp);
4945 }
4946 }
4947 #endif
4948 }
4949 out2:
4950 /*
4951 * nameidone has to happen before we vnode_put(dvp)
4952 * since it may need to release the fs_nodelock on the dvp
4953 */
4954 nameidone(&nd);
4955 if (target_path != NULL) {
4956 RELEASE_PATH(target_path);
4957 }
4958 out:
4959 if (lvp) {
4960 vnode_put(lvp);
4961 }
4962 if (dvp) {
4963 vnode_put(dvp);
4964 }
4965 vnode_put(vp);
4966 return error;
4967 }
4968
4969 int
4970 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4971 {
4972 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4973 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
4974 }
4975
4976 int
4977 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4978 {
4979 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
4980 return EINVAL;
4981 }
4982
4983 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4984 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
4985 }
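
/*
 * Minimal userspace sketch of link(2)/linkat(2) as serviced above; the
 * paths are hypothetical and error handling is omitted.
 *
 *      #include <unistd.h>
 *      #include <fcntl.h>
 *
 *      link("/tmp/data.txt", "/tmp/data.hardlink");
 *
 *      // follow a symlink at the source before linking
 *      linkat(AT_FDCWD, "/tmp/data.symlink", AT_FDCWD,
 *          "/tmp/data.hardlink2", AT_SYMLINK_FOLLOW);
 *
 * Both calls fail with EXDEV when source and target are on different
 * mounts and with EEXIST when the target name already exists, matching
 * the checks in linkat_internal().
 */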
4986
4987 /*
4988 * Make a symbolic link.
4989 *
4990 * We could add support for ACLs here too...
4991 */
4992 /* ARGSUSED */
4993 static int
4994 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4995 user_addr_t link, enum uio_seg segflg)
4996 {
4997 struct vnode_attr va;
4998 char *path;
4999 int error;
5000 struct nameidata nd;
5001 vnode_t vp, dvp;
5002 size_t dummy = 0;
5003 proc_t p;
5004
5005 error = 0;
5006 if (UIO_SEG_IS_USER_SPACE(segflg)) {
5007 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
5008 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
5009 } else {
5010 path = (char *)path_data;
5011 }
5012 if (error) {
5013 goto out;
5014 }
5015 AUDIT_ARG(text, path); /* This is the link string */
5016
5017 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
5018 segflg, link, ctx);
5019
5020 error = nameiat(&nd, fd);
5021 if (error) {
5022 goto out;
5023 }
5024 dvp = nd.ni_dvp;
5025 vp = nd.ni_vp;
5026
5027 p = vfs_context_proc(ctx);
5028 VATTR_INIT(&va);
5029 VATTR_SET(&va, va_type, VLNK);
5030 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
5031
5032 #if CONFIG_MACF
5033 error = mac_vnode_check_create(ctx,
5034 dvp, &nd.ni_cnd, &va);
5035 #endif
5036 if (error != 0) {
5037 goto skipit;
5038 }
5039
5040 if (vp != NULL) {
5041 error = EEXIST;
5042 goto skipit;
5043 }
5044
5045 /* authorize */
5046 if (error == 0) {
5047 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
5048 }
5049 /* get default ownership, etc. */
5050 if (error == 0) {
5051 error = vnode_authattr_new(dvp, &va, 0, ctx);
5052 }
5053 if (error == 0) {
5054 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
5055 }
5056
5057 #if CONFIG_MACF
5058 if (error == 0 && vp) {
5059 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
5060 }
5061 #endif
5062
5063 /* do fallback attribute handling */
5064 if (error == 0 && vp) {
5065 error = vnode_setattr_fallback(vp, &va, ctx);
5066 }
5067
5068 if (error == 0) {
5069 int update_flags = 0;
5070
5071 /* check if a new vnode was created, else try to get one */
5072 if (vp == NULL) {
5073 nd.ni_cnd.cn_nameiop = LOOKUP;
5074 #if CONFIG_TRIGGERS
5075 nd.ni_op = OP_LOOKUP;
5076 #endif
5077 nd.ni_cnd.cn_flags = 0;
5078 error = nameiat(&nd, fd);
5079 vp = nd.ni_vp;
5080
5081 if (vp == NULL) {
5082 goto skipit;
5083 }
5084 }
5085
5086 #if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
5087 /* call out to allow 3rd party notification of rename.
5088 * Ignore result of kauth_authorize_fileop call.
5089 */
5090 if (kauth_authorize_fileop_has_listeners() &&
5091 namei(&nd) == 0) {
5092 char *new_link_path = NULL;
5093 int len;
5094
5095 /* build the path to the new link file */
5096 new_link_path = get_pathbuff();
5097 len = MAXPATHLEN;
5098 vn_getpath(dvp, new_link_path, &len);
5099 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
5100 new_link_path[len - 1] = '/';
5101 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
5102 }
5103
5104 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
5105 (uintptr_t)path, (uintptr_t)new_link_path);
5106 if (new_link_path != NULL) {
5107 release_pathbuff(new_link_path);
5108 }
5109 }
5110 #endif
5111 // Make sure the name & parent pointers are hooked up
5112 if (vp->v_name == NULL) {
5113 update_flags |= VNODE_UPDATE_NAME;
5114 }
5115 if (vp->v_parent == NULLVP) {
5116 update_flags |= VNODE_UPDATE_PARENT;
5117 }
5118
5119 if (update_flags) {
5120 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
5121 }
5122
5123 #if CONFIG_FSE
5124 add_fsevent(FSE_CREATE_FILE, ctx,
5125 FSE_ARG_VNODE, vp,
5126 FSE_ARG_DONE);
5127 #endif
5128 }
5129
5130 skipit:
5131 /*
5132 * nameidone has to happen before we vnode_put(dvp)
5133 * since it may need to release the fs_nodelock on the dvp
5134 */
5135 nameidone(&nd);
5136
5137 if (vp) {
5138 vnode_put(vp);
5139 }
5140 vnode_put(dvp);
5141 out:
5142 if (path && (path != (char *)path_data)) {
5143 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
5144 }
5145
5146 return error;
5147 }
5148
5149 int
5150 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5151 {
5152 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5153 uap->link, UIO_USERSPACE);
5154 }
5155
5156 int
5157 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5158 __unused int32_t *retval)
5159 {
5160 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5161 uap->path2, UIO_USERSPACE);
5162 }
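
/*
 * Minimal userspace sketch of symlink(2)/symlinkat(2); the names and the
 * directory fd are hypothetical.
 *
 *      #include <unistd.h>
 *
 *      symlink("data.txt", "/tmp/data.symlink");
 *      symlinkat("data.txt", dfd, "data.symlink");
 *
 * Note the argument order: the link contents come first and the new link
 * name second, which is why symlinkat_internal() copies in path_data (the
 * target string) before doing the CREATE lookup on link.
 */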
5163
5164 /*
5165 * Delete a whiteout from the filesystem.
5166 * No longer supported.
5167 */
5168 int
5169 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5170 {
5171 return ENOTSUP;
5172 }
5173
5174 /*
5175 * Delete a name from the filesystem.
5176 */
5177 /* ARGSUSED */
5178 static int
5179 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
5180 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
5181 {
5182 struct nameidata nd;
5183 vnode_t vp, dvp;
5184 int error;
5185 struct componentname *cnp;
5186 char *path = NULL;
5187 char *no_firmlink_path = NULL;
5188 int len_path = 0;
5189 int len_no_firmlink_path = 0;
5190 #if CONFIG_FSE
5191 fse_info finfo;
5192 struct vnode_attr va;
5193 #endif
5194 int flags;
5195 int need_event;
5196 int has_listeners;
5197 int truncated_path;
5198 int truncated_no_firmlink_path;
5199 int batched;
5200 struct vnode_attr *vap;
5201 int do_retry;
5202 int retry_count = 0;
5203 int cn_flags;
5204
5205 cn_flags = LOCKPARENT;
5206 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
5207 cn_flags |= AUDITVNPATH1;
5208 }
5209 /* If a starting dvp is passed, it trumps any fd passed. */
5210 if (start_dvp) {
5211 cn_flags |= USEDVP;
5212 }
5213
5214 #if NAMEDRSRCFORK
5215 /* unlink or delete is allowed on rsrc forks and named streams */
5216 cn_flags |= CN_ALLOWRSRCFORK;
5217 #endif
5218
5219 retry:
5220 do_retry = 0;
5221 flags = 0;
5222 need_event = 0;
5223 has_listeners = 0;
5224 truncated_path = 0;
5225 truncated_no_firmlink_path = 0;
5226 vap = NULL;
5227
5228 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
5229
5230 nd.ni_dvp = start_dvp;
5231 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
5232 cnp = &nd.ni_cnd;
5233
5234 continue_lookup:
5235 error = nameiat(&nd, fd);
5236 if (error) {
5237 return error;
5238 }
5239
5240 dvp = nd.ni_dvp;
5241 vp = nd.ni_vp;
5242
5243
5244 /* With Carbon delete semantics, busy files cannot be deleted */
5245 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
5246 flags |= VNODE_REMOVE_NODELETEBUSY;
5247 }
5248
5249 /* Skip any potential upcalls if told to. */
5250 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
5251 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
5252 }
5253
5254 if (vp) {
5255 batched = vnode_compound_remove_available(vp);
5256 /*
5257 * The root of a mounted filesystem cannot be deleted.
5258 */
5259 if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
5260 error = EBUSY;
5261 goto out;
5262 }
5263
5264 #if DEVELOPMENT || DEBUG
5265 /*
5266 * XXX VSWAP: Check for entitlements or special flag here
5267 * so we can restrict access appropriately.
5268 */
5269 #else /* DEVELOPMENT || DEBUG */
5270
5271 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
5272 error = EPERM;
5273 goto out;
5274 }
5275 #endif /* DEVELOPMENT || DEBUG */
5276
5277 if (!batched) {
5278 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
5279 if (error) {
5280 if (error == ENOENT) {
5281 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5282 do_retry = 1;
5283 retry_count++;
5284 }
5285 }
5286 goto out;
5287 }
5288 }
5289 } else {
5290 batched = 1;
5291
5292 if (!vnode_compound_remove_available(dvp)) {
5293 panic("No vp, but no compound remove?");
5294 }
5295 }
5296
5297 #if CONFIG_FSE
5298 need_event = need_fsevent(FSE_DELETE, dvp);
5299 if (need_event) {
5300 if (!batched) {
5301 if ((vp->v_flag & VISHARDLINK) == 0) {
5302 /* XXX need to get these data in batched VNOP */
5303 get_fse_info(vp, &finfo, ctx);
5304 }
5305 } else {
5306 error = vfs_get_notify_attributes(&va);
5307 if (error) {
5308 goto out;
5309 }
5310
5311 vap = &va;
5312 }
5313 }
5314 #endif
5315 has_listeners = kauth_authorize_fileop_has_listeners();
5316 if (need_event || has_listeners) {
5317 if (path == NULL) {
5318 GET_PATH(path);
5319 if (path == NULL) {
5320 error = ENOMEM;
5321 goto out;
5322 }
5323 }
5324 len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
5325 if (no_firmlink_path == NULL) {
5326 GET_PATH(no_firmlink_path);
5327 if (no_firmlink_path == NULL) {
5328 error = ENOMEM;
5329 goto out;
5330 }
5331 }
5332 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
5333 }
5334
5335 #if NAMEDRSRCFORK
5336 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
5337 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
5338 } else
5339 #endif
5340 {
5341 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
5342 vp = nd.ni_vp;
5343 if (error == EKEEPLOOKING) {
5344 if (!batched) {
5345 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
5346 }
5347
5348 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
5349 panic("EKEEPLOOKING, but continue flag not set?");
5350 }
5351
5352 if (vnode_isdir(vp)) {
5353 error = EISDIR;
5354 goto out;
5355 }
5356 goto continue_lookup;
5357 } else if (error == ENOENT && batched) {
5358 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
5359 /*
5360 * For compound VNOPs, the authorization callback may
5361 * return ENOENT in case of racing hardlink lookups
5362 * hitting the name cache, redrive the lookup.
5363 */
5364 do_retry = 1;
5365 retry_count += 1;
5366 goto out;
5367 }
5368 }
5369 }
5370
5371 /*
5372 * Call out to allow 3rd party notification of delete.
5373 * Ignore result of kauth_authorize_fileop call.
5374 */
5375 if (!error) {
5376 if (has_listeners) {
5377 kauth_authorize_fileop(vfs_context_ucred(ctx),
5378 KAUTH_FILEOP_DELETE,
5379 (uintptr_t)vp,
5380 (uintptr_t)path);
5381 }
5382
5383 if (vp->v_flag & VISHARDLINK) {
5384 //
5385 // if a hardlink gets deleted we want to blow away the
5386 // v_parent link because the path that got us to this
5387 // instance of the link is no longer valid. this will
5388 // force the next call to get the path to ask the file
5389 // system instead of just following the v_parent link.
5390 //
5391 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
5392 }
5393
5394 #if CONFIG_FSE
5395 if (need_event) {
5396 if (vp->v_flag & VISHARDLINK) {
5397 get_fse_info(vp, &finfo, ctx);
5398 } else if (vap) {
5399 vnode_get_fse_info_from_vap(vp, &finfo, vap);
5400 }
5401 if (truncated_path) {
5402 finfo.mode |= FSE_TRUNCATED_PATH;
5403 }
5404 add_fsevent(FSE_DELETE, ctx,
5405 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
5406 FSE_ARG_FINFO, &finfo,
5407 FSE_ARG_DONE);
5408 }
5409 #endif
5410 }
5411
5412 out:
5413 if (path != NULL) {
5414 RELEASE_PATH(path);
5415 path = NULL;
5416 }
5417
5418 if (no_firmlink_path != NULL) {
5419 RELEASE_PATH(no_firmlink_path);
5420 no_firmlink_path = NULL;
5421 }
5422 #if NAMEDRSRCFORK
5423 /* recycle the deleted rsrc fork vnode to force a reclaim, which
5424 * will cause its shadow file to go away if necessary.
5425 */
5426 if (vp && (vnode_isnamedstream(vp)) &&
5427 (vp->v_parent != NULLVP) &&
5428 vnode_isshadow(vp)) {
5429 vnode_recycle(vp);
5430 }
5431 #endif
5432 /*
5433 * nameidone has to happen before we vnode_put(dvp)
5434 * since it may need to release the fs_nodelock on the dvp
5435 */
5436 nameidone(&nd);
5437 vnode_put(dvp);
5438 if (vp) {
5439 vnode_put(vp);
5440 }
5441
5442 if (do_retry) {
5443 goto retry;
5444 }
5445
5446 return error;
5447 }
5448
5449 int
5450 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5451 enum uio_seg segflg, int unlink_flags)
5452 {
5453 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5454 unlink_flags);
5455 }
5456
5457 /*
5458 * Delete a name from the filesystem using Carbon semantics.
5459 */
5460 int
5461 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5462 {
5463 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5464 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5465 }
5466
5467 /*
5468 * Delete a name from the filesystem using POSIX semantics.
5469 */
5470 int
5471 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5472 {
5473 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5474 uap->path, UIO_USERSPACE, 0);
5475 }
5476
5477 int
5478 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5479 {
5480 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5481 return EINVAL;
5482 }
5483
5484 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5485 int unlink_flags = 0;
5486
5487 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5488 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5489 }
5490 return rmdirat_internal(vfs_context_current(), uap->fd,
5491 uap->path, UIO_USERSPACE, unlink_flags);
5492 } else {
5493 return unlinkat_internal(vfs_context_current(), uap->fd,
5494 NULLVP, uap->path, UIO_USERSPACE, 0);
5495 }
5496 }
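
/*
 * Minimal userspace sketch of the unlink(2)/unlinkat(2) paths above; the
 * names and directory fd are hypothetical and error handling is omitted.
 *
 *      #include <unistd.h>
 *      #include <fcntl.h>
 *
 *      unlink("/tmp/data.txt");
 *      unlinkat(dfd, "data.txt", 0);            // file, relative to dfd
 *      unlinkat(dfd, "scratch", AT_REMOVEDIR);  // directory: rmdir path
 *
 * AT_REMOVEDIR diverts to rmdirat_internal() rather than the
 * unlinkat_internal() file path.
 */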
5497
5498 /*
5499 * Reposition read/write file offset.
5500 */
5501 int
5502 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
5503 {
5504 struct fileproc *fp;
5505 vnode_t vp;
5506 struct vfs_context *ctx;
5507 off_t offset = uap->offset, file_size;
5508 int error;
5509
5510 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
5511 if (error == ENOTSUP) {
5512 return ESPIPE;
5513 }
5514 return error;
5515 }
5516 if (vnode_isfifo(vp)) {
5517 file_drop(uap->fd);
5518 return ESPIPE;
5519 }
5520
5521
5522 ctx = vfs_context_current();
5523 #if CONFIG_MACF
5524 if (uap->whence == L_INCR && uap->offset == 0) {
5525 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
5526 fp->f_fglob);
5527 } else {
5528 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
5529 fp->f_fglob);
5530 }
5531 if (error) {
5532 file_drop(uap->fd);
5533 return error;
5534 }
5535 #endif
5536 if ((error = vnode_getwithref(vp))) {
5537 file_drop(uap->fd);
5538 return error;
5539 }
5540
5541 switch (uap->whence) {
5542 case L_INCR:
5543 offset += fp->f_fglob->fg_offset;
5544 break;
5545 case L_XTND:
5546 if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
5547 break;
5548 }
5549 offset += file_size;
5550 break;
5551 case L_SET:
5552 break;
5553 case SEEK_HOLE:
5554 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5555 break;
5556 case SEEK_DATA:
5557 error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5558 break;
5559 default:
5560 error = EINVAL;
5561 }
5562 if (error == 0) {
5563 if (uap->offset > 0 && offset < 0) {
5564 /* Incremented/relative move past max size */
5565 error = EOVERFLOW;
5566 } else {
5567 /*
5568 * Allow negative offsets on character devices, per
5569 * POSIX 1003.1-2001. Most likely for writing disk
5570 * labels.
5571 */
5572 if (offset < 0 && vp->v_type != VCHR) {
5573 /* Decremented/relative move before start */
5574 error = EINVAL;
5575 } else {
5576 /* Success */
5577 fp->f_fglob->fg_offset = offset;
5578 *retval = fp->f_fglob->fg_offset;
5579 }
5580 }
5581 }
5582
5583 /*
5584 * An lseek can affect whether data is "available to read." Use
5585 * hint of NOTE_NONE so no EVFILT_VNODE events fire
5586 */
5587 post_event_if_success(vp, error, NOTE_NONE);
5588 (void)vnode_put(vp);
5589 file_drop(uap->fd);
5590 return error;
5591 }
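
/*
 * Minimal userspace sketch of the lseek(2) whence values handled above,
 * including the SEEK_HOLE/SEEK_DATA cases forwarded to the filesystem via
 * VNOP_IOCTL; fd is hypothetical and SEEK_HOLE/SEEK_DATA availability
 * depends on the SDK.
 *
 *      #include <unistd.h>
 *
 *      off_t end  = lseek(fd, 0, SEEK_END);      // L_XTND path
 *      off_t cur  = lseek(fd, 0, SEEK_CUR);      // L_INCR path
 *      off_t hole = lseek(fd, 0, SEEK_HOLE);     // first hole at/after 0
 *      off_t data = lseek(fd, hole, SEEK_DATA);  // next data after hole
 *
 * A negative resulting offset is rejected with EINVAL except on character
 * devices, and a relative move past the maximum offset returns EOVERFLOW,
 * as in the checks above.
 */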
5592
5593
5594 /*
5595 * Check access permissions.
5596 *
5597 * Returns: 0 Success
5598 * vnode_authorize:???
5599 */
5600 static int
5601 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5602 {
5603 kauth_action_t action;
5604 int error;
5605
5606 /*
5607 * If just the regular access bits, convert them to something
5608 * that vnode_authorize will understand.
5609 */
5610 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5611 action = 0;
5612 if (uflags & R_OK) {
5613 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5614 }
5615 if (uflags & W_OK) {
5616 if (vnode_isdir(vp)) {
5617 action |= KAUTH_VNODE_ADD_FILE |
5618 KAUTH_VNODE_ADD_SUBDIRECTORY;
5619 /* might want delete rights here too */
5620 } else {
5621 action |= KAUTH_VNODE_WRITE_DATA;
5622 }
5623 }
5624 if (uflags & X_OK) {
5625 if (vnode_isdir(vp)) {
5626 action |= KAUTH_VNODE_SEARCH;
5627 } else {
5628 action |= KAUTH_VNODE_EXECUTE;
5629 }
5630 }
5631 } else {
5632 /* take advantage of definition of uflags */
5633 action = uflags >> 8;
5634 }
5635
5636 #if CONFIG_MACF
5637 error = mac_vnode_check_access(ctx, vp, uflags);
5638 if (error) {
5639 return error;
5640 }
5641 #endif /* MAC */
5642
5643 /* action == 0 means only check for existence */
5644 if (action != 0) {
5645 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5646 } else {
5647 error = 0;
5648 }
5649
5650 return error;
5651 }
5652
5653
5654
5655 /*
5656 * access_extended: Check access permissions in bulk.
5657 *
5658 * Description: uap->entries Pointer to an array of accessx
5659 * descriptor structs, plus one or
5660 * more NULL terminated strings (see
5661 * "Notes" section below).
5662 * uap->size Size of the area pointed to by
5663 * uap->entries.
5664 * uap->results Pointer to the results array.
5665 *
5666 * Returns: 0 Success
5667 * ENOMEM Insufficient memory
5668 * EINVAL Invalid arguments
5669 * namei:EFAULT Bad address
5670 * namei:ENAMETOOLONG Filename too long
5671 * namei:ENOENT No such file or directory
5672 * namei:ELOOP Too many levels of symbolic links
5673 * namei:EBADF Bad file descriptor
5674 * namei:ENOTDIR Not a directory
5675 * namei:???
5676 * access1:
5677 *
5678 * Implicit returns:
5679 * uap->results Array contents modified
5680 *
5681 * Notes: The uap->entries are structured as an arbitrary length array
5682 * of accessx descriptors, followed by one or more NULL terminated
5683 * strings
5684 *
5685 * struct accessx_descriptor[0]
5686 * ...
5687 * struct accessx_descriptor[n]
5688 * char name_data[0];
5689 *
5690 * We determine the entry count by walking the buffer containing
5691 * the uap->entries argument descriptor. For each descriptor we
5692 * see, the valid values for the offset ad_name_offset will be
5693 * in the byte range:
5694 *
5695 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5696 * to
5697 * [ uap->entries + uap->size - 2 ]
5698 *
5699 * since we must have at least one string, and the string must
5700 * be at least one character plus the NULL terminator in length.
5701 *
5702 * XXX: Need to support the check-as uid argument
5703 */
5704 int
5705 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5706 {
5707 struct accessx_descriptor *input = NULL;
5708 errno_t *result = NULL;
5709 errno_t error = 0;
5710 int wantdelete = 0;
5711 unsigned int desc_max, desc_actual, i, j;
5712 struct vfs_context context;
5713 struct nameidata nd;
5714 int niopts;
5715 vnode_t vp = NULL;
5716 vnode_t dvp = NULL;
5717 #define ACCESSX_MAX_DESCR_ON_STACK 10
5718 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5719
5720 context.vc_ucred = NULL;
5721
5722 /*
5723 * Validate parameters; if valid, copy the descriptor array and string
5724 * arguments into local memory. Before proceeding, the following
5725 * conditions must have been met:
5726 *
5727 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5728 * o There must be sufficient room in the request for at least one
5729 * descriptor and a one byte NUL terminated string.
5730 * o The allocation of local storage must not fail.
5731 */
5732 if (uap->size > ACCESSX_MAX_TABLESIZE) {
5733 return ENOMEM;
5734 }
5735 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
5736 return EINVAL;
5737 }
5738 if (uap->size <= sizeof(stack_input)) {
5739 input = stack_input;
5740 } else {
5741 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5742 if (input == NULL) {
5743 error = ENOMEM;
5744 goto out;
5745 }
5746 }
5747 error = copyin(uap->entries, input, uap->size);
5748 if (error) {
5749 goto out;
5750 }
5751
5752 AUDIT_ARG(opaque, input, uap->size);
5753
5754 /*
5755 * Force NUL termination of the copyin buffer to avoid namei() running
5756 * off the end. If the caller passes us bogus data, they may get a
5757 * bogus result.
5758 */
5759 ((char *)input)[uap->size - 1] = 0;
5760
5761 /*
5762 * Access is defined as checking against the process' real identity,
5763 * even if operations are checking the effective identity. This
5764 * requires that we use a local vfs context.
5765 */
5766 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5767 context.vc_thread = current_thread();
5768
5769 /*
5770 * Find out how many entries we have, so we can allocate the result
5771 * array by walking the list and adjusting the count downward by the
5772 * earliest string offset we see.
5773 */
5774 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5775 desc_actual = desc_max;
5776 for (i = 0; i < desc_actual; i++) {
5777 /*
5778 * Take the offset to the name string for this entry and
5779 * convert to an input array index, which would be one off
5780 * the end of the array if this entry was the lowest-addressed
5781 * name string.
5782 */
5783 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5784
5785 /*
5786 * An offset greater than the max allowable offset is an error.
5787 * It is also an error for any valid entry to point
5788 * to a location prior to the end of the current entry, if
5789 * it's not a reference to the string of the previous entry.
5790 */
5791 if (j > desc_max || (j != 0 && j <= i)) {
5792 error = EINVAL;
5793 goto out;
5794 }
5795
5796 /* Also do not let ad_name_offset point to something beyond the size of the input */
5797 if (input[i].ad_name_offset >= uap->size) {
5798 error = EINVAL;
5799 goto out;
5800 }
5801
5802 /*
5803 * An offset of 0 means use the previous descriptor's offset;
5804 * this is used to chain multiple requests for the same file
5805 * to avoid multiple lookups.
5806 */
5807 if (j == 0) {
5808 /* This is not valid for the first entry */
5809 if (i == 0) {
5810 error = EINVAL;
5811 goto out;
5812 }
5813 continue;
5814 }
5815
5816 /*
5817 * If the offset of the string for this descriptor is before
5818 * what we believe is the current actual last descriptor,
5819 * then we need to adjust our estimate downward; this permits
5820 * the string table following the last descriptor to be out
5821 * of order relative to the descriptor list.
5822 */
5823 if (j < desc_actual) {
5824 desc_actual = j;
5825 }
5826 }
5827
5828 /*
5829 * We limit the actual number of descriptors we are willing to process
5830 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5831 * requested exceeds this limit, we fail the request with ENOMEM.
5832 */
5833 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5834 error = ENOMEM;
5835 goto out;
5836 }
5837 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5838 if (result == NULL) {
5839 error = ENOMEM;
5840 goto out;
5841 }
5842
5843 /*
5844 * Do the work by iterating over the descriptor entries we know to
5845 * at least appear to contain valid data.
5846 */
5847 error = 0;
5848 for (i = 0; i < desc_actual; i++) {
5849 /*
5850 * If the ad_name_offset is 0, then we use the previous
5851 * results to make the check; otherwise, we are looking up
5852 * a new file name.
5853 */
5854 if (input[i].ad_name_offset != 0) {
5855 /* discard old vnodes */
5856 if (vp) {
5857 vnode_put(vp);
5858 vp = NULL;
5859 }
5860 if (dvp) {
5861 vnode_put(dvp);
5862 dvp = NULL;
5863 }
5864
5865 /*
5866 * Scan forward in the descriptor list to see if we
5867 * need the parent vnode. We will need it if we are
5868 * deleting, since we must have rights to remove
5869 * entries in the parent directory, as well as the
5870 * rights to delete the object itself.
5871 */
5872 wantdelete = input[i].ad_flags & _DELETE_OK;
5873 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
5874 if (input[j].ad_flags & _DELETE_OK) {
5875 wantdelete = 1;
5876 }
5877 }
5878
5879 niopts = FOLLOW | AUDITVNPATH1;
5880
5881 /* need parent for vnode_authorize for deletion test */
5882 if (wantdelete) {
5883 niopts |= WANTPARENT;
5884 }
5885
5886 /* do the lookup */
5887 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5888 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5889 &context);
5890 error = namei(&nd);
5891 if (!error) {
5892 vp = nd.ni_vp;
5893 if (wantdelete) {
5894 dvp = nd.ni_dvp;
5895 }
5896 }
5897 nameidone(&nd);
5898 }
5899
5900 /*
5901 * Handle lookup errors.
5902 */
5903 switch (error) {
5904 case ENOENT:
5905 case EACCES:
5906 case EPERM:
5907 case ENOTDIR:
5908 result[i] = error;
5909 break;
5910 case 0:
5911 /* run this access check */
5912 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5913 break;
5914 default:
5915 /* fatal lookup error */
5916
5917 goto out;
5918 }
5919 }
5920
5921 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5922
5923 /* copy out results */
5924 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5925
5926 out:
5927 if (input && input != stack_input) {
5928 FREE(input, M_TEMP);
5929 }
5930 if (result) {
5931 FREE(result, M_TEMP);
5932 }
5933 if (vp) {
5934 vnode_put(vp);
5935 }
5936 if (dvp) {
5937 vnode_put(dvp);
5938 }
5939 if (IS_VALID_CRED(context.vc_ucred)) {
5940 kauth_cred_unref(&context.vc_ucred);
5941 }
5942 return error;
5943 }
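
/*
 * Sketch of the buffer layout access_extended() expects, as a userspace
 * caller of accessx_np() might build it; the accessx_np() prototype and
 * struct accessx_descriptor field names are assumptions taken from
 * <unistd.h>/<sys/unistd.h>, and the path is hypothetical. Two checks
 * against one path, using the ad_name_offset == 0 chaining described
 * above:
 *
 *      struct accessx_descriptor *d;
 *      char buf[2 * sizeof(*d) + 32];
 *      int results[2];
 *
 *      memset(buf, 0, sizeof(buf));
 *      d = (struct accessx_descriptor *)buf;
 *      d[0].ad_name_offset = 2 * sizeof(*d);   // string follows the array
 *      d[0].ad_flags = R_OK;
 *      d[1].ad_name_offset = 0;                // reuse the previous name
 *      d[1].ad_flags = W_OK;
 *      strlcpy(buf + 2 * sizeof(*d), "/tmp/data.txt", 32);
 *
 *      accessx_np(d, sizeof(buf), results, -1);
 *
 * The final uid argument is the "check-as uid" noted as unsupported in
 * the XXX above.
 */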
5944
5945
5946 /*
5947 * Returns: 0 Success
5948 * namei:EFAULT Bad address
5949 * namei:ENAMETOOLONG Filename too long
5950 * namei:ENOENT No such file or directory
5951 * namei:ELOOP Too many levels of symbolic links
5952 * namei:EBADF Bad file descriptor
5953 * namei:ENOTDIR Not a directory
5954 * namei:???
5955 * access1:
5956 */
5957 static int
5958 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5959 int flag, enum uio_seg segflg)
5960 {
5961 int error;
5962 struct nameidata nd;
5963 int niopts;
5964 struct vfs_context context;
5965 #if NAMEDRSRCFORK
5966 int is_namedstream = 0;
5967 #endif
5968
5969 /*
5970 * Unless the AT_EACCESS option is used, Access is defined as checking
5971 * against the process' real identity, even if operations are checking
5972 * the effective identity. So we need to tweak the credential
5973 * in the context for that case.
5974 */
5975 if (!(flag & AT_EACCESS)) {
5976 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5977 } else {
5978 context.vc_ucred = ctx->vc_ucred;
5979 }
5980 context.vc_thread = ctx->vc_thread;
5981
5982
5983 niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
5984 /* need parent for vnode_authorize for deletion test */
5985 if (amode & _DELETE_OK) {
5986 niopts |= WANTPARENT;
5987 }
5988 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5989 path, &context);
5990
5991 #if NAMEDRSRCFORK
5992 /* access(F_OK) calls are allowed for resource forks. */
5993 if (amode == F_OK) {
5994 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5995 }
5996 #endif
5997 error = nameiat(&nd, fd);
5998 if (error) {
5999 goto out;
6000 }
6001
6002 #if NAMEDRSRCFORK
6003 /* Grab reference on the shadow stream file vnode to
6004 * force an inactive on release which will mark it
6005 * for recycle.
6006 */
6007 if (vnode_isnamedstream(nd.ni_vp) &&
6008 (nd.ni_vp->v_parent != NULLVP) &&
6009 vnode_isshadow(nd.ni_vp)) {
6010 is_namedstream = 1;
6011 vnode_ref(nd.ni_vp);
6012 }
6013 #endif
6014
6015 error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
6016
6017 #if NAMEDRSRCFORK
6018 if (is_namedstream) {
6019 vnode_rele(nd.ni_vp);
6020 }
6021 #endif
6022
6023 vnode_put(nd.ni_vp);
6024 if (amode & _DELETE_OK) {
6025 vnode_put(nd.ni_dvp);
6026 }
6027 nameidone(&nd);
6028
6029 out:
6030 if (!(flag & AT_EACCESS)) {
6031 kauth_cred_unref(&context.vc_ucred);
6032 }
6033 return error;
6034 }
6035
6036 int
6037 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6038 {
6039 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6040 uap->path, uap->flags, 0, UIO_USERSPACE);
6041 }
6042
6043 int
6044 faccessat(__unused proc_t p, struct faccessat_args *uap,
6045 __unused int32_t *retval)
6046 {
6047 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6048 return EINVAL;
6049 }
6050
6051 return faccessat_internal(vfs_context_current(), uap->fd,
6052 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6053 }
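
/*
 * Minimal userspace sketch of access(2)/faccessat(2); the names and
 * directory fd are hypothetical. AT_EACCESS switches the check from the
 * real to the effective identity, mirroring the credential handling in
 * faccessat_internal() above.
 *
 *      #include <unistd.h>
 *      #include <fcntl.h>
 *
 *      if (access("/tmp/data.txt", R_OK | W_OK) == 0) {
 *              // readable and writable by the real uid/gid
 *      }
 *      faccessat(dfd, "data.txt", X_OK, AT_EACCESS);
 *      faccessat(AT_FDCWD, "link", F_OK, AT_SYMLINK_NOFOLLOW);
 */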
6054
6055 /*
6056 * Returns: 0 Success
6057 * EFAULT
6058 * copyout:EFAULT
6059 * namei:???
6060 * vn_stat:???
6061 */
6062 static int
6063 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6064 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6065 enum uio_seg segflg, int fd, int flag)
6066 {
6067 struct nameidata nd;
6068 int follow;
6069 union {
6070 struct stat sb;
6071 struct stat64 sb64;
6072 } source = {};
6073 union {
6074 struct user64_stat user64_sb;
6075 struct user32_stat user32_sb;
6076 struct user64_stat64 user64_sb64;
6077 struct user32_stat64 user32_sb64;
6078 } dest = {};
6079 caddr_t sbp;
6080 int error, my_size;
6081 kauth_filesec_t fsec;
6082 size_t xsecurity_bufsize;
6083 void * statptr;
6084 struct fileproc *fp = NULL;
6085 int needsrealdev = 0;
6086
6087 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6088 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6089 segflg, path, ctx);
6090
6091 #if NAMEDRSRCFORK
6092 int is_namedstream = 0;
6093 /* stat calls are allowed for resource forks. */
6094 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6095 #endif
6096
6097 if (flag & AT_FDONLY) {
6098 vnode_t fvp;
6099
6100 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6101 if (error) {
6102 return error;
6103 }
6104 if ((error = vnode_getwithref(fvp))) {
6105 file_drop(fd);
6106 return error;
6107 }
6108 nd.ni_vp = fvp;
6109 } else {
6110 error = nameiat(&nd, fd);
6111 if (error) {
6112 return error;
6113 }
6114 }
6115 fsec = KAUTH_FILESEC_NONE;
6116
6117 statptr = (void *)&source;
6118
6119 #if NAMEDRSRCFORK
6120 /* Grab reference on the shadow stream file vnode to
6121 * force an inactive on release which will mark it
6122 * for recycle.
6123 */
6124 if (vnode_isnamedstream(nd.ni_vp) &&
6125 (nd.ni_vp->v_parent != NULLVP) &&
6126 vnode_isshadow(nd.ni_vp)) {
6127 is_namedstream = 1;
6128 vnode_ref(nd.ni_vp);
6129 }
6130 #endif
6131
6132 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6133 if (fp && (xsecurity == USER_ADDR_NULL)) {
6134 /*
6135 * If the caller has the file open, and is not
6136 * requesting extended security information, we are
6137 * going to let them get the basic stat information.
6138 */
6139 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6140 fp->f_fglob->fg_cred);
6141 } else {
6142 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6143 isstat64, needsrealdev, ctx);
6144 }
6145
6146 #if NAMEDRSRCFORK
6147 if (is_namedstream) {
6148 vnode_rele(nd.ni_vp);
6149 }
6150 #endif
6151 vnode_put(nd.ni_vp);
6152 nameidone(&nd);
6153 if (fp) {
6154 file_drop(fd);
6155 fp = NULL;
6156 }
6157
6158 if (error) {
6159 return error;
6160 }
6161 /* Zap spare fields */
6162 if (isstat64 != 0) {
6163 source.sb64.st_lspare = 0;
6164 source.sb64.st_qspare[0] = 0LL;
6165 source.sb64.st_qspare[1] = 0LL;
6166 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6167 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6168 my_size = sizeof(dest.user64_sb64);
6169 sbp = (caddr_t)&dest.user64_sb64;
6170 } else {
6171 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6172 my_size = sizeof(dest.user32_sb64);
6173 sbp = (caddr_t)&dest.user32_sb64;
6174 }
6175 /*
6176 * Check if we raced (post lookup) against the last unlink of a file.
6177 */
6178 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6179 source.sb64.st_nlink = 1;
6180 }
6181 } else {
6182 source.sb.st_lspare = 0;
6183 source.sb.st_qspare[0] = 0LL;
6184 source.sb.st_qspare[1] = 0LL;
6185 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6186 munge_user64_stat(&source.sb, &dest.user64_sb);
6187 my_size = sizeof(dest.user64_sb);
6188 sbp = (caddr_t)&dest.user64_sb;
6189 } else {
6190 munge_user32_stat(&source.sb, &dest.user32_sb);
6191 my_size = sizeof(dest.user32_sb);
6192 sbp = (caddr_t)&dest.user32_sb;
6193 }
6194
6195 /*
6196 * Check if we raced (post lookup) against the last unlink of a file.
6197 */
6198 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6199 source.sb.st_nlink = 1;
6200 }
6201 }
6202 if ((error = copyout(sbp, ub, my_size)) != 0) {
6203 goto out;
6204 }
6205
6206 /* caller wants extended security information? */
6207 if (xsecurity != USER_ADDR_NULL) {
6208 /* did we get any? */
6209 if (fsec == KAUTH_FILESEC_NONE) {
6210 if (susize(xsecurity_size, 0) != 0) {
6211 error = EFAULT;
6212 goto out;
6213 }
6214 } else {
6215 /* find the user buffer size */
6216 xsecurity_bufsize = fusize(xsecurity_size);
6217
6218 /* copy out the actual data size */
6219 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6220 error = EFAULT;
6221 goto out;
6222 }
6223
6224 /* if the caller supplied enough room, copy out to it */
6225 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6226 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6227 }
6228 }
6229 }
6230 out:
6231 if (fsec != KAUTH_FILESEC_NONE) {
6232 kauth_filesec_free(fsec);
6233 }
6234 return error;
6235 }
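
/*
 * Minimal userspace sketch of the stat family that funnels into
 * fstatat_internal(); the path and directory fd are hypothetical and
 * error handling is omitted.
 *
 *      #include <sys/stat.h>
 *      #include <fcntl.h>
 *
 *      struct stat sb;
 *
 *      stat("/tmp/data.txt", &sb);                          // follows symlinks
 *      lstat("/tmp/data.txt", &sb);                         // does not follow
 *      fstatat(dfd, "data.txt", &sb, AT_SYMLINK_NOFOLLOW);  // relative to dfd
 */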
6236
6237 /*
6238 * stat_extended: Get file status; with extended security (ACL).
6239 *
6240 * Parameters: p (ignored)
6241 * uap User argument descriptor (see below)
6242 * retval (ignored)
6243 *
6244 * Indirect: uap->path Path of file to get status from
6245 * uap->ub User buffer (holds file status info)
6246 * uap->xsecurity ACL to get (extended security)
6247 * uap->xsecurity_size Size of ACL
6248 *
6249 * Returns: 0 Success
6250 * !0 errno value
6251 *
6252 */
6253 int
6254 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6255 __unused int32_t *retval)
6256 {
6257 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6258 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6259 0);
6260 }
6261
6262 /*
6263 * Returns: 0 Success
6264 * fstatat_internal:??? [see fstatat_internal() in this file]
6265 */
6266 int
6267 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6268 {
6269 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6270 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6271 }
6272
6273 int
6274 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6275 {
6276 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6277 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6278 }
6279
6280 /*
6281 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6282 *
6283 * Parameters: p (ignored)
6284 * uap User argument descriptor (see below)
6285 * retval (ignored)
6286 *
6287 * Indirect: uap->path Path of file to get status from
6288 * uap->ub User buffer (holds file status info)
6289 * uap->xsecurity ACL to get (extended security)
6290 * uap->xsecurity_size Size of ACL
6291 *
6292 * Returns: 0 Success
6293 * !0 errno value
6294 *
6295 */
6296 int
6297 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6298 {
6299 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6300 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6301 0);
6302 }
6303
6304 /*
6305 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6306 *
6307 * Parameters: p (ignored)
6308 * uap User argument descriptor (see below)
6309 * retval (ignored)
6310 *
6311 * Indirect: uap->path Path of file to get status from
6312 * uap->ub User buffer (holds file status info)
6313 * uap->xsecurity ACL to get (extended security)
6314 * uap->xsecurity_size Size of ACL
6315 *
6316 * Returns: 0 Success
6317 * !0 errno value
6318 *
6319 */
6320 int
6321 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6322 {
6323 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6324 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6325 AT_SYMLINK_NOFOLLOW);
6326 }
6327
6328 /*
6329 * Get file status; this version does not follow links.
6330 */
6331 int
6332 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6333 {
6334 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6335 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6336 }
6337
6338 int
6339 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6340 {
6341 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6342 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6343 }
6344
6345 /*
6346 * lstat64_extended: Get file status; can handle large inode numbers; does not
6347 * follow links; with extended security (ACL).
6348 *
6349 * Parameters: p (ignored)
6350 * uap User argument descriptor (see below)
6351 * retval (ignored)
6352 *
6353 * Indirect: uap->path Path of file to get status from
6354 * uap->ub User buffer (holds file status info)
6355 * uap->xsecurity ACL to get (extended security)
6356 * uap->xsecurity_size Size of ACL
6357 *
6358 * Returns: 0 Success
6359 * !0 errno value
6360 *
6361 */
6362 int
6363 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6364 {
6365 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6366 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6367 AT_SYMLINK_NOFOLLOW);
6368 }
6369
6370 int
6371 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6372 {
6373 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6374 return EINVAL;
6375 }
6376
6377 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6378 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6379 }
6380
6381 int
6382 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6383 __unused int32_t *retval)
6384 {
6385 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6386 return EINVAL;
6387 }
6388
6389 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6390 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6391 }
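/*
 * Illustrative userspace sketch (not compiled into the kernel): how a caller
 * typically reaches fstatat()/fstatat64() above.  AT_SYMLINK_NOFOLLOW selects
 * the lstat-style behaviour; any flag outside the mask checked above results
 * in EINVAL.  The file name "example.txt" is a placeholder.
 */
#if 0 /* userspace example only */
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>

static void
show_size(int dirfd, const char *name)
{
	struct stat sb;

	/* Do not follow a trailing symlink, mirroring lstat() semantics. */
	if (fstatat(dirfd, name, &sb, AT_SYMLINK_NOFOLLOW) == -1) {
		perror("fstatat");
		return;
	}
	printf("%s: %lld bytes, %u links\n", name,
	    (long long)sb.st_size, (unsigned)sb.st_nlink);
}

/* Usage: show_size(AT_FDCWD, "example.txt"); */
#endif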
6392
6393 /*
6394 * Get configurable pathname variables.
6395 *
6396 * Returns: 0 Success
6397 * namei:???
6398 * vn_pathconf:???
6399 *
6400 * Notes: Global implementation constants are intended to be
6401 * implemented in this function directly; all other constants
6402 * are per-FS implementation, and therefore must be handled in
6403 * each respective FS, instead.
6404 *
6405 * XXX We implement some things globally right now that should actually be
6406 * XXX per-FS; we will need to deal with this at some point.
6407 */
6408 /* ARGSUSED */
6409 int
6410 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6411 {
6412 int error;
6413 struct nameidata nd;
6414 vfs_context_t ctx = vfs_context_current();
6415
6416 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6417 UIO_USERSPACE, uap->path, ctx);
6418 error = namei(&nd);
6419 if (error) {
6420 return error;
6421 }
6422
6423 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6424
6425 vnode_put(nd.ni_vp);
6426 nameidone(&nd);
6427 return error;
6428 }
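/*
 * Illustrative userspace sketch (not compiled into the kernel): querying a
 * per-filesystem limit through pathconf(2), which lands in the handler above
 * and is mostly forwarded to the filesystem via vn_pathconf().  A -1 return
 * with errno left at 0 conventionally means "no limit".  "/tmp" is just a
 * placeholder path.
 */
#if 0 /* userspace example only */
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

static void
print_name_max(const char *path)
{
	long name_max;

	errno = 0;
	name_max = pathconf(path, _PC_NAME_MAX);
	if (name_max == -1 && errno != 0) {
		perror("pathconf");
	} else if (name_max == -1) {
		printf("%s: no NAME_MAX limit\n", path);
	} else {
		printf("%s: NAME_MAX = %ld\n", path, name_max);
	}
}
#endif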
6429
6430 /*
6431 * Return target name of a symbolic link.
6432 */
6433 /* ARGSUSED */
6434 static int
6435 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
6436 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6437 int *retval)
6438 {
6439 vnode_t vp;
6440 uio_t auio;
6441 int error;
6442 struct nameidata nd;
6443 char uio_buf[UIO_SIZEOF(1)];
6444
6445 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6446 seg, path, ctx);
6447
6448 error = nameiat(&nd, fd);
6449 if (error) {
6450 return error;
6451 }
6452 vp = nd.ni_vp;
6453
6454 nameidone(&nd);
6455
6456 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6457 &uio_buf[0], sizeof(uio_buf));
6458 uio_addiov(auio, buf, bufsize);
6459 if (vp->v_type != VLNK) {
6460 error = EINVAL;
6461 } else {
6462 #if CONFIG_MACF
6463 error = mac_vnode_check_readlink(ctx, vp);
6464 #endif
6465 if (error == 0) {
6466 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6467 ctx);
6468 }
6469 if (error == 0) {
6470 error = VNOP_READLINK(vp, auio, ctx);
6471 }
6472 }
6473 vnode_put(vp);
6474
6475 *retval = bufsize - (int)uio_resid(auio);
6476 return error;
6477 }
6478
6479 int
6480 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6481 {
6482 enum uio_seg procseg;
6483
6484 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6485 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6486 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6487 uap->count, procseg, retval);
6488 }
6489
6490 int
6491 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6492 {
6493 enum uio_seg procseg;
6494
6495 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6496 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6497 procseg, uap->buf, uap->bufsize, procseg, retval);
6498 }
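/*
 * Illustrative userspace sketch (not compiled into the kernel): readlink(2)
 * as serviced by readlinkat_internal() above.  The return value is the number
 * of bytes copied and the buffer is NOT NUL-terminated, so the caller must
 * terminate it explicitly.  "/tmp/link" is a placeholder path.
 */
#if 0 /* userspace example only */
#include <unistd.h>
#include <limits.h>
#include <stdio.h>

static void
print_link_target(const char *linkpath)
{
	char target[PATH_MAX];
	ssize_t n = readlink(linkpath, target, sizeof(target) - 1);

	if (n == -1) {
		perror("readlink");
		return;
	}
	target[n] = '\0';	/* kernel does not terminate the string */
	printf("%s -> %s\n", linkpath, target);
}
#endif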
6499
6500 /*
6501 * Change file flags, the deep inner layer.
6502 */
6503 static int
6504 chflags0(vnode_t vp, struct vnode_attr *va,
6505 int (*setattr)(vnode_t, void *, vfs_context_t),
6506 void *arg, vfs_context_t ctx)
6507 {
6508 kauth_action_t action = 0;
6509 int error;
6510
6511 #if CONFIG_MACF
6512 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
6513 if (error) {
6514 goto out;
6515 }
6516 #endif
6517
6518 /* request authorisation, disregard immutability */
6519 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
6520 goto out;
6521 }
6522 /*
6523 * Request that the auth layer disregard those file flags it's allowed to when
6524 * authorizing this operation; we need to do this in order to be able to
6525 * clear immutable flags.
6526 */
6527 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
6528 goto out;
6529 }
6530 error = (*setattr)(vp, arg, ctx);
6531
6532 #if CONFIG_MACF
6533 if (error == 0) {
6534 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
6535 }
6536 #endif
6537
6538 out:
6539 return error;
6540 }
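/*
 * Illustrative userspace sketch (not compiled into the kernel): clearing the
 * user-immutable flag with chflags(2).  This is the case that motivates the
 * KAUTH_VNODE_NOIMMUTABLE request above: without it the authorization layer
 * would refuse the very operation needed to clear the immutable bit.
 * "locked.txt" is a placeholder name.
 */
#if 0 /* userspace example only */
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>

static int
unlock_file(const char *path)
{
	struct stat sb;

	if (stat(path, &sb) == -1) {
		perror("stat");
		return -1;
	}
	/* Drop UF_IMMUTABLE while preserving any other flags. */
	if (chflags(path, sb.st_flags & ~UF_IMMUTABLE) == -1) {
		perror("chflags");
		return -1;
	}
	return 0;
}
#endif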
6541
6542 /*
6543 * Change file flags.
6544 *
6545 * NOTE: this will vnode_put() `vp'
6546 */
6547 static int
6548 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6549 {
6550 struct vnode_attr va;
6551 int error;
6552
6553 VATTR_INIT(&va);
6554 VATTR_SET(&va, va_flags, flags);
6555
6556 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6557 vnode_put(vp);
6558
6559 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6560 error = ENOTSUP;
6561 }
6562
6563 return error;
6564 }
6565
6566 /*
6567 * Change flags of a file given a path name.
6568 */
6569 /* ARGSUSED */
6570 int
6571 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6572 {
6573 vnode_t vp;
6574 vfs_context_t ctx = vfs_context_current();
6575 int error;
6576 struct nameidata nd;
6577
6578 AUDIT_ARG(fflags, uap->flags);
6579 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6580 UIO_USERSPACE, uap->path, ctx);
6581 error = namei(&nd);
6582 if (error) {
6583 return error;
6584 }
6585 vp = nd.ni_vp;
6586 nameidone(&nd);
6587
6588 /* we don't vnode_put() here because chflags1 does it internally */
6589 error = chflags1(vp, uap->flags, ctx);
6590
6591 return error;
6592 }
6593
6594 /*
6595 * Change flags of a file given a file descriptor.
6596 */
6597 /* ARGSUSED */
6598 int
6599 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6600 {
6601 vnode_t vp;
6602 int error;
6603
6604 AUDIT_ARG(fd, uap->fd);
6605 AUDIT_ARG(fflags, uap->flags);
6606 if ((error = file_vnode(uap->fd, &vp))) {
6607 return error;
6608 }
6609
6610 if ((error = vnode_getwithref(vp))) {
6611 file_drop(uap->fd);
6612 return error;
6613 }
6614
6615 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6616
6617 /* we don't vnode_put() here because chflags1 does it internally */
6618 error = chflags1(vp, uap->flags, vfs_context_current());
6619
6620 file_drop(uap->fd);
6621 return error;
6622 }
6623
6624 /*
6625 * Change security information on a filesystem object.
6626 *
6627 * Returns: 0 Success
6628 * EPERM Operation not permitted
6629 * vnode_authattr:??? [anything vnode_authattr can return]
6630 * vnode_authorize:??? [anything vnode_authorize can return]
6631 * vnode_setattr:??? [anything vnode_setattr can return]
6632 *
6633 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6634 * translated to EPERM before being returned.
6635 */
6636 static int
6637 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
6638 {
6639 kauth_action_t action;
6640 int error;
6641
6642 AUDIT_ARG(mode, vap->va_mode);
6643 /* XXX audit new args */
6644
6645 #if NAMEDSTREAMS
6646 /* chmod calls are not allowed for resource forks. */
6647 if (vp->v_flag & VISNAMEDSTREAM) {
6648 return EPERM;
6649 }
6650 #endif
6651
6652 #if CONFIG_MACF
6653 if (VATTR_IS_ACTIVE(vap, va_mode) &&
6654 (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
6655 return error;
6656 }
6657
6658 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6659 if ((error = mac_vnode_check_setowner(ctx, vp,
6660 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6661 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
6662 return error;
6663 }
6664 }
6665
6666 if (VATTR_IS_ACTIVE(vap, va_acl) &&
6667 (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
6668 return error;
6669 }
6670 #endif
6671
6672 /* make sure that the caller is allowed to set this security information */
6673 if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
6674 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6675 if (error == EACCES) {
6676 error = EPERM;
6677 }
6678 return error;
6679 }
6680
6681 if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
6682 return error;
6683 }
6684
6685 #if CONFIG_MACF
6686 if (VATTR_IS_ACTIVE(vap, va_mode)) {
6687 mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
6688 }
6689
6690 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6691 mac_vnode_notify_setowner(ctx, vp,
6692 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6693 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
6694 }
6695
6696 if (VATTR_IS_ACTIVE(vap, va_acl)) {
6697 mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
6698 }
6699 #endif
6700
6701 return error;
6702 }
6703
6704
6705 /*
6706 * Change mode of a file given a path name.
6707 *
6708 * Returns: 0 Success
6709 * namei:??? [anything namei can return]
6710 * chmod_vnode:??? [anything chmod_vnode can return]
6711 */
6712 static int
6713 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6714 int fd, int flag, enum uio_seg segflg)
6715 {
6716 struct nameidata nd;
6717 int follow, error;
6718
6719 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6720 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6721 segflg, path, ctx);
6722 if ((error = nameiat(&nd, fd))) {
6723 return error;
6724 }
6725 error = chmod_vnode(ctx, nd.ni_vp, vap);
6726 vnode_put(nd.ni_vp);
6727 nameidone(&nd);
6728 return error;
6729 }
6730
6731 /*
6732 * chmod_extended: Change the mode of a file given a path name; with extended
6733 * argument list (including extended security (ACL)).
6734 *
6735 * Parameters: p Process requesting the open
6736 * uap User argument descriptor (see below)
6737 * retval (ignored)
6738 *
6739 * Indirect: uap->path Path to object (same as 'chmod')
6740 * uap->uid UID to set
6741 * uap->gid GID to set
6742 * uap->mode File mode to set (same as 'chmod')
6743 * uap->xsecurity ACL to set (or delete)
6744 *
6745 * Returns: 0 Success
6746 * !0 errno value
6747 *
6748 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6749 *
6750 * XXX: We should enumerate the possible errno values here, and where
6751 * in the code they originated.
6752 */
6753 int
6754 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
6755 {
6756 int error;
6757 struct vnode_attr va;
6758 kauth_filesec_t xsecdst;
6759
6760 AUDIT_ARG(owner, uap->uid, uap->gid);
6761
6762 VATTR_INIT(&va);
6763 if (uap->mode != -1) {
6764 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6765 }
6766 if (uap->uid != KAUTH_UID_NONE) {
6767 VATTR_SET(&va, va_uid, uap->uid);
6768 }
6769 if (uap->gid != KAUTH_GID_NONE) {
6770 VATTR_SET(&va, va_gid, uap->gid);
6771 }
6772
6773 xsecdst = NULL;
6774 switch (uap->xsecurity) {
6775 /* explicit remove request */
6776 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6777 VATTR_SET(&va, va_acl, NULL);
6778 break;
6779 /* not being set */
6780 case USER_ADDR_NULL:
6781 break;
6782 default:
6783 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6784 return error;
6785 }
6786 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6787 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
6788 }
6789
6790 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
6791 UIO_USERSPACE);
6792
6793 if (xsecdst != NULL) {
6794 kauth_filesec_free(xsecdst);
6795 }
6796 return error;
6797 }
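/*
 * Illustrative userspace sketch (not compiled into the kernel): the usual
 * route into chmod_extended() is the Libc filesec(3) wrapper chmodx_np(3),
 * which packages mode/owner/ACL into a filesec_t for the kernel.  The wrapper
 * names, properties and header locations below follow the macOS man pages and
 * are assumptions of this sketch, not something defined in this file.
 */
#if 0 /* userspace example only */
#include <sys/types.h>
#include <sys/stat.h>	/* chmodx_np() and filesec_t per chmodx_np(3); assumed */
#include <stdio.h>

static int
set_mode_extended(const char *path, mode_t mode)
{
	filesec_t fsec = filesec_init();
	int ret = -1;

	if (fsec == NULL) {
		return -1;
	}
	if (filesec_set_property(fsec, FILESEC_MODE, &mode) == 0 &&
	    chmodx_np(path, fsec) == 0) {
		ret = 0;
	} else {
		perror("chmodx_np");
	}
	filesec_free(fsec);
	return ret;
}
#endif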
6798
6799 /*
6800 * Returns: 0 Success
6801 * chmodat:??? [anything chmodat can return]
6802 */
6803 static int
6804 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6805 int flag, enum uio_seg segflg)
6806 {
6807 struct vnode_attr va;
6808
6809 VATTR_INIT(&va);
6810 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6811
6812 return chmodat(ctx, path, &va, fd, flag, segflg);
6813 }
6814
6815 int
6816 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6817 {
6818 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6819 AT_FDCWD, 0, UIO_USERSPACE);
6820 }
6821
6822 int
6823 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6824 {
6825 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6826 return EINVAL;
6827 }
6828
6829 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6830 uap->fd, uap->flag, UIO_USERSPACE);
6831 }
6832
6833 /*
6834 * Change mode of a file given a file descriptor.
6835 */
6836 static int
6837 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6838 {
6839 vnode_t vp;
6840 int error;
6841
6842 AUDIT_ARG(fd, fd);
6843
6844 if ((error = file_vnode(fd, &vp)) != 0) {
6845 return error;
6846 }
6847 if ((error = vnode_getwithref(vp)) != 0) {
6848 file_drop(fd);
6849 return error;
6850 }
6851 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6852
6853 error = chmod_vnode(vfs_context_current(), vp, vap);
6854 (void)vnode_put(vp);
6855 file_drop(fd);
6856
6857 return error;
6858 }
6859
6860 /*
6861 * fchmod_extended: Change mode of a file given a file descriptor; with
6862 * extended argument list (including extended security (ACL)).
6863 *
6864 * Parameters: p Process requesting to change file mode
6865 * uap User argument descriptor (see below)
6866 * retval (ignored)
6867 *
6868 * Indirect: uap->mode File mode to set (same as 'chmod')
6869 * uap->uid UID to set
6870 * uap->gid GID to set
6871 * uap->xsecurity ACL to set (or delete)
6872 * uap->fd File descriptor of file to change mode
6873 *
6874 * Returns: 0 Success
6875 * !0 errno value
6876 *
6877 */
6878 int
6879 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6880 {
6881 int error;
6882 struct vnode_attr va;
6883 kauth_filesec_t xsecdst;
6884
6885 AUDIT_ARG(owner, uap->uid, uap->gid);
6886
6887 VATTR_INIT(&va);
6888 if (uap->mode != -1) {
6889 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6890 }
6891 if (uap->uid != KAUTH_UID_NONE) {
6892 VATTR_SET(&va, va_uid, uap->uid);
6893 }
6894 if (uap->gid != KAUTH_GID_NONE) {
6895 VATTR_SET(&va, va_gid, uap->gid);
6896 }
6897
6898 xsecdst = NULL;
6899 switch (uap->xsecurity) {
6900 case USER_ADDR_NULL:
6901 VATTR_SET(&va, va_acl, NULL);
6902 break;
6903 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6904 VATTR_SET(&va, va_acl, NULL);
6905 break;
6906 /* not being set */
6907 case CAST_USER_ADDR_T(-1):
6908 break;
6909 default:
6910 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6911 return error;
6912 }
6913 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6914 }
6915
6916 error = fchmod1(p, uap->fd, &va);
6917
6918
6919 switch (uap->xsecurity) {
6920 case USER_ADDR_NULL:
6921 case CAST_USER_ADDR_T(-1):
6922 break;
6923 default:
6924 if (xsecdst != NULL) {
6925 kauth_filesec_free(xsecdst);
6926 }
6927 }
6928 return error;
6929 }
6930
6931 int
6932 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6933 {
6934 struct vnode_attr va;
6935
6936 VATTR_INIT(&va);
6937 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6938
6939 return fchmod1(p, uap->fd, &va);
6940 }
6941
6942
6943 /*
6944 * Set ownership given a path name.
6945 */
6946 /* ARGSUSED */
6947 static int
6948 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6949 gid_t gid, int flag, enum uio_seg segflg)
6950 {
6951 vnode_t vp;
6952 struct vnode_attr va;
6953 int error;
6954 struct nameidata nd;
6955 int follow;
6956 kauth_action_t action;
6957
6958 AUDIT_ARG(owner, uid, gid);
6959
6960 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6961 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6962 path, ctx);
6963 error = nameiat(&nd, fd);
6964 if (error) {
6965 return error;
6966 }
6967 vp = nd.ni_vp;
6968
6969 nameidone(&nd);
6970
6971 VATTR_INIT(&va);
6972 if (uid != (uid_t)VNOVAL) {
6973 VATTR_SET(&va, va_uid, uid);
6974 }
6975 if (gid != (gid_t)VNOVAL) {
6976 VATTR_SET(&va, va_gid, gid);
6977 }
6978
6979 #if CONFIG_MACF
6980 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6981 if (error) {
6982 goto out;
6983 }
6984 #endif
6985
6986 /* preflight and authorize attribute changes */
6987 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6988 goto out;
6989 }
6990 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6991 goto out;
6992 }
6993 error = vnode_setattr(vp, &va, ctx);
6994
6995 #if CONFIG_MACF
6996 if (error == 0) {
6997 mac_vnode_notify_setowner(ctx, vp, uid, gid);
6998 }
6999 #endif
7000
7001 out:
7002 /*
7003 * EACCES is only allowed from namei(); permissions failure should
7004 * return EPERM, so we need to translate the error code.
7005 */
7006 if (error == EACCES) {
7007 error = EPERM;
7008 }
7009
7010 vnode_put(vp);
7011 return error;
7012 }
7013
7014 int
7015 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7016 {
7017 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7018 uap->uid, uap->gid, 0, UIO_USERSPACE);
7019 }
7020
7021 int
7022 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7023 {
7024 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7025 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7026 }
7027
7028 int
7029 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7030 {
7031 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7032 return EINVAL;
7033 }
7034
7035 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7036 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7037 }
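/*
 * Illustrative userspace sketch (not compiled into the kernel): changing only
 * the group of a symlink itself via fchownat(2).  Passing (uid_t)-1 leaves
 * the owner untouched, which corresponds to the VNOVAL checks above.
 * "link" and the gid value are placeholders supplied by the caller.
 */
#if 0 /* userspace example only */
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>

static int
set_group_nofollow(int dirfd, const char *name, gid_t gid)
{
	/* (uid_t)-1 == "do not change the owner". */
	if (fchownat(dirfd, name, (uid_t)-1, gid, AT_SYMLINK_NOFOLLOW) == -1) {
		perror("fchownat");
		return -1;
	}
	return 0;
}
#endif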
7038
7039 /*
7040 * Set ownership given a file descriptor.
7041 */
7042 /* ARGSUSED */
7043 int
7044 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7045 {
7046 struct vnode_attr va;
7047 vfs_context_t ctx = vfs_context_current();
7048 vnode_t vp;
7049 int error;
7050 kauth_action_t action;
7051
7052 AUDIT_ARG(owner, uap->uid, uap->gid);
7053 AUDIT_ARG(fd, uap->fd);
7054
7055 if ((error = file_vnode(uap->fd, &vp))) {
7056 return error;
7057 }
7058
7059 if ((error = vnode_getwithref(vp))) {
7060 file_drop(uap->fd);
7061 return error;
7062 }
7063 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7064
7065 VATTR_INIT(&va);
7066 if (uap->uid != VNOVAL) {
7067 VATTR_SET(&va, va_uid, uap->uid);
7068 }
7069 if (uap->gid != VNOVAL) {
7070 VATTR_SET(&va, va_gid, uap->gid);
7071 }
7072
7073 #if NAMEDSTREAMS
7074 /* chown calls are not allowed for resource forks. */
7075 if (vp->v_flag & VISNAMEDSTREAM) {
7076 error = EPERM;
7077 goto out;
7078 }
7079 #endif
7080
7081 #if CONFIG_MACF
7082 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
7083 if (error) {
7084 goto out;
7085 }
7086 #endif
7087
7088 /* preflight and authorize attribute changes */
7089 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7090 goto out;
7091 }
7092 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7093 if (error == EACCES) {
7094 error = EPERM;
7095 }
7096 goto out;
7097 }
7098 error = vnode_setattr(vp, &va, ctx);
7099
7100 #if CONFIG_MACF
7101 if (error == 0) {
7102 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
7103 }
7104 #endif
7105
7106 out:
7107 (void)vnode_put(vp);
7108 file_drop(uap->fd);
7109 return error;
7110 }
7111
7112 static int
7113 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7114 {
7115 int error;
7116
7117 if (usrtvp == USER_ADDR_NULL) {
7118 struct timeval old_tv;
7119 /* XXX Y2038 bug because of microtime argument */
7120 microtime(&old_tv);
7121 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7122 tsp[1] = tsp[0];
7123 } else {
7124 if (IS_64BIT_PROCESS(current_proc())) {
7125 struct user64_timeval tv[2];
7126 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7127 if (error) {
7128 return error;
7129 }
7130 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7131 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7132 } else {
7133 struct user32_timeval tv[2];
7134 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7135 if (error) {
7136 return error;
7137 }
7138 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7139 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7140 }
7141 }
7142 return 0;
7143 }
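/*
 * Illustrative userspace sketch (not compiled into the kernel): the two ways
 * a caller drives getutimes() above -- a NULL pointer, which asks the kernel
 * to stamp the current time (the VA_UTIMES_NULL path), or an explicit
 * { atime, mtime } pair.  "file.txt" is a placeholder name.
 */
#if 0 /* userspace example only */
#include <sys/time.h>
#include <stdio.h>

static int
touch_now(const char *path)
{
	/* NULL => kernel fills in the current time. */
	if (utimes(path, NULL) == -1) {
		perror("utimes");
		return -1;
	}
	return 0;
}

static int
set_times(const char *path, time_t atime, time_t mtime)
{
	struct timeval tv[2] = {
		{ .tv_sec = atime, .tv_usec = 0 },	/* access time */
		{ .tv_sec = mtime, .tv_usec = 0 },	/* modification time */
	};

	if (utimes(path, tv) == -1) {
		perror("utimes");
		return -1;
	}
	return 0;
}
#endif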
7144
7145 static int
7146 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
7147 int nullflag)
7148 {
7149 int error;
7150 struct vnode_attr va;
7151 kauth_action_t action;
7152
7153 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7154
7155 VATTR_INIT(&va);
7156 VATTR_SET(&va, va_access_time, ts[0]);
7157 VATTR_SET(&va, va_modify_time, ts[1]);
7158 if (nullflag) {
7159 va.va_vaflags |= VA_UTIMES_NULL;
7160 }
7161
7162 #if NAMEDSTREAMS
7163 /* utimes calls are not allowed for resource forks. */
7164 if (vp->v_flag & VISNAMEDSTREAM) {
7165 error = EPERM;
7166 goto out;
7167 }
7168 #endif
7169
7170 #if CONFIG_MACF
7171 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
7172 if (error) {
7173 goto out;
7174 }
7175 #endif
7176 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7177 if (!nullflag && error == EACCES) {
7178 error = EPERM;
7179 }
7180 goto out;
7181 }
7182
7183 /* since we may not need to auth anything, check here */
7184 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7185 if (!nullflag && error == EACCES) {
7186 error = EPERM;
7187 }
7188 goto out;
7189 }
7190 error = vnode_setattr(vp, &va, ctx);
7191
7192 #if CONFIG_MACF
7193 if (error == 0) {
7194 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
7195 }
7196 #endif
7197
7198 out:
7199 return error;
7200 }
7201
7202 /*
7203 * Set the access and modification times of a file.
7204 */
7205 /* ARGSUSED */
7206 int
7207 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
7208 {
7209 struct timespec ts[2];
7210 user_addr_t usrtvp;
7211 int error;
7212 struct nameidata nd;
7213 vfs_context_t ctx = vfs_context_current();
7214
7215 /*
7216 * AUDIT: Needed to change the order of operations to do the
7217 * name lookup first because auditing wants the path.
7218 */
7219 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
7220 UIO_USERSPACE, uap->path, ctx);
7221 error = namei(&nd);
7222 if (error) {
7223 return error;
7224 }
7225 nameidone(&nd);
7226
7227 /*
7228 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
7229 * the current time instead.
7230 */
7231 usrtvp = uap->tptr;
7232 if ((error = getutimes(usrtvp, ts)) != 0) {
7233 goto out;
7234 }
7235
7236 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
7237
7238 out:
7239 vnode_put(nd.ni_vp);
7240 return error;
7241 }
7242
7243 /*
7244 * Set the access and modification times of a file.
7245 */
7246 /* ARGSUSED */
7247 int
7248 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7249 {
7250 struct timespec ts[2];
7251 vnode_t vp;
7252 user_addr_t usrtvp;
7253 int error;
7254
7255 AUDIT_ARG(fd, uap->fd);
7256 usrtvp = uap->tptr;
7257 if ((error = getutimes(usrtvp, ts)) != 0) {
7258 return error;
7259 }
7260 if ((error = file_vnode(uap->fd, &vp)) != 0) {
7261 return error;
7262 }
7263 if ((error = vnode_getwithref(vp))) {
7264 file_drop(uap->fd);
7265 return error;
7266 }
7267
7268 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7269 vnode_put(vp);
7270 file_drop(uap->fd);
7271 return error;
7272 }
7273
7274 /*
7275 * Truncate a file given its path name.
7276 */
7277 /* ARGSUSED */
7278 int
7279 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
7280 {
7281 vnode_t vp;
7282 struct vnode_attr va;
7283 vfs_context_t ctx = vfs_context_current();
7284 int error;
7285 struct nameidata nd;
7286 kauth_action_t action;
7287
7288 if (uap->length < 0) {
7289 return EINVAL;
7290 }
7291 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
7292 UIO_USERSPACE, uap->path, ctx);
7293 if ((error = namei(&nd))) {
7294 return error;
7295 }
7296 vp = nd.ni_vp;
7297
7298 nameidone(&nd);
7299
7300 VATTR_INIT(&va);
7301 VATTR_SET(&va, va_data_size, uap->length);
7302
7303 #if CONFIG_MACF
7304 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
7305 if (error) {
7306 goto out;
7307 }
7308 #endif
7309
7310 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7311 goto out;
7312 }
7313 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7314 goto out;
7315 }
7316 error = vnode_setattr(vp, &va, ctx);
7317
7318 #if CONFIG_MACF
7319 if (error == 0) {
7320 mac_vnode_notify_truncate(ctx, NOCRED, vp);
7321 }
7322 #endif
7323
7324 out:
7325 vnode_put(vp);
7326 return error;
7327 }
7328
7329 /*
7330 * Truncate a file given a file descriptor.
7331 */
7332 /* ARGSUSED */
7333 int
7334 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
7335 {
7336 vfs_context_t ctx = vfs_context_current();
7337 struct vnode_attr va;
7338 vnode_t vp;
7339 struct fileproc *fp;
7340 int error;
7341 int fd = uap->fd;
7342
7343 AUDIT_ARG(fd, uap->fd);
7344 if (uap->length < 0) {
7345 return EINVAL;
7346 }
7347
7348 if ((error = fp_lookup(p, fd, &fp, 0))) {
7349 return error;
7350 }
7351
7352 switch (FILEGLOB_DTYPE(fp->f_fglob)) {
7353 case DTYPE_PSXSHM:
7354 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
7355 goto out;
7356 case DTYPE_VNODE:
7357 break;
7358 default:
7359 error = EINVAL;
7360 goto out;
7361 }
7362
7363 vp = (vnode_t)fp->f_fglob->fg_data;
7364
7365 if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
7366 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7367 error = EINVAL;
7368 goto out;
7369 }
7370
7371 if ((error = vnode_getwithref(vp)) != 0) {
7372 goto out;
7373 }
7374
7375 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7376
7377 #if CONFIG_MACF
7378 error = mac_vnode_check_truncate(ctx,
7379 fp->f_fglob->fg_cred, vp);
7380 if (error) {
7381 (void)vnode_put(vp);
7382 goto out;
7383 }
7384 #endif
7385 VATTR_INIT(&va);
7386 VATTR_SET(&va, va_data_size, uap->length);
7387 error = vnode_setattr(vp, &va, ctx);
7388
7389 #if CONFIG_MACF
7390 if (error == 0) {
7391 mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
7392 }
7393 #endif
7394
7395 (void)vnode_put(vp);
7396 out:
7397 file_drop(fd);
7398 return error;
7399 }
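/*
 * Illustrative userspace sketch (not compiled into the kernel): ftruncate(2)
 * requires a descriptor that was opened for writing; the FWRITE check above
 * is why a read-only descriptor gets EINVAL.  "data.bin" and the length are
 * placeholders supplied by the caller.
 */
#if 0 /* userspace example only */
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

static int
shrink_file(const char *path, off_t length)
{
	int fd = open(path, O_WRONLY);	/* must be writable for ftruncate */

	if (fd == -1) {
		perror("open");
		return -1;
	}
	if (ftruncate(fd, length) == -1) {
		perror("ftruncate");
		close(fd);
		return -1;
	}
	return close(fd);
}
#endif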
7400
7401
7402 /*
7403 * Sync an open file with synchronized I/O _file_ integrity completion
7404 */
7405 /* ARGSUSED */
7406 int
7407 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7408 {
7409 __pthread_testcancel(1);
7410 return fsync_common(p, uap, MNT_WAIT);
7411 }
7412
7413
7414 /*
7415 * Sync an open file with synchronized I/O _file_ integrity completion
7416 *
7417 * Notes: This is a legacy support function that does not test for
7418 * thread cancellation points.
7419 */
7420 /* ARGSUSED */
7421 int
7422 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7423 {
7424 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7425 }
7426
7427
7428 /*
7429 * Sync an open file with synchronized I/O _data_ integrity completion
7430 */
7431 /* ARGSUSED */
7432 int
7433 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7434 {
7435 __pthread_testcancel(1);
7436 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7437 }
7438
7439
7440 /*
7441 * fsync_common
7442 *
7443 * Common fsync code to support both synchronized I/O file integrity completion
7444 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7445 *
7446 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7447 * will only guarantee that the file data contents are retrievable. If
7448 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
7449 * additionally requires that metadata not strictly needed for retrieving
7450 * the file data contents, such as atime, mtime, ctime, etc., also be
7451 * committed to stable storage.
7452 *
7453 * Parameters: p The process
7454 * uap->fd The descriptor to synchronize
7455 * flags The data integrity flags
7456 *
7457 * Returns: int Success
7458 * fp_getfvp:EBADF Bad file descriptor
7459 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7460 * VNOP_FSYNC:??? unspecified
7461 *
7462 * Notes: We use struct fsync_args because it is a short name, and all
7463 * caller argument structures are otherwise identical.
7464 */
7465 static int
7466 fsync_common(proc_t p, struct fsync_args *uap, int flags)
7467 {
7468 vnode_t vp;
7469 struct fileproc *fp;
7470 vfs_context_t ctx = vfs_context_current();
7471 int error;
7472
7473 AUDIT_ARG(fd, uap->fd);
7474
7475 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
7476 return error;
7477 }
7478 if ((error = vnode_getwithref(vp))) {
7479 file_drop(uap->fd);
7480 return error;
7481 }
7482
7483 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7484
7485 error = VNOP_FSYNC(vp, flags, ctx);
7486
7487 #if NAMEDRSRCFORK
7488 /* Sync resource fork shadow file if necessary. */
7489 if ((error == 0) &&
7490 (vp->v_flag & VISNAMEDSTREAM) &&
7491 (vp->v_parent != NULLVP) &&
7492 vnode_isshadow(vp) &&
7493 (fp->f_flags & FP_WRITTEN)) {
7494 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
7495 }
7496 #endif
7497
7498 (void)vnode_put(vp);
7499 file_drop(uap->fd);
7500 return error;
7501 }
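/*
 * Illustrative userspace sketch (not compiled into the kernel): fdatasync(2)
 * maps to the MNT_DWAIT case above (data integrity only) while fsync(2) maps
 * to MNT_WAIT (data plus metadata).  On macOS, fcntl(fd, F_FULLFSYNC) can be
 * layered on top to also ask the storage device to flush its write cache;
 * whether that extra step is needed is the caller's decision.
 */
#if 0 /* userspace example only */
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>

static int
commit_data(int fd, int want_metadata_too)
{
	int error = want_metadata_too ? fsync(fd) : fdatasync(fd);

	if (error == -1) {
		perror(want_metadata_too ? "fsync" : "fdatasync");
		return -1;
	}
	/* Optional: push the request through the drive's write cache. */
	if (fcntl(fd, F_FULLFSYNC) == -1) {
		perror("fcntl(F_FULLFSYNC)");
		return -1;
	}
	return 0;
}
#endif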
7502
7503 /*
7504 * Duplicate files. Source must be a file, target must be a file or
7505 * must not exist.
7506 *
7507 * XXX Copyfile authorisation checking is woefully inadequate, and will not
7508 * perform inheritance correctly.
7509 */
7510 /* ARGSUSED */
7511 int
7512 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
7513 {
7514 vnode_t tvp, fvp, tdvp, sdvp;
7515 struct nameidata fromnd, tond;
7516 int error;
7517 vfs_context_t ctx = vfs_context_current();
7518 #if CONFIG_MACF
7519 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
7520 struct vnode_attr va;
7521 #endif
7522
7523 /* Check that the flags are valid. */
7524
7525 if (uap->flags & ~CPF_MASK) {
7526 return EINVAL;
7527 }
7528
7529 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
7530 UIO_USERSPACE, uap->from, ctx);
7531 if ((error = namei(&fromnd))) {
7532 return error;
7533 }
7534 fvp = fromnd.ni_vp;
7535
7536 NDINIT(&tond, CREATE, OP_LINK,
7537 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7538 UIO_USERSPACE, uap->to, ctx);
7539 if ((error = namei(&tond))) {
7540 goto out1;
7541 }
7542 tdvp = tond.ni_dvp;
7543 tvp = tond.ni_vp;
7544
7545 if (tvp != NULL) {
7546 if (!(uap->flags & CPF_OVERWRITE)) {
7547 error = EEXIST;
7548 goto out;
7549 }
7550 }
7551
7552 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
7553 error = EISDIR;
7554 goto out;
7555 }
7556
7557 /* This calls existing MAC hooks for open */
7558 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
7559 NULL))) {
7560 goto out;
7561 }
7562
7563 if (tvp) {
7564 /*
7565 * See unlinkat_internal for an explanation of the potential
7566 * ENOENT from the MAC hook but the gist is that the MAC hook
7567 * can fail because vn_getpath isn't able to return the full
7568 * path. We choose to ignore this failure.
7569 */
7570 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
7571 if (error && error != ENOENT) {
7572 goto out;
7573 }
7574 error = 0;
7575 }
7576
7577 #if CONFIG_MACF
7578 VATTR_INIT(&va);
7579 VATTR_SET(&va, va_type, fvp->v_type);
7580 /* Mask off all but regular access permissions */
7581 VATTR_SET(&va, va_mode,
7582 ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
7583 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
7584 if (error) {
7585 goto out;
7586 }
7587 #endif /* CONFIG_MACF */
7588
7589 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
7590 goto out;
7591 }
7592
7593 if (fvp == tdvp) {
7594 error = EINVAL;
7595 }
7596 /*
7597 * If source is the same as the destination (that is the
7598 * same inode number) then there is nothing to do.
7599 * (fixed to have POSIX semantics - CSM 3/2/98)
7600 */
7601 if (fvp == tvp) {
7602 error = -1;
7603 }
7604 if (!error) {
7605 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
7606 }
7607 out:
7608 sdvp = tond.ni_startdir;
7609 /*
7610 * nameidone has to happen before we vnode_put(tdvp)
7611 * since it may need to release the fs_nodelock on the tdvp
7612 */
7613 nameidone(&tond);
7614
7615 if (tvp) {
7616 vnode_put(tvp);
7617 }
7618 vnode_put(tdvp);
7619 vnode_put(sdvp);
7620 out1:
7621 vnode_put(fvp);
7622
7623 nameidone(&fromnd);
7624
7625 if (error == -1) {
7626 return 0;
7627 }
7628 return error;
7629 }
7630
7631 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7632
7633 /*
7634 * Helper function for doing clones. The caller is expected to provide an
7635 * iocounted source vnode and release it.
7636 */
7637 static int
7638 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
7639 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
7640 {
7641 vnode_t tvp, tdvp;
7642 struct nameidata tond;
7643 int error;
7644 int follow;
7645 boolean_t free_src_acl;
7646 boolean_t attr_cleanup;
7647 enum vtype v_type;
7648 kauth_action_t action;
7649 struct componentname *cnp;
7650 uint32_t defaulted;
7651 struct vnode_attr va;
7652 struct vnode_attr nva;
7653 uint32_t vnop_flags;
7654
7655 v_type = vnode_vtype(fvp);
7656 switch (v_type) {
7657 case VLNK:
7658 /* FALLTHRU */
7659 case VREG:
7660 action = KAUTH_VNODE_ADD_FILE;
7661 break;
7662 case VDIR:
7663 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
7664 fvp->v_mountedhere) {
7665 return EINVAL;
7666 }
7667 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
7668 break;
7669 default:
7670 return EINVAL;
7671 }
7672
7673 AUDIT_ARG(fd2, dst_dirfd);
7674 AUDIT_ARG(value32, flags);
7675
7676 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7677 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
7678 UIO_USERSPACE, dst, ctx);
7679 if ((error = nameiat(&tond, dst_dirfd))) {
7680 return error;
7681 }
7682 cnp = &tond.ni_cnd;
7683 tdvp = tond.ni_dvp;
7684 tvp = tond.ni_vp;
7685
7686 free_src_acl = FALSE;
7687 attr_cleanup = FALSE;
7688
7689 if (tvp != NULL) {
7690 error = EEXIST;
7691 goto out;
7692 }
7693
7694 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
7695 error = EXDEV;
7696 goto out;
7697 }
7698
7699 #if CONFIG_MACF
7700 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
7701 goto out;
7702 }
7703 #endif
7704 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
7705 goto out;
7706 }
7707
7708 action = KAUTH_VNODE_GENERIC_READ_BITS;
7709 if (data_read_authorised) {
7710 action &= ~KAUTH_VNODE_READ_DATA;
7711 }
7712 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
7713 goto out;
7714 }
7715
7716 /*
7717 * certain attributes may need to be changed from the source; we ask for
7718 * those here.
7719 */
7720 VATTR_INIT(&va);
7721 VATTR_WANTED(&va, va_uid);
7722 VATTR_WANTED(&va, va_gid);
7723 VATTR_WANTED(&va, va_mode);
7724 VATTR_WANTED(&va, va_flags);
7725 VATTR_WANTED(&va, va_acl);
7726
7727 if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
7728 goto out;
7729 }
7730
7731 VATTR_INIT(&nva);
7732 VATTR_SET(&nva, va_type, v_type);
7733 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
7734 VATTR_SET(&nva, va_acl, va.va_acl);
7735 free_src_acl = TRUE;
7736 }
7737
7738 /* Handle ACL inheritance, initialize vap. */
7739 if (v_type == VLNK) {
7740 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
7741 } else {
7742 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
7743 if (error) {
7744 goto out;
7745 }
7746 attr_cleanup = TRUE;
7747 }
7748
7749 vnop_flags = VNODE_CLONEFILE_DEFAULT;
7750 /*
7751 * We've got initial values for all security parameters.
7752 * If we are the superuser, then we can change the owners to be the
7753 * same as the source. Both the superuser and the owner have default
7754 * WRITE_SECURITY privileges, so all other fields can be taken
7755 * from the source as well.
7756 */
7757 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
7758 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
7759 VATTR_SET(&nva, va_uid, va.va_uid);
7760 }
7761 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
7762 VATTR_SET(&nva, va_gid, va.va_gid);
7763 }
7764 } else {
7765 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
7766 }
7767
7768 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
7769 VATTR_SET(&nva, va_mode, va.va_mode);
7770 }
7771 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
7772 VATTR_SET(&nva, va_flags,
7773 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
7774 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
7775 }
7776
7777 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
7778
7779 if (!error && tvp) {
7780 int update_flags = 0;
7781 #if CONFIG_FSE
7782 int fsevent;
7783 #endif /* CONFIG_FSE */
7784
7785 #if CONFIG_MACF
7786 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
7787 VNODE_LABEL_CREATE, ctx);
7788 #endif
7789 /*
7790 * If some of the requested attributes weren't handled by the
7791 * VNOP, use our fallback code.
7792 */
7793 if (!VATTR_ALL_SUPPORTED(&va)) {
7794 (void)vnode_setattr_fallback(tvp, &nva, ctx);
7795 }
7796
7797 // Make sure the name & parent pointers are hooked up
7798 if (tvp->v_name == NULL) {
7799 update_flags |= VNODE_UPDATE_NAME;
7800 }
7801 if (tvp->v_parent == NULLVP) {
7802 update_flags |= VNODE_UPDATE_PARENT;
7803 }
7804
7805 if (update_flags) {
7806 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
7807 cnp->cn_namelen, cnp->cn_hash, update_flags);
7808 }
7809
7810 #if CONFIG_FSE
7811 switch (vnode_vtype(tvp)) {
7812 case VLNK:
7813 /* FALLTHRU */
7814 case VREG:
7815 fsevent = FSE_CREATE_FILE;
7816 break;
7817 case VDIR:
7818 fsevent = FSE_CREATE_DIR;
7819 break;
7820 default:
7821 goto out;
7822 }
7823
7824 if (need_fsevent(fsevent, tvp)) {
7825 /*
7826 * The following is a sequence of three explicit events.
7827 * A pair of FSE_CLONE events representing the source and destination
7828 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
7829 * fseventsd may coalesce the destination clone and create events
7830 * into a single event resulting in the following sequence for a client
7831 * FSE_CLONE (src)
7832 * FSE_CLONE | FSE_CREATE (dst)
7833 */
7834 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
7835 FSE_ARG_DONE);
7836 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
7837 FSE_ARG_DONE);
7838 }
7839 #endif /* CONFIG_FSE */
7840 }
7841
7842 out:
7843 if (attr_cleanup) {
7844 vn_attribute_cleanup(&nva, defaulted);
7845 }
7846 if (free_src_acl && va.va_acl) {
7847 kauth_acl_free(va.va_acl);
7848 }
7849 nameidone(&tond);
7850 if (tvp) {
7851 vnode_put(tvp);
7852 }
7853 vnode_put(tdvp);
7854 return error;
7855 }
7856
7857 /*
7858 * clone files or directories, target must not exist.
7859 */
7860 /* ARGSUSED */
7861 int
7862 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7863 __unused int32_t *retval)
7864 {
7865 vnode_t fvp;
7866 struct nameidata fromnd;
7867 int follow;
7868 int error;
7869 vfs_context_t ctx = vfs_context_current();
7870
7871 /* Check that the flags are valid. */
7872 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7873 return EINVAL;
7874 }
7875
7876 AUDIT_ARG(fd, uap->src_dirfd);
7877
7878 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7879 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7880 UIO_USERSPACE, uap->src, ctx);
7881 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
7882 return error;
7883 }
7884
7885 fvp = fromnd.ni_vp;
7886 nameidone(&fromnd);
7887
7888 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7889 uap->flags, ctx);
7890
7891 vnode_put(fvp);
7892 return error;
7893 }
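/*
 * Illustrative userspace sketch (not compiled into the kernel): clonefileat(2)
 * as handled above.  The destination must not already exist (EEXIST) and both
 * paths must live on the same volume (EXDEV).  The paths used here are
 * placeholders.
 */
#if 0 /* userspace example only */
#include <sys/clonefile.h>
#include <fcntl.h>
#include <stdio.h>

static int
clone_within_volume(const char *src, const char *dst)
{
	/* CLONE_NOFOLLOW clones a symlink itself rather than its target. */
	if (clonefileat(AT_FDCWD, src, AT_FDCWD, dst, CLONE_NOFOLLOW) == -1) {
		perror("clonefileat");
		return -1;
	}
	return 0;
}
#endif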
7894
7895 int
7896 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7897 __unused int32_t *retval)
7898 {
7899 vnode_t fvp;
7900 struct fileproc *fp;
7901 int error;
7902 vfs_context_t ctx = vfs_context_current();
7903
7904 /* Check that the flags are valid. */
7905 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7906 return EINVAL;
7907 }
7908
7909 AUDIT_ARG(fd, uap->src_fd);
7910 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7911 if (error) {
7912 return error;
7913 }
7914
7915 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7916 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7917 error = EBADF;
7918 goto out;
7919 }
7920
7921 if ((error = vnode_getwithref(fvp))) {
7922 goto out;
7923 }
7924
7925 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7926
7927 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7928 uap->flags, ctx);
7929
7930 vnode_put(fvp);
7931 out:
7932 file_drop(uap->src_fd);
7933 return error;
7934 }
7935
7936 static int
7937 rename_submounts_callback(mount_t mp, void *arg)
7938 {
7939 int error = 0;
7940 mount_t pmp = (mount_t)arg;
7941 int prefix_len = strlen(pmp->mnt_vfsstat.f_mntonname);
7942
7943 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
7944 return 0;
7945 }
7946
7947 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
7948 return 0;
7949 }
7950
7951 if ((error = vfs_busy(mp, LK_NOWAIT))) {
7952 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
7953 return -1;
7954 }
7955
7956 int pathlen = MAXPATHLEN;
7957 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
7958 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
7959 }
7960
7961 vfs_unbusy(mp);
7962
7963 return error;
7964 }
7965
7966 /*
7967 * Rename files. Source and destination must either both be directories,
7968 * or both not be directories. If target is a directory, it must be empty.
7969 */
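/*
 * Illustrative userspace sketch (not compiled into the kernel): the extended
 * rename flags validated below are reachable through renamex_np(2) and
 * renameatx_np(2) on macOS.  RENAME_SWAP and RENAME_EXCL are mutually
 * exclusive, which is exactly the EINVAL check at the top of
 * renameat_internal().  The paths are placeholders.
 */
#if 0 /* userspace example only */
#include <stdio.h>

static int
atomic_swap(const char *a, const char *b)
{
	/* Atomically exchange two existing entries on the same volume. */
	if (renamex_np(a, b, RENAME_SWAP) == -1) {
		perror("renamex_np(RENAME_SWAP)");
		return -1;
	}
	return 0;
}

static int
rename_if_absent(const char *from, const char *to)
{
	/* Fail with EEXIST rather than replace an existing destination. */
	if (renamex_np(from, to, RENAME_EXCL) == -1) {
		perror("renamex_np(RENAME_EXCL)");
		return -1;
	}
	return 0;
}
#endif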
7970 /* ARGSUSED */
7971 static int
7972 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7973 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7974 {
7975 if (flags & ~VFS_RENAME_FLAGS_MASK) {
7976 return EINVAL;
7977 }
7978
7979 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
7980 return EINVAL;
7981 }
7982
7983 vnode_t tvp, tdvp;
7984 vnode_t fvp, fdvp;
7985 struct nameidata *fromnd, *tond;
7986 int error;
7987 int do_retry;
7988 int retry_count;
7989 int mntrename;
7990 int need_event;
7991 int need_kpath2;
7992 int has_listeners;
7993 const char *oname = NULL;
7994 char *from_name = NULL, *to_name = NULL;
7995 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
7996 int from_len = 0, to_len = 0;
7997 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
7998 int holding_mntlock;
7999 mount_t locked_mp = NULL;
8000 vnode_t oparent = NULLVP;
8001 #if CONFIG_FSE
8002 fse_info from_finfo, to_finfo;
8003 #endif
8004 int from_truncated = 0, to_truncated = 0;
8005 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8006 int batched = 0;
8007 struct vnode_attr *fvap, *tvap;
8008 int continuing = 0;
8009 /* carving out a chunk for structs that are too big to be on stack. */
8010 struct {
8011 struct nameidata from_node, to_node;
8012 struct vnode_attr fv_attr, tv_attr;
8013 } * __rename_data;
8014 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
8015 fromnd = &__rename_data->from_node;
8016 tond = &__rename_data->to_node;
8017
8018 holding_mntlock = 0;
8019 do_retry = 0;
8020 retry_count = 0;
8021 retry:
8022 fvp = tvp = NULL;
8023 fdvp = tdvp = NULL;
8024 fvap = tvap = NULL;
8025 mntrename = FALSE;
8026
8027 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8028 segflg, from, ctx);
8029 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8030
8031 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8032 segflg, to, ctx);
8033 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8034
8035 continue_lookup:
8036 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8037 if ((error = nameiat(fromnd, fromfd))) {
8038 goto out1;
8039 }
8040 fdvp = fromnd->ni_dvp;
8041 fvp = fromnd->ni_vp;
8042
8043 if (fvp && fvp->v_type == VDIR) {
8044 tond->ni_cnd.cn_flags |= WILLBEDIR;
8045 }
8046 }
8047
8048 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8049 if ((error = nameiat(tond, tofd))) {
8050 /*
8051 * Translate error code for rename("dir1", "dir2/.").
8052 */
8053 if (error == EISDIR && fvp->v_type == VDIR) {
8054 error = EINVAL;
8055 }
8056 goto out1;
8057 }
8058 tdvp = tond->ni_dvp;
8059 tvp = tond->ni_vp;
8060 }
8061
8062 #if DEVELOPMENT || DEBUG
8063 /*
8064 * XXX VSWAP: Check for entitlements or special flag here
8065 * so we can restrict access appropriately.
8066 */
8067 #else /* DEVELOPMENT || DEBUG */
8068
8069 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8070 error = EPERM;
8071 goto out1;
8072 }
8073
8074 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8075 error = EPERM;
8076 goto out1;
8077 }
8078 #endif /* DEVELOPMENT || DEBUG */
8079
8080 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8081 error = ENOENT;
8082 goto out1;
8083 }
8084
8085 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8086 error = EEXIST;
8087 goto out1;
8088 }
8089
8090 batched = vnode_compound_rename_available(fdvp);
8091
8092 #if CONFIG_FSE
8093 need_event = need_fsevent(FSE_RENAME, fdvp);
8094 if (need_event) {
8095 if (fvp) {
8096 get_fse_info(fvp, &from_finfo, ctx);
8097 } else {
8098 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8099 if (error) {
8100 goto out1;
8101 }
8102
8103 fvap = &__rename_data->fv_attr;
8104 }
8105
8106 if (tvp) {
8107 get_fse_info(tvp, &to_finfo, ctx);
8108 } else if (batched) {
8109 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8110 if (error) {
8111 goto out1;
8112 }
8113
8114 tvap = &__rename_data->tv_attr;
8115 }
8116 }
8117 #else
8118 need_event = 0;
8119 #endif /* CONFIG_FSE */
8120
8121 has_listeners = kauth_authorize_fileop_has_listeners();
8122
8123 need_kpath2 = 0;
8124 #if CONFIG_AUDIT
8125 if (AUDIT_RECORD_EXISTS()) {
8126 need_kpath2 = 1;
8127 }
8128 #endif
8129
8130 if (need_event || has_listeners) {
8131 if (from_name == NULL) {
8132 GET_PATH(from_name);
8133 if (from_name == NULL) {
8134 error = ENOMEM;
8135 goto out1;
8136 }
8137 }
8138
8139 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8140
8141 if (from_name_no_firmlink == NULL) {
8142 GET_PATH(from_name_no_firmlink);
8143 if (from_name_no_firmlink == NULL) {
8144 error = ENOMEM;
8145 goto out1;
8146 }
8147 }
8148
8149 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8150 }
8151
8152 if (need_event || need_kpath2 || has_listeners) {
8153 if (to_name == NULL) {
8154 GET_PATH(to_name);
8155 if (to_name == NULL) {
8156 error = ENOMEM;
8157 goto out1;
8158 }
8159 }
8160
8161 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8162
8163 if (to_name_no_firmlink == NULL) {
8164 GET_PATH(to_name_no_firmlink);
8165 if (to_name_no_firmlink == NULL) {
8166 error = ENOMEM;
8167 goto out1;
8168 }
8169 }
8170
8171 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8172 if (to_name && need_kpath2) {
8173 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8174 }
8175 }
8176 if (!fvp) {
8177 /*
8178 * Claim: this check will never reject a valid rename.
8179 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8180 * Suppose fdvp and tdvp are not on the same mount.
8181 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8182 * then you can't move it to within another dir on the same mountpoint.
8183 * If fvp sits atop a vnode on the same mount as tdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8184 *
8185 * If this check passes, then we are safe to pass these vnodes to the same FS.
8186 */
8187 if (fdvp->v_mount != tdvp->v_mount) {
8188 error = EXDEV;
8189 goto out1;
8190 }
8191 goto skipped_lookup;
8192 }
8193
8194 if (!batched) {
8195 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
8196 if (error) {
8197 if (error == ENOENT) {
8198 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8199 /*
8200 * We encountered a race where after doing the namei, tvp stops
8201 * being valid. If so, simply re-drive the rename call from the
8202 * top.
8203 */
8204 do_retry = 1;
8205 retry_count += 1;
8206 }
8207 }
8208 goto out1;
8209 }
8210 }
8211
8212 /*
8213 * If the source and destination are the same (i.e. they're
8214 * links to the same vnode) and the target file system is
8215 * case sensitive, then there is nothing to do.
8216 *
8217 * XXX Come back to this.
8218 */
8219 if (fvp == tvp) {
8220 int pathconf_val;
8221
8222 /*
8223 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8224 * then assume that this file system is case sensitive.
8225 */
8226 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8227 pathconf_val != 0) {
8228 goto out1;
8229 }
8230 }
8231
8232 /*
8233 * Allow the renaming of mount points.
8234 * - target must not exist
8235 * - target must reside in the same directory as source
8236 * - union mounts cannot be renamed
8237 * - "/" cannot be renamed
8238 *
8239 * XXX Handle this in VFS after a continued lookup (if we missed
8240 * in the cache to start off)
8241 *
8242 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8243 * we'll skip past here. The file system is responsible for
8244 * checking that @tvp is not a descendent of @fvp and vice versa
8245 * so it should always return EINVAL if either @tvp or @fvp is the
8246 * root of a volume.
8247 */
8248 if ((fvp->v_flag & VROOT) &&
8249 (fvp->v_type == VDIR) &&
8250 (tvp == NULL) &&
8251 (fvp->v_mountedhere == NULL) &&
8252 (fdvp == tdvp) &&
8253 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8254 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8255 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8256 vnode_t coveredvp;
8257
8258 /* switch fvp to the covered vnode */
8259 coveredvp = fvp->v_mount->mnt_vnodecovered;
8260 if ((vnode_getwithref(coveredvp))) {
8261 error = ENOENT;
8262 goto out1;
8263 }
8264 vnode_put(fvp);
8265
8266 fvp = coveredvp;
8267 mntrename = TRUE;
8268 }
8269 /*
8270 * Check for cross-device rename.
8271 */
8272 if ((fvp->v_mount != tdvp->v_mount) ||
8273 (tvp && (fvp->v_mount != tvp->v_mount))) {
8274 error = EXDEV;
8275 goto out1;
8276 }
8277
8278 /*
8279 * If source is the same as the destination (that is the
8280 * same inode number) then there is nothing to do...
8281 * EXCEPT if the underlying file system supports case
8282 * insensitivity and is case preserving. In this case
8283 * the file system needs to handle the special case of
8284 * getting the same vnode as target (fvp) and source (tvp).
8285 *
8286 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8287 * and _PC_CASE_PRESERVING can have this exception, and they need to
8288 * handle the special case of getting the same vnode as target and
8289 * source. NOTE: Then the target is unlocked going into vnop_rename,
8290 * so not to cause locking problems. There is a single reference on tvp.
8291 *
8292 * NOTE - that fvp == tvp also occurs if they are hard linked and
8293 * that correct behaviour then is just to return success without doing
8294 * anything.
8295 *
8296 * XXX filesystem should take care of this itself, perhaps...
8297 */
8298 if (fvp == tvp && fdvp == tdvp) {
8299 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8300 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8301 fromnd->ni_cnd.cn_namelen)) {
8302 goto out1;
8303 }
8304 }
8305
8306 if (holding_mntlock && fvp->v_mount != locked_mp) {
8307 /*
8308 * we're holding a reference and lock
8309 * on locked_mp, but it no longer matches
8310 * what we want to do... so drop our hold
8311 */
8312 mount_unlock_renames(locked_mp);
8313 mount_drop(locked_mp, 0);
8314 holding_mntlock = 0;
8315 }
8316 if (tdvp != fdvp && fvp->v_type == VDIR) {
8317 /*
8318 * serialize renames that re-shape
8319 * the tree... if holding_mntlock is
8320 * set, then we're ready to go...
8321 * otherwise we
8322 * first need to drop the iocounts
8323 * we picked up, second take the
8324 * lock to serialize the access,
8325 * then finally start the lookup
8326 * process over with the lock held
8327 */
8328 if (!holding_mntlock) {
8329 /*
8330 * need to grab a reference on
8331 * the mount point before we
8332 * drop all the iocounts... once
8333 * the iocounts are gone, the mount
8334 * could follow
8335 */
8336 locked_mp = fvp->v_mount;
8337 mount_ref(locked_mp, 0);
8338
8339 /*
8340 * nameidone has to happen before we vnode_put(tvp)
8341 * since it may need to release the fs_nodelock on the tvp
8342 */
8343 nameidone(tond);
8344
8345 if (tvp) {
8346 vnode_put(tvp);
8347 }
8348 vnode_put(tdvp);
8349
8350 /*
8351 * nameidone has to happen before we vnode_put(fdvp)
8352 * since it may need to release the fs_nodelock on the fvp
8353 */
8354 nameidone(fromnd);
8355
8356 vnode_put(fvp);
8357 vnode_put(fdvp);
8358
8359 mount_lock_renames(locked_mp);
8360 holding_mntlock = 1;
8361
8362 goto retry;
8363 }
8364 } else {
8365 /*
8366 * when we dropped the iocounts to take
8367 * the lock, we allowed the identity of
8368 * the various vnodes to change... if they did,
8369 * we may no longer be dealing with a rename
8370 * that reshapes the tree... once we're holding
8371 * the iocounts, the vnodes can't change type
8372 * so we're free to drop the lock at this point
8373 * and continue on
8374 */
8375 if (holding_mntlock) {
8376 mount_unlock_renames(locked_mp);
8377 mount_drop(locked_mp, 0);
8378 holding_mntlock = 0;
8379 }
8380 }
8381
8382 // save these off so we can later verify that fvp is the same
8383 oname = fvp->v_name;
8384 oparent = fvp->v_parent;
8385
8386 skipped_lookup:
8387 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8388 tdvp, &tvp, &tond->ni_cnd, tvap,
8389 flags, ctx);
8390
8391 if (holding_mntlock) {
8392 /*
8393 * we can drop our serialization
8394 * lock now
8395 */
8396 mount_unlock_renames(locked_mp);
8397 mount_drop(locked_mp, 0);
8398 holding_mntlock = 0;
8399 }
8400 if (error) {
8401 if (error == EDATALESS) {
8402 /*
8403 * If we've been here before, something has gone
8404 * horribly wrong and we should just get out lest
8405 * we spiral around the drain forever.
8406 */
8407 if (flags & VFS_RENAME_DATALESS) {
8408 error = EIO;
8409 goto out1;
8410 }
8411
8412 /*
8413 * The object we're renaming is dataless (or has a
8414 * dataless descendent) and requires materialization
8415 * before the rename occurs. But we're holding the
8416 * mount point's rename lock, so it's not safe to
8417 * make the upcall.
8418 *
8419 * In this case, we release the lock, perform the
8420 * materialization, and start the whole thing over.
8421 */
8422 error = vnode_materialize_dataless_file(fvp,
8423 NAMESPACE_HANDLER_RENAME_OP);
8424
8425 if (error == 0) {
8426 /*
8427 * The next time around we need to tell the
8428 * file system that the materialization has
8429 * been performed.
8430 */
8431 flags |= VFS_RENAME_DATALESS;
8432 do_retry = 1;
8433 }
8434 goto out1;
8435 }
8436 if (error == EKEEPLOOKING) {
8437 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8438 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8439 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8440 }
8441 }
8442
8443 fromnd->ni_vp = fvp;
8444 tond->ni_vp = tvp;
8445
8446 goto continue_lookup;
8447 }
8448
8449 /*
8450 * We may encounter a race in the VNOP where the destination didn't
8451 * exist when we did the namei, but it does by the time we go and
8452 * try to create the entry. In this case, we should re-drive this rename
8453 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8454 * but other filesystems susceptible to this race could return it, too.
8455 */
8456 if (error == ERECYCLE) {
8457 do_retry = 1;
8458 }
8459
8460 /*
8461 * For compound VNOPs, the authorization callback may return
8462 * ENOENT in case of racing hardlink lookups hitting the name
8463 * cache, redrive the lookup.
8464 */
8465 if (batched && error == ENOENT) {
8466 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8467 do_retry = 1;
8468 retry_count += 1;
8469 }
8470 }
8471
8472 goto out1;
8473 }
8474
8475 /* call out to allow 3rd party notification of rename.
8476 * Ignore result of kauth_authorize_fileop call.
8477 */
8478 kauth_authorize_fileop(vfs_context_ucred(ctx),
8479 KAUTH_FILEOP_RENAME,
8480 (uintptr_t)from_name, (uintptr_t)to_name);
8481 if (flags & VFS_RENAME_SWAP) {
8482 kauth_authorize_fileop(vfs_context_ucred(ctx),
8483 KAUTH_FILEOP_RENAME,
8484 (uintptr_t)to_name, (uintptr_t)from_name);
8485 }
8486
8487 #if CONFIG_FSE
8488 if (from_name != NULL && to_name != NULL) {
8489 if (from_truncated || to_truncated) {
8490 // set it here since only the from_finfo gets reported up to user space
8491 from_finfo.mode |= FSE_TRUNCATED_PATH;
8492 }
8493
8494 if (tvap && tvp) {
8495 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8496 }
8497 if (fvap) {
8498 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8499 }
8500
8501 if (tvp) {
8502 add_fsevent(FSE_RENAME, ctx,
8503 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8504 FSE_ARG_FINFO, &from_finfo,
8505 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8506 FSE_ARG_FINFO, &to_finfo,
8507 FSE_ARG_DONE);
8508 if (flags & VFS_RENAME_SWAP) {
8509 /*
8510 * Strictly speaking, swap is the equivalent of
8511 * *three* renames. FSEvents clients should only take
8512 * the events as a hint, so we only bother reporting
8513 * two.
8514 */
8515 add_fsevent(FSE_RENAME, ctx,
8516 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8517 FSE_ARG_FINFO, &to_finfo,
8518 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8519 FSE_ARG_FINFO, &from_finfo,
8520 FSE_ARG_DONE);
8521 }
8522 } else {
8523 add_fsevent(FSE_RENAME, ctx,
8524 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8525 FSE_ARG_FINFO, &from_finfo,
8526 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8527 FSE_ARG_DONE);
8528 }
8529 }
8530 #endif /* CONFIG_FSE */
8531
8532 /*
8533 * update filesystem's mount point data
8534 */
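/*
 * For example, if the directory a file system is mounted on was just
 * renamed, the code below rewrites the final component of that file
 * system's f_mntonname (and, via rename_submounts_callback, the
 * f_mntonname of any submounts) to the last component of the rename
 * target path.
 */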
8535 if (mntrename) {
8536 char *cp, *pathend, *mpname;
8537 char * tobuf;
8538 struct mount *mp;
8539 int maxlen;
8540 size_t len = 0;
8541
8542 mp = fvp->v_mountedhere;
8543
8544 if (vfs_busy(mp, LK_NOWAIT)) {
8545 error = EBUSY;
8546 goto out1;
8547 }
8548 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
8549
8550 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8551 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8552 } else {
8553 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8554 }
8555 if (!error) {
8556 /* find current mount point prefix */
8557 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8558 for (cp = pathend; *cp != '\0'; ++cp) {
8559 if (*cp == '/') {
8560 pathend = cp + 1;
8561 }
8562 }
8563 /* find last component of target name */
8564 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8565 if (*cp == '/') {
8566 mpname = cp + 1;
8567 }
8568 }
8569
8570 /* Update f_mntonname of sub mounts */
8571 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8572
8573 /* append name to prefix */
8574 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
8575 bzero(pathend, maxlen);
8576
8577 strlcpy(pathend, mpname, maxlen);
8578 }
8579 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
8580
8581 vfs_unbusy(mp);
8582
8583 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8584 }
8585 /*
8586 * fix up name & parent pointers. note that we first
8587 * check that fvp has the same name/parent pointers it
8588 * had before the rename call... this is a 'weak' check
8589 * at best...
8590 *
8591 * XXX oparent and oname may not be set in the compound vnop case
8592 */
8593 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8594 int update_flags;
8595
8596 update_flags = VNODE_UPDATE_NAME;
8597
8598 if (fdvp != tdvp) {
8599 update_flags |= VNODE_UPDATE_PARENT;
8600 }
8601
8602 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8603 }
8604 out1:
8605 if (to_name != NULL) {
8606 RELEASE_PATH(to_name);
8607 to_name = NULL;
8608 }
8609 if (to_name_no_firmlink != NULL) {
8610 RELEASE_PATH(to_name_no_firmlink);
8611 to_name_no_firmlink = NULL;
8612 }
8613 if (from_name != NULL) {
8614 RELEASE_PATH(from_name);
8615 from_name = NULL;
8616 }
8617 if (from_name_no_firmlink != NULL) {
8618 RELEASE_PATH(from_name_no_firmlink);
8619 from_name_no_firmlink = NULL;
8620 }
8621 if (holding_mntlock) {
8622 mount_unlock_renames(locked_mp);
8623 mount_drop(locked_mp, 0);
8624 holding_mntlock = 0;
8625 }
8626 if (tdvp) {
8627 /*
8628 * nameidone has to happen before we vnode_put(tdvp)
8629 * since it may need to release the fs_nodelock on the tdvp
8630 */
8631 nameidone(tond);
8632
8633 if (tvp) {
8634 vnode_put(tvp);
8635 }
8636 vnode_put(tdvp);
8637 }
8638 if (fdvp) {
8639 /*
8640 * nameidone has to happen before we vnode_put(fdvp)
8641 * since it may need to release the fs_nodelock on the fdvp
8642 */
8643 nameidone(fromnd);
8644
8645 if (fvp) {
8646 vnode_put(fvp);
8647 }
8648 vnode_put(fdvp);
8649 }
8650
8651 /*
8652 * If things changed after we did the namei, then we will re-drive
8653 * this rename call from the top.
8654 */
8655 if (do_retry) {
8656 do_retry = 0;
8657 goto retry;
8658 }
8659
8660 FREE(__rename_data, M_TEMP);
8661 return error;
8662 }
8663
8664 int
8665 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
8666 {
8667 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
8668 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
8669 }
8670
8671 int
8672 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
8673 {
8674 return renameat_internal(
8675 vfs_context_current(),
8676 uap->fromfd, uap->from,
8677 uap->tofd, uap->to,
8678 UIO_USERSPACE, uap->flags);
8679 }
8680
8681 int
8682 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
8683 {
8684 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
8685 uap->tofd, uap->to, UIO_USERSPACE, 0);
8686 }
8687
8688 /*
8689 * Make a directory file.
8690 *
8691 * Returns: 0 Success
8692 * EEXIST
8693 * namei:???
8694 * vnode_authorize:???
8695 * vn_create:???
8696 */
8697 /* ARGSUSED */
8698 static int
8699 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
8700 enum uio_seg segflg)
8701 {
8702 vnode_t vp, dvp;
8703 int error;
8704 int update_flags = 0;
8705 int batched;
8706 struct nameidata nd;
8707
8708 AUDIT_ARG(mode, vap->va_mode);
8709 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
8710 path, ctx);
8711 nd.ni_cnd.cn_flags |= WILLBEDIR;
8712 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
8713
8714 continue_lookup:
8715 error = nameiat(&nd, fd);
8716 if (error) {
8717 return error;
8718 }
8719 dvp = nd.ni_dvp;
8720 vp = nd.ni_vp;
8721
8722 if (vp != NULL) {
8723 error = EEXIST;
8724 goto out;
8725 }
8726
8727 batched = vnode_compound_mkdir_available(dvp);
8728
8729 VATTR_SET(vap, va_type, VDIR);
8730
8731 /*
8732 * XXX
8733 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
8734 * only get EEXIST or EISDIR for existing path components, and not that it could see
8735 * EACCES/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
8736 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
8737 */
8738 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
8739 if (error == EACCES || error == EPERM) {
8740 int error2;
8741
8742 nameidone(&nd);
8743 vnode_put(dvp);
8744 dvp = NULLVP;
8745
8746 /*
8747 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
8748 * rather than EACCES if the target exists.
8749 */
8750 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
8751 path, ctx);
8752 error2 = nameiat(&nd, fd);
8753 if (error2) {
8754 goto out;
8755 } else {
8756 vp = nd.ni_vp;
8757 error = EEXIST;
8758 goto out;
8759 }
8760 }
8761
8762 goto out;
8763 }
8764
8765 /*
8766 * make the directory
8767 */
8768 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
8769 if (error == EKEEPLOOKING) {
8770 nd.ni_vp = vp;
8771 goto continue_lookup;
8772 }
8773
8774 goto out;
8775 }
8776
8777 // Make sure the name & parent pointers are hooked up
8778 if (vp->v_name == NULL) {
8779 update_flags |= VNODE_UPDATE_NAME;
8780 }
8781 if (vp->v_parent == NULLVP) {
8782 update_flags |= VNODE_UPDATE_PARENT;
8783 }
8784
8785 if (update_flags) {
8786 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
8787 }
8788
8789 #if CONFIG_FSE
8790 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
8791 #endif
8792
8793 out:
8794 /*
8795 * nameidone has to happen before we vnode_put(dvp)
8796 * since it may need to release the fs_nodelock on the dvp
8797 */
8798 nameidone(&nd);
8799
8800 if (vp) {
8801 vnode_put(vp);
8802 }
8803 if (dvp) {
8804 vnode_put(dvp);
8805 }
8806
8807 return error;
8808 }
8809
8810 /*
8811 * mkdir_extended: Create a directory; with extended security (ACL).
8812 *
8813 * Parameters: p Process requesting to create the directory
8814 * uap User argument descriptor (see below)
8815 * retval (ignored)
8816 *
8817 * Indirect: uap->path Path of directory to create
8818 * uap->mode Access permissions to set
8819 * uap->xsecurity ACL to set
8820 *
8821 * Returns: 0 Success
8822 * !0 Not success
8823 *
8824 */
8825 int
8826 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8827 {
8828 int ciferror;
8829 kauth_filesec_t xsecdst;
8830 struct vnode_attr va;
8831
8832 AUDIT_ARG(owner, uap->uid, uap->gid);
8833
8834 xsecdst = NULL;
8835 if ((uap->xsecurity != USER_ADDR_NULL) &&
8836 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
8837 return ciferror;
8838 }
8839
8840 VATTR_INIT(&va);
8841 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8842 if (xsecdst != NULL) {
8843 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8844 }
8845
8846 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8847 UIO_USERSPACE);
8848 if (xsecdst != NULL) {
8849 kauth_filesec_free(xsecdst);
8850 }
8851 return ciferror;
8852 }
8853
8854 int
8855 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8856 {
8857 struct vnode_attr va;
8858
8859 VATTR_INIT(&va);
8860 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8861
8862 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8863 UIO_USERSPACE);
8864 }
8865
8866 int
8867 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8868 {
8869 struct vnode_attr va;
8870
8871 VATTR_INIT(&va);
8872 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8873
8874 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8875 UIO_USERSPACE);
8876 }
8877
8878 static int
8879 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
8880 enum uio_seg segflg, int unlink_flags)
8881 {
8882 vnode_t vp, dvp;
8883 int error;
8884 struct nameidata nd;
8885 char *path = NULL;
8886 char *no_firmlink_path = NULL;
8887 int len_path = 0;
8888 int len_no_firmlink_path = 0;
8889 int has_listeners = 0;
8890 int need_event = 0;
8891 int truncated_path = 0;
8892 int truncated_no_firmlink_path = 0;
8893 #if CONFIG_FSE
8894 struct vnode_attr va;
8895 #endif /* CONFIG_FSE */
8896 struct vnode_attr *vap = NULL;
8897 int restart_count = 0;
8898 int batched;
8899
8900 int restart_flag;
8901
8902 /*
8903 * This loop exists to restart rmdir in the unlikely case that two
8904 * processes are simultaneously trying to remove the same directory
8905 * containing orphaned appleDouble files.
8906 */
8907 do {
8908 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
8909 segflg, dirpath, ctx);
8910 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
8911 continue_lookup:
8912 restart_flag = 0;
8913 vap = NULL;
8914
8915 error = nameiat(&nd, fd);
8916 if (error) {
8917 return error;
8918 }
8919
8920 dvp = nd.ni_dvp;
8921 vp = nd.ni_vp;
8922
8923 if (vp) {
8924 batched = vnode_compound_rmdir_available(vp);
8925
8926 if (vp->v_flag & VROOT) {
8927 /*
8928 * The root of a mounted filesystem cannot be deleted.
8929 */
8930 error = EBUSY;
8931 goto out;
8932 }
8933
8934 #if DEVELOPMENT || DEBUG
8935 /*
8936 * XXX VSWAP: Check for entitlements or special flag here
8937 * so we can restrict access appropriately.
8938 */
8939 #else /* DEVELOPMENT || DEBUG */
8940
8941 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
8942 error = EPERM;
8943 goto out;
8944 }
8945 #endif /* DEVELOPMENT || DEBUG */
8946
8947 /*
8948 * Removed a check here; we used to abort if vp's vid
8949 * was not the same as what we'd seen the last time around.
8950 * I do not think that check was valid, because if we retry
8951 * and all dirents are gone, the directory could legitimately
8952 * be recycled but still be present in a situation where we would
8953 * have had permission to delete. Therefore, we won't make
8954 * an effort to preserve that check now that we may not have a
8955 * vp here.
8956 */
8957
8958 if (!batched) {
8959 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
8960 if (error) {
8961 if (error == ENOENT) {
8962 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8963 restart_flag = 1;
8964 restart_count += 1;
8965 }
8966 }
8967 goto out;
8968 }
8969 }
8970 } else {
8971 batched = 1;
8972
8973 if (!vnode_compound_rmdir_available(dvp)) {
8974 panic("No error, but no compound rmdir?");
8975 }
8976 }
8977
8978 #if CONFIG_FSE
8979 fse_info finfo;
8980
8981 need_event = need_fsevent(FSE_DELETE, dvp);
8982 if (need_event) {
8983 if (!batched) {
8984 get_fse_info(vp, &finfo, ctx);
8985 } else {
8986 error = vfs_get_notify_attributes(&va);
8987 if (error) {
8988 goto out;
8989 }
8990
8991 vap = &va;
8992 }
8993 }
8994 #endif
8995 has_listeners = kauth_authorize_fileop_has_listeners();
8996 if (need_event || has_listeners) {
8997 if (path == NULL) {
8998 GET_PATH(path);
8999 if (path == NULL) {
9000 error = ENOMEM;
9001 goto out;
9002 }
9003 }
9004
9005 len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
9006
9007 if (no_firmlink_path == NULL) {
9008 GET_PATH(no_firmlink_path);
9009 if (no_firmlink_path == NULL) {
9010 error = ENOMEM;
9011 goto out;
9012 }
9013 }
9014
9015 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
9016 #if CONFIG_FSE
9017 if (truncated_no_firmlink_path) {
9018 finfo.mode |= FSE_TRUNCATED_PATH;
9019 }
9020 #endif
9021 }
9022
9023 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
9024 nd.ni_vp = vp;
9025 if (vp == NULLVP) {
9026 /* Couldn't find a vnode */
9027 goto out;
9028 }
9029
9030 if (error == EKEEPLOOKING) {
9031 goto continue_lookup;
9032 } else if (batched && error == ENOENT) {
9033 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9034 /*
9035 * For compound VNOPs, the authorization callback
9036 * may return ENOENT in case of racing hard link lookups;
9037 * redrive the lookup.
9038 */
9039 restart_flag = 1;
9040 restart_count += 1;
9041 goto out;
9042 }
9043 }
9044
9045 /*
9046 * XXX There's no provision for passing flags
9047 * to VNOP_RMDIR(). So, if vn_rmdir() fails
9048 * because it's not empty, then we try again
9049 * with VNOP_REMOVE(), passing in a special
9050 * flag that clever file systems will know
9051 * how to handle.
9052 */
9053 if (error == ENOTEMPTY &&
9054 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
9055 /*
9056 * If this fails, we want to keep the original
9057 * error.
9058 */
9059 if (vn_remove(dvp, &vp, &nd,
9060 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
9061 error = 0;
9062 }
9063 }
9064
9065 #if CONFIG_APPLEDOUBLE
9066 /*
9067 * Special case to remove orphaned AppleDouble
9068 * files. I don't like putting this in the kernel,
9069 * but carbon does not like putting this in carbon either,
9070 * so here we are.
9071 */
9072 if (error == ENOTEMPTY) {
9073 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
9074 if (ad_error == EBUSY) {
9075 error = ad_error;
9076 goto out;
9077 }
9078
9079
9080 /*
9081 * Assuming everything went well, we will try the RMDIR again
9082 */
9083 if (!ad_error) {
9084 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
9085 }
9086 }
9087 #endif /* CONFIG_APPLEDOUBLE */
9088 /*
9089 * Call out to allow 3rd party notification of delete.
9090 * Ignore result of kauth_authorize_fileop call.
9091 */
9092 if (!error) {
9093 if (has_listeners) {
9094 kauth_authorize_fileop(vfs_context_ucred(ctx),
9095 KAUTH_FILEOP_DELETE,
9096 (uintptr_t)vp,
9097 (uintptr_t)path);
9098 }
9099
9100 if (vp->v_flag & VISHARDLINK) {
9101 // see the comment in unlink1() about why we update
9102 // the parent of a hard link when it is removed
9103 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
9104 }
9105
9106 #if CONFIG_FSE
9107 if (need_event) {
9108 if (vap) {
9109 vnode_get_fse_info_from_vap(vp, &finfo, vap);
9110 }
9111 add_fsevent(FSE_DELETE, ctx,
9112 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
9113 FSE_ARG_FINFO, &finfo,
9114 FSE_ARG_DONE);
9115 }
9116 #endif
9117 }
9118
9119 out:
9120 if (path != NULL) {
9121 RELEASE_PATH(path);
9122 path = NULL;
9123 }
9124
9125 if (no_firmlink_path != NULL) {
9126 RELEASE_PATH(no_firmlink_path);
9127 no_firmlink_path = NULL;
9128 }
9129
9130 /*
9131 * nameidone has to happen before we vnode_put(dvp)
9132 * since it may need to release the fs_nodelock on the dvp
9133 */
9134 nameidone(&nd);
9135 vnode_put(dvp);
9136
9137 if (vp) {
9138 vnode_put(vp);
9139 }
9140
9141 if (restart_flag == 0) {
9142 wakeup_one((caddr_t)vp);
9143 return error;
9144 }
9145 tsleep(vp, PVFS, "rm AD", 1);
9146 } while (restart_flag != 0);
9147
9148 return error;
9149 }
9150
9151 /*
9152 * Remove a directory file.
9153 */
9154 /* ARGSUSED */
9155 int
9156 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9157 {
9158 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9159 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9160 }
9161
9162 /* Get direntry length padded to 8 byte alignment */
9163 #define DIRENT64_LEN(namlen) \
9164 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
9165
9166 /* Get dirent length padded to 4 byte alignment */
9167 #define DIRENT_LEN(namelen) \
9168 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
9169
9170 /* Get the end of this dirent */
9171 #define DIRENT_END(dep) \
9172 (((char *)(dep)) + (dep)->d_reclen - 1)
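/*
 * For example, a three-character name gives DIRENT64_LEN(3) == 32 and
 * DIRENT_LEN(3) == 12, the worst-case record sizes that the 3/8 buffer
 * sizing in vnode_readdir64() below is based on.
 */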
9173
9174 errno_t
9175 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9176 int *numdirent, vfs_context_t ctxp)
9177 {
9178 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9179 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9180 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9181 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9182 } else {
9183 size_t bufsize;
9184 void * bufptr;
9185 uio_t auio;
9186 struct direntry *entry64;
9187 struct dirent *dep;
9188 int bytesread;
9189 int error;
9190
9191 /*
9192 * We're here because the underlying file system does not
9193 * support direntries, or we mounted denying support, so we must
9194 * fall back to dirents and convert them to direntries.
9195 *
9196 * Our kernel buffer needs to be smaller since re-packing will
9197 * expand each dirent. The worst case (when the name length
9198 * is 3 or less) corresponds to a struct direntry size of 32
9199 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9200 * (4-byte aligned). So having a buffer that is 3/8 the size
9201 * will prevent us from reading more than we can pack.
9202 *
9203 * Since this buffer is wired memory, we will limit the
9204 * buffer size to a maximum of 32K. We would really like to
9205 * use 32K in the MIN(), but we use magic number 87371 to
9206 * prevent uio_resid() * 3 / 8 from overflowing.
9207 */
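/*
 * Worked example of the 3/8 ratio: in the worst case a 12-byte dirent
 * (name length <= 3) repacks into a 32-byte direntry, so output grows
 * by at most a factor of 32/12 = 8/3; reading into a kernel buffer
 * that is 3/8 of the user's residual therefore can never produce more
 * repacked output than the user buffer holds. With the 87371 cap,
 * bufsize tops out at 3 * 87371 / 8 = 32764 bytes, just under 32K.
 */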
9208 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9209 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
9210 if (bufptr == NULL) {
9211 return ENOMEM;
9212 }
9213
9214 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9215 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9216 auio->uio_offset = uio->uio_offset;
9217
9218 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9219
9220 dep = (struct dirent *)bufptr;
9221 bytesread = bufsize - uio_resid(auio);
9222
9223 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
9224 M_TEMP, M_WAITOK);
9225 /*
9226 * Convert all the entries and copy them out to user's buffer.
9227 */
9228 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9229 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9230
9231 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9232 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9233 printf("%s: %s: Bad dirent received from directory %s\n", __func__,
9234 vp->v_mount->mnt_vfsstat.f_mntonname,
9235 vp->v_name ? vp->v_name : "<unknown>");
9236 error = EIO;
9237 break;
9238 }
9239
9240 bzero(entry64, enbufsize);
9241 /* Convert a dirent to a dirent64. */
9242 entry64->d_ino = dep->d_ino;
9243 entry64->d_seekoff = 0;
9244 entry64->d_reclen = enbufsize;
9245 entry64->d_namlen = dep->d_namlen;
9246 entry64->d_type = dep->d_type;
9247 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9248
9249 /* Move to next entry. */
9250 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9251
9252 /* Copy entry64 to user's buffer. */
9253 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9254 }
9255
9256 /* Update the real offset using the offset we got from VNOP_READDIR. */
9257 if (error == 0) {
9258 uio->uio_offset = auio->uio_offset;
9259 }
9260 uio_free(auio);
9261 FREE(bufptr, M_TEMP);
9262 FREE(entry64, M_TEMP);
9263 return error;
9264 }
9265 }
9266
9267 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9268
9269 /*
9270 * Read a block of directory entries in a file system independent format.
9271 */
9272 static int
9273 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
9274 off_t *offset, int *eofflag, int flags)
9275 {
9276 vnode_t vp;
9277 struct vfs_context context = *vfs_context_current(); /* local copy */
9278 struct fileproc *fp;
9279 uio_t auio;
9280 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9281 off_t loff;
9282 int error, numdirent;
9283 char uio_buf[UIO_SIZEOF(1)];
9284
9285 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
9286 if (error) {
9287 return error;
9288 }
9289 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
9290 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9291 error = EBADF;
9292 goto out;
9293 }
9294
9295 if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
9296 bufsize = GETDIRENTRIES_MAXBUFSIZE;
9297 }
9298
9299 #if CONFIG_MACF
9300 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
9301 if (error) {
9302 goto out;
9303 }
9304 #endif
9305 if ((error = vnode_getwithref(vp))) {
9306 goto out;
9307 }
9308 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9309
9310 unionread:
9311 if (vp->v_type != VDIR) {
9312 (void)vnode_put(vp);
9313 error = EINVAL;
9314 goto out;
9315 }
9316
9317 #if CONFIG_MACF
9318 error = mac_vnode_check_readdir(&context, vp);
9319 if (error != 0) {
9320 (void)vnode_put(vp);
9321 goto out;
9322 }
9323 #endif /* MAC */
9324
9325 loff = fp->f_fglob->fg_offset;
9326 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9327 uio_addiov(auio, bufp, bufsize);
9328
9329 if (flags & VNODE_READDIR_EXTENDED) {
9330 error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
9331 fp->f_fglob->fg_offset = uio_offset(auio);
9332 } else {
9333 error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
9334 fp->f_fglob->fg_offset = uio_offset(auio);
9335 }
9336 if (error) {
9337 (void)vnode_put(vp);
9338 goto out;
9339 }
9340
9341 if ((user_ssize_t)bufsize == uio_resid(auio)) {
9342 if (union_dircheckp) {
9343 error = union_dircheckp(&vp, fp, &context);
9344 if (error == -1) {
9345 goto unionread;
9346 }
9347 if (error) {
9348 (void)vnode_put(vp);
9349 goto out;
9350 }
9351 }
9352
9353 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
9354 struct vnode *tvp = vp;
9355 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
9356 vnode_ref(vp);
9357 fp->f_fglob->fg_data = (caddr_t) vp;
9358 fp->f_fglob->fg_offset = 0;
9359 vnode_rele(tvp);
9360 vnode_put(tvp);
9361 goto unionread;
9362 }
9363 vp = tvp;
9364 }
9365 }
9366
9367 vnode_put(vp);
9368 if (offset) {
9369 *offset = loff;
9370 }
9371
9372 *bytesread = bufsize - uio_resid(auio);
9373 out:
9374 file_drop(fd);
9375 return error;
9376 }
9377
9378
9379 int
9380 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9381 {
9382 off_t offset;
9383 ssize_t bytesread;
9384 int error, eofflag;
9385
9386 AUDIT_ARG(fd, uap->fd);
9387 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9388 &bytesread, &offset, &eofflag, 0);
9389
9390 if (error == 0) {
9391 if (proc_is64bit(p)) {
9392 user64_long_t base = (user64_long_t)offset;
9393 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9394 } else {
9395 user32_long_t base = (user32_long_t)offset;
9396 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9397 }
9398 *retval = bytesread;
9399 }
9400 return error;
9401 }
9402
9403 int
9404 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9405 {
9406 off_t offset;
9407 ssize_t bytesread;
9408 int error, eofflag;
9409 user_size_t bufsize;
9410
9411 AUDIT_ARG(fd, uap->fd);
9412
9413 /*
9414 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9415 * then the kernel carves out the last 4 bytes to return extended
9416 * information to userspace (namely whether we reached EOF with this call).
9417 */
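/*
 * In the extended case the user buffer is laid out as
 * [uap->bufsize - 4 bytes of packed struct direntry records]
 * [4-byte getdirentries64_flags_t], with GETDIRENTRIES64_EOF set in
 * the trailing word when the whole directory has been read.
 */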
9418 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9419 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9420 } else {
9421 bufsize = uap->bufsize;
9422 }
9423
9424 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9425 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9426
9427 if (error == 0) {
9428 *retval = bytesread;
9429 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9430
9431 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9432 getdirentries64_flags_t flags = 0;
9433 if (eofflag) {
9434 flags |= GETDIRENTRIES64_EOF;
9435 }
9436 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9437 sizeof(flags));
9438 }
9439 }
9440 return error;
9441 }
9442
9443
9444 /*
9445 * Set the mode mask for creation of filesystem nodes.
9446 * XXX implement xsecurity
9447 */
9448 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9449 static int
9450 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9451 {
9452 struct filedesc *fdp;
9453
9454 AUDIT_ARG(mask, newmask);
9455 proc_fdlock(p);
9456 fdp = p->p_fd;
9457 *retval = fdp->fd_cmask;
9458 fdp->fd_cmask = newmask & ALLPERMS;
9459 proc_fdunlock(p);
9460 return 0;
9461 }
9462
9463 /*
9464 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9465 *
9466 * Parameters: p Process requesting to set the umask
9467 * uap User argument descriptor (see below)
9468 * retval umask of the process (parameter p)
9469 *
9470 * Indirect: uap->newmask umask to set
9471 * uap->xsecurity ACL to set
9472 *
9473 * Returns: 0 Success
9474 * !0 Not success
9475 *
9476 */
9477 int
9478 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9479 {
9480 int ciferror;
9481 kauth_filesec_t xsecdst;
9482
9483 xsecdst = KAUTH_FILESEC_NONE;
9484 if (uap->xsecurity != USER_ADDR_NULL) {
9485 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
9486 return ciferror;
9487 }
9488 } else {
9489 xsecdst = KAUTH_FILESEC_NONE;
9490 }
9491
9492 ciferror = umask1(p, uap->newmask, xsecdst, retval);
9493
9494 if (xsecdst != KAUTH_FILESEC_NONE) {
9495 kauth_filesec_free(xsecdst);
9496 }
9497 return ciferror;
9498 }
9499
9500 int
9501 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9502 {
9503 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9504 }
9505
9506 /*
9507 * Void all references to file by ripping underlying filesystem
9508 * away from vnode.
9509 */
9510 /* ARGSUSED */
9511 int
9512 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
9513 {
9514 vnode_t vp;
9515 struct vnode_attr va;
9516 vfs_context_t ctx = vfs_context_current();
9517 int error;
9518 struct nameidata nd;
9519
9520 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
9521 uap->path, ctx);
9522 error = namei(&nd);
9523 if (error) {
9524 return error;
9525 }
9526 vp = nd.ni_vp;
9527
9528 nameidone(&nd);
9529
9530 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
9531 error = ENOTSUP;
9532 goto out;
9533 }
9534
9535 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
9536 error = EBUSY;
9537 goto out;
9538 }
9539
9540 #if CONFIG_MACF
9541 error = mac_vnode_check_revoke(ctx, vp);
9542 if (error) {
9543 goto out;
9544 }
9545 #endif
9546
9547 VATTR_INIT(&va);
9548 VATTR_WANTED(&va, va_uid);
9549 if ((error = vnode_getattr(vp, &va, ctx))) {
9550 goto out;
9551 }
9552 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
9553 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
9554 goto out;
9555 }
9556 if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
9557 VNOP_REVOKE(vp, REVOKEALL, ctx);
9558 }
9559 out:
9560 vnode_put(vp);
9561 return error;
9562 }
9563
9564
9565 /*
9566 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
9567 * The following system calls are designed to support features
9568 * which are specific to the HFS & HFS Plus volume formats
9569 */
9570
9571
9572 /*
9573 * Obtain attribute information on objects in a directory while enumerating
9574 * the directory.
9575 */
9576 /* ARGSUSED */
9577 int
9578 getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
9579 {
9580 vnode_t vp;
9581 struct fileproc *fp;
9582 uio_t auio = NULL;
9583 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9584 uint32_t count = 0, savecount = 0;
9585 uint32_t newstate = 0;
9586 int error, eofflag;
9587 uint32_t loff = 0;
9588 struct attrlist attributelist;
9589 vfs_context_t ctx = vfs_context_current();
9590 int fd = uap->fd;
9591 char uio_buf[UIO_SIZEOF(1)];
9592 kauth_action_t action;
9593
9594 AUDIT_ARG(fd, fd);
9595
9596 /* Get the attributes into kernel space */
9597 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
9598 return error;
9599 }
9600 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
9601 return error;
9602 }
9603 savecount = count;
9604 if ((error = fp_getfvp(p, fd, &fp, &vp))) {
9605 return error;
9606 }
9607 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
9608 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9609 error = EBADF;
9610 goto out;
9611 }
9612
9613
9614 #if CONFIG_MACF
9615 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
9616 fp->f_fglob);
9617 if (error) {
9618 goto out;
9619 }
9620 #endif
9621
9622
9623 if ((error = vnode_getwithref(vp))) {
9624 goto out;
9625 }
9626
9627 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9628
9629 unionread:
9630 if (vp->v_type != VDIR) {
9631 (void)vnode_put(vp);
9632 error = EINVAL;
9633 goto out;
9634 }
9635
9636 #if CONFIG_MACF
9637 error = mac_vnode_check_readdir(ctx, vp);
9638 if (error != 0) {
9639 (void)vnode_put(vp);
9640 goto out;
9641 }
9642 #endif /* MAC */
9643
9644 /* set up the uio structure which will contain the users return buffer */
9645 loff = fp->f_fglob->fg_offset;
9646 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9647 uio_addiov(auio, uap->buffer, uap->buffersize);
9648
9649 /*
9650 * If the only item requested is file names, we can let that pass with
9651 * just LIST_DIRECTORY. If they want any other attributes, that means
9652 * they need SEARCH as well.
9653 */
9654 action = KAUTH_VNODE_LIST_DIRECTORY;
9655 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
9656 attributelist.fileattr || attributelist.dirattr) {
9657 action |= KAUTH_VNODE_SEARCH;
9658 }
9659
9660 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
9661 /* Believe it or not, uap->options only has 32-bits of valid
9662 * info, so truncate before extending again */
9663
9664 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
9665 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
9666 }
9667
9668 if (error) {
9669 (void) vnode_put(vp);
9670 goto out;
9671 }
9672
9673 /*
9674 * If we've got the last entry of a directory in a union mount
9675 * then reset the eofflag and pretend there's still more to come.
9676 * The next call will again set eofflag and the buffer will be empty,
9677 * so traverse to the underlying directory and do the directory
9678 * read there.
9679 */
9680 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
9681 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
9682 eofflag = 0;
9683 } else { // Empty buffer
9684 struct vnode *tvp = vp;
9685 if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
9686 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
9687 fp->f_fglob->fg_data = (caddr_t) vp;
9688 fp->f_fglob->fg_offset = 0; // reset index for new dir
9689 count = savecount;
9690 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
9691 vnode_put(tvp);
9692 goto unionread;
9693 }
9694 vp = tvp;
9695 }
9696 }
9697
9698 (void)vnode_put(vp);
9699
9700 if (error) {
9701 goto out;
9702 }
9703 fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
9704
9705 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
9706 goto out;
9707 }
9708 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
9709 goto out;
9710 }
9711 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
9712 goto out;
9713 }
9714
9715 *retval = eofflag; /* similar to getdirentries */
9716 error = 0;
9717 out:
9718 file_drop(fd);
9719 return error; /* error was returned earlier; retval is 0 or 1 now */
9720 } /* end of getdirentriesattr system call */
9721
9722 /*
9723 * Exchange data between two files
9724 */
9725
9726 /* ARGSUSED */
9727 int
9728 exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
9729 {
9730 struct nameidata fnd, snd;
9731 vfs_context_t ctx = vfs_context_current();
9732 vnode_t fvp;
9733 vnode_t svp;
9734 int error;
9735 u_int32_t nameiflags;
9736 char *fpath = NULL;
9737 char *spath = NULL;
9738 int flen = 0, slen = 0;
9739 int from_truncated = 0, to_truncated = 0;
9740 #if CONFIG_FSE
9741 fse_info f_finfo, s_finfo;
9742 #endif
9743
9744 nameiflags = 0;
9745 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
9746 nameiflags |= FOLLOW;
9747 }
9748
9749 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
9750 UIO_USERSPACE, uap->path1, ctx);
9751
9752 error = namei(&fnd);
9753 if (error) {
9754 goto out2;
9755 }
9756
9757 nameidone(&fnd);
9758 fvp = fnd.ni_vp;
9759
9760 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
9761 UIO_USERSPACE, uap->path2, ctx);
9762
9763 error = namei(&snd);
9764 if (error) {
9765 vnode_put(fvp);
9766 goto out2;
9767 }
9768 nameidone(&snd);
9769 svp = snd.ni_vp;
9770
9771 /*
9772 * if the files are the same, return an inval error
9773 */
9774 if (svp == fvp) {
9775 error = EINVAL;
9776 goto out;
9777 }
9778
9779 /*
9780 * if the files are on different volumes, return an error
9781 */
9782 if (svp->v_mount != fvp->v_mount) {
9783 error = EXDEV;
9784 goto out;
9785 }
9786
9787 /* If they're not files, return an error */
9788 if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
9789 error = EINVAL;
9790 goto out;
9791 }
9792
9793 #if CONFIG_MACF
9794 error = mac_vnode_check_exchangedata(ctx,
9795 fvp, svp);
9796 if (error) {
9797 goto out;
9798 }
9799 #endif
9800 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
9801 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
9802 goto out;
9803 }
9804
9805 if (
9806 #if CONFIG_FSE
9807 need_fsevent(FSE_EXCHANGE, fvp) ||
9808 #endif
9809 kauth_authorize_fileop_has_listeners()) {
9810 GET_PATH(fpath);
9811 GET_PATH(spath);
9812 if (fpath == NULL || spath == NULL) {
9813 error = ENOMEM;
9814 goto out;
9815 }
9816
9817 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
9818 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
9819
9820 #if CONFIG_FSE
9821 get_fse_info(fvp, &f_finfo, ctx);
9822 get_fse_info(svp, &s_finfo, ctx);
9823 if (from_truncated || to_truncated) {
9824 // set it here since only the f_finfo gets reported up to user space
9825 f_finfo.mode |= FSE_TRUNCATED_PATH;
9826 }
9827 #endif
9828 }
9829 /* Ok, make the call */
9830 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
9831
9832 if (error == 0) {
9833 const char *tmpname;
9834
9835 if (fpath != NULL && spath != NULL) {
9836 /* call out to allow 3rd party notification of exchangedata.
9837 * Ignore result of kauth_authorize_fileop call.
9838 */
9839 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
9840 (uintptr_t)fpath, (uintptr_t)spath);
9841 }
9842 name_cache_lock();
9843
9844 tmpname = fvp->v_name;
9845 fvp->v_name = svp->v_name;
9846 svp->v_name = tmpname;
9847
9848 if (fvp->v_parent != svp->v_parent) {
9849 vnode_t tmp;
9850
9851 tmp = fvp->v_parent;
9852 fvp->v_parent = svp->v_parent;
9853 svp->v_parent = tmp;
9854 }
9855 name_cache_unlock();
9856
9857 #if CONFIG_FSE
9858 if (fpath != NULL && spath != NULL) {
9859 add_fsevent(FSE_EXCHANGE, ctx,
9860 FSE_ARG_STRING, flen, fpath,
9861 FSE_ARG_FINFO, &f_finfo,
9862 FSE_ARG_STRING, slen, spath,
9863 FSE_ARG_FINFO, &s_finfo,
9864 FSE_ARG_DONE);
9865 }
9866 #endif
9867 }
9868
9869 out:
9870 if (fpath != NULL) {
9871 RELEASE_PATH(fpath);
9872 }
9873 if (spath != NULL) {
9874 RELEASE_PATH(spath);
9875 }
9876 vnode_put(svp);
9877 vnode_put(fvp);
9878 out2:
9879 return error;
9880 }
9881
9882 /*
9883 * Return (in MB) the amount of freespace on the given vnode's volume.
9884 */
9885 uint32_t freespace_mb(vnode_t vp);
9886
9887 uint32_t
9888 freespace_mb(vnode_t vp)
9889 {
9890 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
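/*
 * f_bavail is a count of f_bsize-sized blocks, so the product is
 * bytes; shifting right by 20 converts bytes to megabytes.
 */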
9891 return ((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9892 vp->v_mount->mnt_vfsstat.f_bsize) >> 20;
9893 }
9894
9895 #if CONFIG_SEARCHFS
9896
9897 /* ARGSUSED */
9898
9899 int
9900 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
9901 {
9902 vnode_t vp, tvp;
9903 int i, error = 0;
9904 int fserror = 0;
9905 struct nameidata nd;
9906 struct user64_fssearchblock searchblock;
9907 struct searchstate *state;
9908 struct attrlist *returnattrs;
9909 struct timeval timelimit;
9910 void *searchparams1, *searchparams2;
9911 uio_t auio = NULL;
9912 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9913 uint32_t nummatches;
9914 int mallocsize;
9915 uint32_t nameiflags;
9916 vfs_context_t ctx = vfs_context_current();
9917 char uio_buf[UIO_SIZEOF(1)];
9918
9919 /* Start by copying in fsearchblock parameter list */
9920 if (IS_64BIT_PROCESS(p)) {
9921 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
9922 timelimit.tv_sec = searchblock.timelimit.tv_sec;
9923 timelimit.tv_usec = searchblock.timelimit.tv_usec;
9924 } else {
9925 struct user32_fssearchblock tmp_searchblock;
9926
9927 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
9928 // munge into 64-bit version
9929 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
9930 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
9931 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
9932 searchblock.maxmatches = tmp_searchblock.maxmatches;
9933 /*
9934 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
9935 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
9936 */
9937 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
9938 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
9939 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
9940 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
9941 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
9942 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
9943 searchblock.searchattrs = tmp_searchblock.searchattrs;
9944 }
9945 if (error) {
9946 return error;
9947 }
9948
9949 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
9950 */
9951 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
9952 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
9953 return EINVAL;
9954 }
9955
9956 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
9957 /* It all has to go into local memory and it's not that big so we might as well put it all together. */
9958 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated */
9959 /* block. */
9960 /* */
9961 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
9962 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
9963 /* assumes the size is still 556 bytes it will continue to work */
9964
9965 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
9966 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
9967
9968 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
9969
9970 /* Now set up the various pointers to the correct place in our newly allocated memory */
9971
9972 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
9973 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
9974 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
9975
9976 /* Now copy in the stuff given our local variables. */
9977
9978 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
9979 goto freeandexit;
9980 }
9981
9982 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
9983 goto freeandexit;
9984 }
9985
9986 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
9987 goto freeandexit;
9988 }
9989
9990 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
9991 goto freeandexit;
9992 }
9993
9994 /*
9995 * When searching a union mount, need to set the
9996 * start flag at the first call on each layer to
9997 * reset state for the new volume.
9998 */
9999 if (uap->options & SRCHFS_START) {
10000 state->ss_union_layer = 0;
10001 } else {
10002 uap->options |= state->ss_union_flags;
10003 }
10004 state->ss_union_flags = 0;
10005
10006 /*
10007 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
10008 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
10009 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
10010 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
10011 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
10012 */
10013
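/*
 * The searchparams buffer begins with a u_int32_t word giving the
 * buffer length, followed by an attrreference_t whose attr_dataoffset
 * locates the name bytes later in the same buffer; the checks below
 * ensure that offset and attr_length stay within the caller-supplied
 * sizeofsearchparams bounds.
 */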
10014 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
10015 attrreference_t* string_ref;
10016 u_int32_t* start_length;
10017 user64_size_t param_length;
10018
10019 /* validate searchparams1 */
10020 param_length = searchblock.sizeofsearchparams1;
10021 /* skip the word that specifies length of the buffer */
10022 start_length = (u_int32_t*) searchparams1;
10023 start_length = start_length + 1;
10024 string_ref = (attrreference_t*) start_length;
10025
10026 /* ensure no negative offsets or too big offsets */
10027 if (string_ref->attr_dataoffset < 0) {
10028 error = EINVAL;
10029 goto freeandexit;
10030 }
10031 if (string_ref->attr_length > MAXPATHLEN) {
10032 error = EINVAL;
10033 goto freeandexit;
10034 }
10035
10036 /* Check for pointer overflow in the string ref */
10037 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
10038 error = EINVAL;
10039 goto freeandexit;
10040 }
10041
10042 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
10043 error = EINVAL;
10044 goto freeandexit;
10045 }
10046 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
10047 error = EINVAL;
10048 goto freeandexit;
10049 }
10050 }
10051
10052 /* set up the uio structure which will contain the users return buffer */
10053 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10054 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
10055
10056 nameiflags = 0;
10057 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10058 nameiflags |= FOLLOW;
10059 }
10060 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
10061 UIO_USERSPACE, uap->path, ctx);
10062
10063 error = namei(&nd);
10064 if (error) {
10065 goto freeandexit;
10066 }
10067 vp = nd.ni_vp;
10068 nameidone(&nd);
10069
10070 /*
10071 * Switch to the root vnode for the volume
10072 */
10073 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
10074 vnode_put(vp);
10075 if (error) {
10076 goto freeandexit;
10077 }
10078 vp = tvp;
10079
10080 /*
10081 * If it's a union mount, the path lookup takes
10082 * us to the top layer. But we may need to descend
10083 * to a lower layer. For non-union mounts the layer
10084 * is always zero.
10085 */
10086 for (i = 0; i < (int) state->ss_union_layer; i++) {
10087 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
10088 break;
10089 }
10090 tvp = vp;
10091 vp = vp->v_mount->mnt_vnodecovered;
10092 if (vp == NULL) {
10093 vnode_put(tvp);
10094 error = ENOENT;
10095 goto freeandexit;
10096 }
10097 error = vnode_getwithref(vp);
10098 vnode_put(tvp);
10099 if (error) {
10100 goto freeandexit;
10101 }
10102 }
10103
10104 #if CONFIG_MACF
10105 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
10106 if (error) {
10107 vnode_put(vp);
10108 goto freeandexit;
10109 }
10110 #endif
10111
10112
10113 /*
10114 * If searchblock.maxmatches == 0, then skip the search. This has happened
10115 * before and sometimes the underlying code doesn't deal with it well.
10116 */
10117 if (searchblock.maxmatches == 0) {
10118 nummatches = 0;
10119 goto saveandexit;
10120 }
10121
10122 /*
10123 * All right, we have everything we need, so let's make that call.
10124 *
10125 * We keep special track of the return value from the file system:
10126 * EAGAIN is an acceptable error condition that shouldn't keep us
10127 * from copying out any results...
10128 */
10129
10130 fserror = VNOP_SEARCHFS(vp,
10131 searchparams1,
10132 searchparams2,
10133 &searchblock.searchattrs,
10134 (u_long)searchblock.maxmatches,
10135 &timelimit,
10136 returnattrs,
10137 &nummatches,
10138 (u_long)uap->scriptcode,
10139 (u_long)uap->options,
10140 auio,
10141 (struct searchstate *) &state->ss_fsstate,
10142 ctx);
10143
10144 /*
10145 * If it's a union mount we need to be called again
10146 * to search the mounted-on filesystem.
10147 */
10148 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
10149 state->ss_union_flags = SRCHFS_START;
10150 state->ss_union_layer++; // search next layer down
10151 fserror = EAGAIN;
10152 }
10153
10154 saveandexit:
10155
10156 vnode_put(vp);
10157
10158 /* Now copy out the stuff that needs copying out. That means the number of matches and the
10159 * search state. Everything was already put into the return buffer by the VOP call. */
10160
10161 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
10162 goto freeandexit;
10163 }
10164
10165 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
10166 goto freeandexit;
10167 }
10168
10169 error = fserror;
10170
10171 freeandexit:
10172
10173 FREE(searchparams1, M_TEMP);
10174
10175 return error;
10176 } /* end of searchfs system call */
10177
10178 #else /* CONFIG_SEARCHFS */
10179
10180 int
10181 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
10182 {
10183 return ENOTSUP;
10184 }
10185
10186 #endif /* CONFIG_SEARCHFS */
10187
10188
10189 #if CONFIG_DATALESS_FILES
10190
10191 /*
10192 * === Namespace Resolver Up-call Mechanism ===
10193 *
10194 * When I/O is performed to a dataless file or directory (read, write,
10195 * lookup-in, etc.), the file system performs an upcall to the namespace
10196 * resolver (filecoordinationd) to materialize the object.
10197 *
10198 * We need multiple up-calls to be in flight at once, and we need these
10199 * up-calls to be interruptible, thus the following implementation:
10200 *
10201 * => The nspace_resolver_request represents the in-kernel request state.
10202 * It contains a request ID, storage space for the errno code returned
10203 * by filecoordinationd, and flags.
10204 *
10205 * => The request ID is simply a global monotonically incrementing 32-bit
10206 * number. Outstanding requests are stored in a hash table, and the
10207 * hash function is extremely simple.
10208 *
10209 * => When an upcall is to be made to filecoordinationd, a request structure
10210 * is allocated on the stack (it is small, and needs to live only during
10211 * the duration of the call to resolve_nspace_item_ext()). It is
10212 * initialized and inserted into the table. Some backpressure from
10213 * filecoordinationd is applied by limiting the number of entries that
10214 * can be inserted into the table (and thus limiting the number of
10215 * outstanding requests issued to filecoordinationd); waiting for an
10216 * available slot is interruptible.
10217 *
10218 * => Once the request has been inserted into the table, the up-call is made
10219 * to filecoordinationd via a MiG-generated stub. The up-call returns
10220 * immediately and filecoordinationd processes the request asynchronously.
10221 *
10222 * => The caller now waits for the request to complete. This is achieved by
10223 * sleeping on the address of the request structure and waiting for
10224 * filecoordinationd to mark the request structure as complete. This
10225 * is an interruptible sleep call; if interrupted, the request structure
10226 * is removed from the table and EINTR is returned to the caller. If
10227 * this occurs, an advisory up-call is made to filecoordinationd with
10228 * the request ID to indicate that the request can be aborted or
10229 * de-prioritized at the discretion of filecoordinationd.
10230 *
10231 * => When filecoordinationd has completed the request, it signals completion
10232 * by writing to the vfs.nspace.complete sysctl node. Only a process
10233 * decorated as a namespace resolver can write to this sysctl node. The
10234 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10235 * The request ID is looked up in the table, and if the request is found,
10236 * the error code is stored in the request structure and a wakeup()
10237 * issued on the address of the request structure. If the request is not
10238 * found, we simply drop the completion notification, assuming that the
10239 * caller was interrupted.
10240 *
10241 * => When the waiting thread wakes up, it extracts the error code from the
10242 * request structure, removes the request from the table, and returns the
10243 * error code to the calling function. Fini!
10244 */
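/*
 * A minimal userspace sketch of the completion write described above,
 * assuming the resolver process has already been decorated as the
 * namespace resolver; the helper and variable names are illustrative
 * only, but the tuple format and sysctl node match the description
 * above.
 */
#if 0
#include <stdint.h>
#include <sys/types.h>
#include <sys/sysctl.h>

static int
report_nspace_completion(uint32_t req_id, int resolver_errno)
{
	/* vfs.nspace.complete takes a { request ID, errno } pair of uint32_t's */
	uint32_t tuple[2] = { req_id, (uint32_t)resolver_errno };

	/* write-only update: no old value is requested */
	return sysctlbyname("vfs.nspace.complete", NULL, NULL,
	    tuple, sizeof(tuple));
}
#endif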
10245
10246 struct nspace_resolver_request {
10247 LIST_ENTRY(nspace_resolver_request) r_hashlink;
10248 uint32_t r_req_id;
10249 int r_resolver_error;
10250 int r_flags;
10251 };
10252
10253 #define RRF_COMPLETE 0x0001
10254
10255 static uint32_t
10256 next_nspace_req_id(void)
10257 {
10258 static uint32_t next_req_id;
10259
10260 return OSAddAtomic(1, &next_req_id);
10261 }
10262
10263 #define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
10264 #define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */
10265
10266 static LIST_HEAD(nspace_resolver_requesthead,
10267 nspace_resolver_request) * nspace_resolver_request_hashtbl;
10268 static u_long nspace_resolver_request_hashmask;
10269 static u_int nspace_resolver_request_count;
10270 static bool nspace_resolver_request_wait_slot;
10271 static lck_grp_t *nspace_resolver_request_lck_grp;
10272 static lck_mtx_t nspace_resolver_request_hash_mutex;
10273
10274 #define NSPACE_REQ_LOCK() \
10275 lck_mtx_lock(&nspace_resolver_request_hash_mutex)
10276 #define NSPACE_REQ_UNLOCK() \
10277 lck_mtx_unlock(&nspace_resolver_request_hash_mutex)
10278
10279 #define NSPACE_RESOLVER_HASH(req_id) \
10280 (&nspace_resolver_request_hashtbl[(req_id) & \
10281 nspace_resolver_request_hashmask])
10282
10283 static struct nspace_resolver_request *
10284 nspace_resolver_req_lookup(uint32_t req_id)
10285 {
10286 struct nspace_resolver_requesthead *bucket;
10287 struct nspace_resolver_request *req;
10288
10289 bucket = NSPACE_RESOLVER_HASH(req_id);
10290 LIST_FOREACH(req, bucket, r_hashlink) {
10291 if (req->r_req_id == req_id) {
10292 return req;
10293 }
10294 }
10295
10296 return NULL;
10297 }
10298
10299 static int
10300 nspace_resolver_req_add(struct nspace_resolver_request *req)
10301 {
10302 struct nspace_resolver_requesthead *bucket;
10303 int error;
10304
10305 while (nspace_resolver_request_count >=
10306 NSPACE_RESOLVER_MAX_OUTSTANDING) {
10307 nspace_resolver_request_wait_slot = true;
10308 error = msleep(&nspace_resolver_request_count,
10309 &nspace_resolver_request_hash_mutex,
10310 PVFS | PCATCH, "nspacerq", NULL);
10311 if (error) {
10312 return error;
10313 }
10314 }
10315
10316 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10317 #if DIAGNOSTIC
10318 assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
10319 #endif /* DIAGNOSTIC */
10320 LIST_INSERT_HEAD(bucket, req, r_hashlink);
10321 nspace_resolver_request_count++;
10322
10323 return 0;
10324 }
10325
10326 static void
10327 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10328 {
10329 struct nspace_resolver_requesthead *bucket;
10330
10331 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10332 #if DIAGNOSTIC
10333 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10334 #endif /* DIAGNOSTIC */
10335 LIST_REMOVE(req, r_hashlink);
10336 nspace_resolver_request_count--;
10337
10338 if (nspace_resolver_request_wait_slot) {
10339 nspace_resolver_request_wait_slot = false;
10340 wakeup(&nspace_resolver_request_count);
10341 }
10342 }
10343
10344 static void
10345 nspace_resolver_req_cancel(uint32_t req_id)
10346 {
10347 kern_return_t kr;
10348 mach_port_t mp;
10349
10350 // Failures here aren't fatal -- the cancellation message
10351 // sent to the resolver is merely advisory.
10352
10353 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10354 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10355 return;
10356 }
10357
10358 kr = send_nspace_resolve_cancel(mp, req_id);
10359 if (kr != KERN_SUCCESS) {
10360 os_log_error(OS_LOG_DEFAULT,
10361 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10362 }
10363
10364 ipc_port_release_send(mp);
10365 }
10366
10367 static int
10368 nspace_resolver_req_wait(struct nspace_resolver_request *req)
10369 {
10370 bool send_cancel_message = false;
10371 int error;
10372
10373 NSPACE_REQ_LOCK();
10374
10375 while ((req->r_flags & RRF_COMPLETE) == 0) {
10376 error = msleep(req, &nspace_resolver_request_hash_mutex,
10377 PVFS | PCATCH, "nspace", NULL);
10378 if (error && error != ERESTART) {
10379 req->r_resolver_error = (error == EINTR) ? EINTR :
10380 ETIMEDOUT;
10381 send_cancel_message = true;
10382 break;
10383 }
10384 }
10385
10386 nspace_resolver_req_remove(req);
10387
10388 NSPACE_REQ_UNLOCK();
10389
10390 if (send_cancel_message) {
10391 nspace_resolver_req_cancel(req->r_req_id);
10392 }
10393
10394 return req->r_resolver_error;
10395 }
10396
10397 static void
10398 nspace_resolver_req_mark_complete(
10399 struct nspace_resolver_request *req,
10400 int resolver_error)
10401 {
10402 req->r_resolver_error = resolver_error;
10403 req->r_flags |= RRF_COMPLETE;
10404 wakeup(req);
10405 }
10406
10407 static void
10408 nspace_resolver_req_completed(uint32_t req_id, int resolver_error)
10409 {
10410 struct nspace_resolver_request *req;
10411
10412 NSPACE_REQ_LOCK();
10413
10414 // If we don't find the request corresponding to our req_id,
10415 // just drop the completion signal on the floor; it's likely
10416 // that the requester interrupted with a signal.
10417
10418 req = nspace_resolver_req_lookup(req_id);
10419 if (req) {
10420 nspace_resolver_req_mark_complete(req, resolver_error);
10421 }
10422
10423 NSPACE_REQ_UNLOCK();
10424 }
10425
10426 static struct proc *nspace_resolver_proc;
10427
10428 static int
10429 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10430 {
10431 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10432 p == nspace_resolver_proc) ? 1 : 0;
10433 return 0;
10434 }
10435
10436 static int
10437 nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
10438 {
10439 vfs_context_t ctx = vfs_context_current();
10440 int error = 0;
10441
10442 //
10443 // The system filecoordinationd runs as uid == 0. This also
10444 // has the nice side-effect of filtering out filecoordinationd
10445 // running in the simulator.
10446 //
10447 if (!vfs_context_issuser(ctx)) {
10448 return EPERM;
10449 }
10450
10451 error = priv_check_cred(vfs_context_ucred(ctx),
10452 PRIV_VFS_DATALESS_RESOLVER, 0);
10453 if (error) {
10454 return error;
10455 }
10456
10457 if (is_resolver) {
10458 NSPACE_REQ_LOCK();
10459
10460 if (nspace_resolver_proc == NULL) {
10461 proc_lock(p);
10462 p->p_lflag |= P_LNSPACE_RESOLVER;
10463 proc_unlock(p);
10464 nspace_resolver_proc = p;
10465 } else {
10466 error = EBUSY;
10467 }
10468
10469 NSPACE_REQ_UNLOCK();
10470 } else {
10471 // This is basically just like the exit case.
10472 // nspace_resolver_exited() will verify that the
10473 // process is the resolver, and will clear the
10474 // global.
10475 nspace_resolver_exited(p);
10476 }
10477
10478 return error;
10479 }
10480
10481 static int
10482 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10483 {
10484 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10485 (p->p_vfs_iopolicy &
10486 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10487 *is_prevented = 1;
10488 } else {
10489 *is_prevented = 0;
10490 }
10491 return 0;
10492 }
10493
10494 static int
10495 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10496 {
10497 if (p->p_lflag & P_LNSPACE_RESOLVER) {
10498 return is_prevented ? 0 : EBUSY;
10499 }
10500
10501 if (is_prevented) {
10502 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10503 } else {
10504 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10505 }
10506 return 0;
10507 }
10508
10509 static int
10510 nspace_materialization_get_thread_state(int *is_prevented)
10511 {
10512 uthread_t ut = get_bsdthread_info(current_thread());
10513
10514 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10515 return 0;
10516 }
10517
10518 static int
10519 nspace_materialization_set_thread_state(int is_prevented)
10520 {
10521 uthread_t ut = get_bsdthread_info(current_thread());
10522
10523 if (is_prevented) {
10524 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10525 } else {
10526 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10527 }
10528 return 0;
10529 }
10530
10531 static int
10532 nspace_materialization_is_prevented(void)
10533 {
10534 proc_t p = current_proc();
10535 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
10536 vfs_context_t ctx = vfs_context_current();
10537
10538 /*
10539 * Kernel context ==> return EDEADLK, as we would with any random
10540 * process decorated as no-materialize.
10541 */
10542 if (ctx == vfs_context_kernel()) {
10543 return EDEADLK;
10544 }
10545
10546 /*
10547 * If the process has the dataless-manipulation entitlement,
10548 * materialization is prevented, and depending on the kind
10549 * of file system operation, things get to proceed as if the
10550 * object is not dataless.
10551 */
10552 if (vfs_context_is_dataless_manipulator(ctx)) {
10553 return EJUSTRETURN;
10554 }
10555
10556 /*
10557 * Per-thread decorations override any process-wide decorations.
10558 * (Foundation uses this, and this overrides even the dataless-
10559 * manipulation entitlement so as to make API contracts consistent.)
10560 */
10561 if (ut != NULL) {
10562 if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
10563 return EDEADLK;
10564 }
10565 if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
10566 return 0;
10567 }
10568 }
10569
10570 /*
10571 * If the process's iopolicy specifies that dataless files
10572 * can be materialized, then we let it go ahead.
10573 */
10574 if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
10575 return 0;
10576 }
10577
10578 /*
10579 * The default behavior is to not materialize dataless files;
10580 * return to the caller that deadlock was detected.
10581 */
10582 return EDEADLK;
10583 }
10584
10585 /* the vfs.nspace branch */
10586 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10587
10588 static int
10589 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10590 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10591 {
10592 struct proc *p = req->p;
10593 int new_value, old_value, changed = 0;
10594 int error;
10595
10596 error = nspace_resolver_get_proc_state(p, &old_value);
10597 if (error) {
10598 return error;
10599 }
10600
10601 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10602 &changed);
10603 if (error == 0 && changed) {
10604 error = nspace_resolver_set_proc_state(p, new_value);
10605 }
10606 return error;
10607 }
10608
10609 /* decorate this process as the dataless file resolver */
10610 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10611 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10612 0, 0, sysctl_nspace_resolver, "I", "");
10613
10614 static int
10615 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10616 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10617 {
10618 struct proc *p = req->p;
10619 int new_value, old_value, changed = 0;
10620 int error;
10621
10622 error = nspace_materialization_get_proc_state(p, &old_value);
10623 if (error) {
10624 return error;
10625 }
10626
10627 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10628 &changed);
10629 if (error == 0 && changed) {
10630 error = nspace_materialization_set_proc_state(p, new_value);
10631 }
10632 return error;
10633 }
10634
10635 /* decorate this process as not wanting to materialize dataless files */
10636 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10637 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10638 0, 0, sysctl_nspace_prevent_materialization, "I", "");
10639
10640 static int
10641 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10642 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10643 {
10644 int new_value, old_value, changed = 0;
10645 int error;
10646
10647 error = nspace_materialization_get_thread_state(&old_value);
10648 if (error) {
10649 return error;
10650 }
10651
10652 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10653 &changed);
10654 if (error == 0 && changed) {
10655 error = nspace_materialization_set_thread_state(new_value);
10656 }
10657 return error;
10658 }
10659
10660 /* decorate this thread as not wanting to materialize dataless files */
10661 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10662 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10663 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
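/*
 * Illustrative sketch (userspace, not part of this file's build): a
 * process that must never trigger materialization of dataless files can
 * set the per-process decoration exposed above, assuming the standard
 * sysctlbyname(3) interface; the thread-level node is used the same way.
 *
 *	#include <sys/sysctl.h>
 *
 *	static int
 *	prevent_dataless_materialization(void)
 *	{
 *		int one = 1;	// 1 = prevented, 0 = allowed
 *		return sysctlbyname("vfs.nspace.prevent_materialization",
 *		    NULL, NULL, &one, sizeof(one));
 *	}
 */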
10664
10665 static int
10666 sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
10667 __unused int arg2, struct sysctl_req *req)
10668 {
10669 struct proc *p = req->p;
10670 uint32_t req_status[2] = { 0, 0 };
10671 int error, is_resolver, changed = 0;
10672
10673 error = nspace_resolver_get_proc_state(p, &is_resolver);
10674 if (error) {
10675 return error;
10676 }
10677
10678 if (!is_resolver) {
10679 return EPERM;
10680 }
10681
10682 error = sysctl_io_opaque(req, req_status, sizeof(req_status),
10683 &changed);
10684 if (error) {
10685 return error;
10686 }
10687
10688 /*
10689 * req_status[0] is the req_id
10690 *
10691 * req_status[1] is the errno
10692 */
10693 if (error == 0 && changed) {
10694 nspace_resolver_req_completed(req_status[0],
10695 (int)req_status[1]);
10696 }
10697 return error;
10698 }
10699
10700 /* Resolver reports completed reqs here. */
10701 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
10702 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10703 0, 0, sysctl_nspace_complete, "-", "");
10704
10705 #endif /* CONFIG_DATALESS_FILES */
10706
10707 #if CONFIG_DATALESS_FILES
10708 #define __no_dataless_unused /* nothing */
10709 #else
10710 #define __no_dataless_unused __unused
10711 #endif
10712
10713 void
10714 nspace_resolver_init(void)
10715 {
10716 #if CONFIG_DATALESS_FILES
10717 nspace_resolver_request_lck_grp =
10718 lck_grp_alloc_init("file namespace resolver", NULL);
10719
10720 lck_mtx_init(&nspace_resolver_request_hash_mutex,
10721 nspace_resolver_request_lck_grp, NULL);
10722
10723 nspace_resolver_request_hashtbl =
10724 hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
10725 M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
10726 #endif /* CONFIG_DATALESS_FILES */
10727 }
10728
10729 void
10730 nspace_resolver_exited(struct proc *p __no_dataless_unused)
10731 {
10732 #if CONFIG_DATALESS_FILES
10733 struct nspace_resolver_requesthead *bucket;
10734 struct nspace_resolver_request *req;
10735 u_long idx;
10736
10737 NSPACE_REQ_LOCK();
10738
10739 if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10740 p == nspace_resolver_proc) {
10741 for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
10742 bucket = &nspace_resolver_request_hashtbl[idx];
10743 LIST_FOREACH(req, bucket, r_hashlink) {
10744 nspace_resolver_req_mark_complete(req,
10745 ETIMEDOUT);
10746 }
10747 }
10748 nspace_resolver_proc = NULL;
10749 }
10750
10751 NSPACE_REQ_UNLOCK();
10752 #endif /* CONFIG_DATALESS_FILES */
10753 }
10754
10755 int
10756 resolve_nspace_item(struct vnode *vp, uint64_t op)
10757 {
10758 return resolve_nspace_item_ext(vp, op, NULL);
10759 }
10760
10761 #define DATALESS_RESOLVER_ENTITLEMENT \
10762 "com.apple.private.vfs.dataless-resolver"
10763 #define DATALESS_MANIPULATION_ENTITLEMENT \
10764 "com.apple.private.vfs.dataless-manipulation"
10765
10766 /*
10767 * Return TRUE if the vfs context is associated with a process entitled
10768 * for dataless manipulation.
10769 *
10770 * XXX Arguably belongs in vfs_subr.c, but is here because of the
10771 * complication around CONFIG_DATALESS_FILES.
10772 */
10773 boolean_t
10774 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
10775 {
10776 #if CONFIG_DATALESS_FILES
10777 assert(ctx->vc_thread == current_thread());
10778 task_t const task = current_task();
10779 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
10780 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
10781 #else
10782 return false;
10783 #endif /* CONFIG_DATALESS_FILES */
10784 }
10785
10786 int
10787 resolve_nspace_item_ext(
10788 struct vnode *vp __no_dataless_unused,
10789 uint64_t op __no_dataless_unused,
10790 void *arg __unused)
10791 {
10792 #if CONFIG_DATALESS_FILES
10793 int error;
10794 mach_port_t mp;
10795 char *path = NULL;
10796 int path_len;
10797 kern_return_t kr;
10798 struct nspace_resolver_request req;
10799
10800 // only allow namespace events on regular files, directories and symlinks.
10801 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
10802 return EFTYPE;
10803 }
10804
10805 //
10806 // if this is a snapshot event and the vnode is on a
10807 // disk image just pretend nothing happened since any
10808 // change to the disk image will cause the disk image
10809 // itself to get backed up and this avoids multi-way
10810 // deadlocks between the snapshot handler and the ever
10811 // popular diskimages-helper process. the variable
10812 // nspace_allow_virtual_devs allows this behavior to
10813 // be overridden (for use by the Mobile TimeMachine
10814 // testing infrastructure which uses disk images)
10815 //
10816 if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
10817 os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
10818 return ENOTSUP;
10819 }
10820
10821 error = nspace_materialization_is_prevented();
10822 if (error) {
10823 os_log_debug(OS_LOG_DEFAULT,
10824 "NSPACE process/thread is decorated as no-materialization");
10825 return error;
10826 }
10827
10828 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10829 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10830 os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
10831 // Treat this like being unable to access the backing
10832 // store server.
10833 return ETIMEDOUT;
10834 }
10835
10836 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
10837 if (path == NULL) {
10838 error = ENOMEM;
10839 goto out_release_port;
10840 }
10841 path_len = MAXPATHLEN;
10842
10843 error = vn_getpath(vp, path, &path_len);
10844 if (error == 0) {
10845 int xxx_rdar44371223; /* XXX Mig bug */
10846 req.r_req_id = next_nspace_req_id();
10847 req.r_resolver_error = 0;
10848 req.r_flags = 0;
10849
10850 NSPACE_REQ_LOCK();
10851 error = nspace_resolver_req_add(&req);
10852 NSPACE_REQ_UNLOCK();
10853 if (error) {
10854 goto out_release_port;
10855 }
10856
10857 os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
10858 kr = send_nspace_resolve_path(mp, req.r_req_id,
10859 current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
10860 path, &xxx_rdar44371223);
10861 if (kr != KERN_SUCCESS) {
10862 // Also treat this like being unable to access
10863 // the backing store server.
10864 os_log_error(OS_LOG_DEFAULT,
10865 "NSPACE resolve_path failure: %d", kr);
10866 error = ETIMEDOUT;
10867
10868 NSPACE_REQ_LOCK();
10869 nspace_resolver_req_remove(&req);
10870 NSPACE_REQ_UNLOCK();
10871 goto out_release_port;
10872 }
10873
10874 // Give back the memory we allocated earlier while
10875 // we wait; we no longer need it.
10876 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
10877 path = NULL;
10878
10879 // Request has been submitted to the resolver.
10880 // Now (interruptibly) wait for completion.
10881 // Upon return, the request will have been removed
10882 // from the lookup table.
10883 error = nspace_resolver_req_wait(&req);
10884 }
10885
10886 out_release_port:
10887 if (path != NULL) {
10888 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
10889 }
10890 ipc_port_release_send(mp);
10891
10892 return error;
10893 #else
10894 return ENOTSUP;
10895 #endif /* CONFIG_DATALESS_FILES */
10896 }
10897
10898 int
10899 nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
10900 __unused uint64_t op_type, __unused void *arg)
10901 {
10902 return 0;
10903 }
10904
10905 #if 0
10906 static int
10907 build_volfs_path(struct vnode *vp, char *path, int *len)
10908 {
10909 struct vnode_attr va;
10910 int ret;
10911
10912 VATTR_INIT(&va);
10913 VATTR_WANTED(&va, va_fsid);
10914 VATTR_WANTED(&va, va_fileid);
10915
10916 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
10917 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
10918 ret = -1;
10919 } else {
10920 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
10921 ret = 0;
10922 }
10923
10924 return ret;
10925 }
10926 #endif
10927
10928 static unsigned long
10929 fsctl_bogus_command_compat(unsigned long cmd)
10930 {
10931 switch (cmd) {
10932 case IOCBASECMD(FSIOC_SYNC_VOLUME):
10933 return FSIOC_SYNC_VOLUME;
10934 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10935 return FSIOC_ROUTEFS_SETROUTEID;
10936 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10937 return FSIOC_SET_PACKAGE_EXTS;
10938 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10939 return FSIOC_SET_FSTYPENAME_OVERRIDE;
10940 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10941 return DISK_CONDITIONER_IOC_GET;
10942 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10943 return DISK_CONDITIONER_IOC_SET;
10944 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10945 return FSIOC_FIOSEEKHOLE;
10946 case IOCBASECMD(FSIOC_FIOSEEKDATA):
10947 return FSIOC_FIOSEEKDATA;
10948 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10949 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
10950 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
10951 return SPOTLIGHT_IOC_GET_LAST_MTIME;
10952 }
10953
10954 return cmd;
10955 }
10956
10957 static int
10958 cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
10959 {
10960 return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
10961 }
10962
10963 /*
10964 * Make a filesystem-specific control call:
10965 */
10966 /* ARGSUSED */
10967 static int
10968 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
10969 {
10970 int error = 0;
10971 boolean_t is64bit;
10972 u_int size;
10973 #define STK_PARAMS 128
10974 char stkbuf[STK_PARAMS] = {0};
10975 caddr_t data, memp;
10976 vnode_t vp = *arg_vp;
10977
10978 if (vp->v_type == VCHR || vp->v_type == VBLK) {
10979 return ENOTTY;
10980 }
10981
10982 cmd = fsctl_bogus_command_compat(cmd);
10983
10984 size = IOCPARM_LEN(cmd);
10985 if (size > IOCPARM_MAX) {
10986 return EINVAL;
10987 }
10988
10989 is64bit = proc_is64bit(p);
10990
10991 memp = NULL;
10992
10993 if (size > sizeof(stkbuf)) {
10994 if ((memp = (caddr_t)kalloc(size)) == 0) {
10995 return ENOMEM;
10996 }
10997 data = memp;
10998 } else {
10999 data = &stkbuf[0];
11000 };
11001
11002 if (cmd & IOC_IN) {
11003 if (size) {
11004 error = copyin(udata, data, size);
11005 if (error) {
11006 if (memp) {
11007 kfree(memp, size);
11008 }
11009 return error;
11010 }
11011 } else {
11012 if (is64bit) {
11013 *(user_addr_t *)data = udata;
11014 } else {
11015 *(uint32_t *)data = (uint32_t)udata;
11016 }
11017 };
11018 } else if ((cmd & IOC_OUT) && size) {
11019 /*
11020 * Zero the buffer so the user always
11021 * gets back something deterministic.
11022 */
11023 bzero(data, size);
11024 } else if (cmd & IOC_VOID) {
11025 if (is64bit) {
11026 *(user_addr_t *)data = udata;
11027 } else {
11028 *(uint32_t *)data = (uint32_t)udata;
11029 }
11030 }
11031
11032 /* Check to see if it's a generic command */
11033 switch (cmd) {
11034 case FSIOC_SYNC_VOLUME: {
11035 struct vfs_attr vfa;
11036 mount_t mp = vp->v_mount;
11037 unsigned arg;
11038
11039
11040 /* record vid of vp so we can drop it below. */
11041 uint32_t vvid = vp->v_id;
11042
11043 /*
11044 * Then grab mount_iterref so that we can release the vnode.
11045 * Without this, a thread may call vnode_iterate_prepare then
11046 * get into a deadlock because we've never released the root vp
11047 */
11048 error = mount_iterref(mp, 0);
11049 if (error) {
11050 break;
11051 }
11052 vnode_put(vp);
11053
11054 arg = MNT_NOWAIT;
11055 if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
11056 arg = MNT_WAIT;
11057 }
11058
11059 /*
11060 * If the filesystem supports multiple file systems in a
11061 * partition (e.g. APFS volumes in a container), it knows
11062 * that the waitfor argument to VFS_SYNC is a set of flags.
11063 */
11064 VFSATTR_INIT(&vfa);
11065 VFSATTR_WANTED(&vfa, f_capabilities);
11066 if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
11067 VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
11068 ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
11069 ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
11070 arg |= MNT_VOLUME;
11071 }
11072
11073 /* issue the sync for this volume */
11074 (void)sync_callback(mp, &arg);
11075
11076 /*
11077 * Then release the mount_iterref once we're done syncing; it's not
11078 * needed for the VNOP_IOCTL below
11079 */
11080 mount_iterdrop(mp);
11081
11082 if (arg & FSCTL_SYNC_FULLSYNC) {
11083 /* re-obtain vnode iocount on the root vp, if possible */
11084 error = vnode_getwithvid(vp, vvid);
11085 if (error == 0) {
11086 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
11087 vnode_put(vp);
11088 }
11089 }
11090 /* mark the argument VP as having been released */
11091 *arg_vp = NULL;
11092 }
11093 break;
11094
11095 case FSIOC_ROUTEFS_SETROUTEID: {
11096 #if ROUTEFS
11097 char routepath[MAXPATHLEN];
11098 size_t len = 0;
11099
11100 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11101 break;
11102 }
11103 bzero(routepath, MAXPATHLEN);
11104 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11105 if (error) {
11106 break;
11107 }
11108 error = routefs_kernel_mount(routepath);
11109 if (error) {
11110 break;
11111 }
11112 #endif
11113 }
11114 break;
11115
11116 case FSIOC_SET_PACKAGE_EXTS: {
11117 user_addr_t ext_strings;
11118 uint32_t num_entries;
11119 uint32_t max_width;
11120
11121 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
11122 break;
11123 }
11124
11125 if ((is64bit && size != sizeof(user64_package_ext_info))
11126 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
11127 // either you're 64-bit and passed a 64-bit struct or
11128 // you're 32-bit and passed a 32-bit struct. otherwise
11129 // it's not ok.
11130 error = EINVAL;
11131 break;
11132 }
11133
11134 if (is64bit) {
11135 ext_strings = ((user64_package_ext_info *)data)->strings;
11136 num_entries = ((user64_package_ext_info *)data)->num_entries;
11137 max_width = ((user64_package_ext_info *)data)->max_width;
11138 } else {
11139 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
11140 num_entries = ((user32_package_ext_info *)data)->num_entries;
11141 max_width = ((user32_package_ext_info *)data)->max_width;
11142 }
11143 error = set_package_extensions_table(ext_strings, num_entries, max_width);
11144 }
11145 break;
11146
11147 case FSIOC_SET_FSTYPENAME_OVERRIDE:
11148 {
11149 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11150 break;
11151 }
11152 if (vp->v_mount) {
11153 mount_lock(vp->v_mount);
11154 if (data[0] != 0) {
11155 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
11156 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
11157 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11158 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
11159 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
11160 }
11161 } else {
11162 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
11163 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
11164 }
11165 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
11166 vp->v_mount->fstypename_override[0] = '\0';
11167 }
11168 mount_unlock(vp->v_mount);
11169 }
11170 }
11171 break;
11172
11173 case DISK_CONDITIONER_IOC_GET: {
11174 error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
11175 }
11176 break;
11177
11178 case DISK_CONDITIONER_IOC_SET: {
11179 error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
11180 }
11181 break;
11182
11183 case FSIOC_CAS_BSDFLAGS: {
11184 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11185 struct vnode_attr va;
11186
11187 VATTR_INIT(&va);
11188 VATTR_SET(&va, va_flags, cas->new_flags);
11189
11190 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11191 }
11192 break;
11193
11194 case FSIOC_FD_ONLY_OPEN_ONCE: {
11195 if (vnode_usecount(vp) > 1) {
11196 error = EBUSY;
11197 } else {
11198 error = 0;
11199 }
11200 }
11201 break;
11202
11203 default: {
11204 /* other, known commands shouldn't be passed down here */
11205 switch (cmd) {
11206 case F_PUNCHHOLE:
11207 case F_TRIM_ACTIVE_FILE:
11208 case F_RDADVISE:
11209 case F_TRANSCODEKEY:
11210 case F_GETPROTECTIONLEVEL:
11211 case F_GETDEFAULTPROTLEVEL:
11212 case F_MAKECOMPRESSED:
11213 case F_SET_GREEDY_MODE:
11214 case F_SETSTATICCONTENT:
11215 case F_SETIOTYPE:
11216 case F_SETBACKINGSTORE:
11217 case F_GETPATH_MTMINFO:
11218 case APFSIOC_REVERT_TO_SNAPSHOT:
11219 case FSIOC_FIOSEEKHOLE:
11220 case FSIOC_FIOSEEKDATA:
11221 case HFS_GET_BOOT_INFO:
11222 case HFS_SET_BOOT_INFO:
11223 case FIOPINSWAP:
11224 case F_CHKCLEAN:
11225 case F_FULLFSYNC:
11226 case F_BARRIERFSYNC:
11227 case F_FREEZE_FS:
11228 case F_THAW_FS:
11229 error = EINVAL;
11230 goto outdrop;
11231 }
11232 /* Invoke the filesystem-specific code */
11233 error = VNOP_IOCTL(vp, cmd, data, options, ctx);
11234 }
11235 } /* end switch stmt */
11236
11237 /*
11238 * if no errors, copy any data to user. Size was
11239 * already set and checked above.
11240 */
11241 if (error == 0 && (cmd & IOC_OUT) && size) {
11242 error = copyout(data, udata, size);
11243 }
11244
11245 outdrop:
11246 if (memp) {
11247 kfree(memp, size);
11248 }
11249
11250 return error;
11251 }
11252
11253 /* ARGSUSED */
11254 int
11255 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
11256 {
11257 int error;
11258 struct nameidata nd;
11259 u_long nameiflags;
11260 vnode_t vp = NULL;
11261 vfs_context_t ctx = vfs_context_current();
11262
11263 AUDIT_ARG(cmd, uap->cmd);
11264 AUDIT_ARG(value32, uap->options);
11265 /* Get the vnode for the file we are getting info on: */
11266 nameiflags = 0;
11267 //
11268 // if we come through fsctl() then the file is by definition not open.
11269 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
11270 // lest the caller mistakenly think the only open is their own (but in
11271 // reality it's someone else's).
11272 //
11273 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
11274 return EINVAL;
11275 }
11276 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11277 nameiflags |= FOLLOW;
11278 }
11279 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
11280 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
11281 }
11282 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
11283 UIO_USERSPACE, uap->path, ctx);
11284 if ((error = namei(&nd))) {
11285 goto done;
11286 }
11287 vp = nd.ni_vp;
11288 nameidone(&nd);
11289
11290 #if CONFIG_MACF
11291 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
11292 if (error) {
11293 goto done;
11294 }
11295 #endif
11296
11297 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11298
11299 done:
11300 if (vp) {
11301 vnode_put(vp);
11302 }
11303 return error;
11304 }
11305 /* ARGSUSED */
11306 int
11307 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
11308 {
11309 int error;
11310 vnode_t vp = NULL;
11311 vfs_context_t ctx = vfs_context_current();
11312 int fd = -1;
11313
11314 AUDIT_ARG(fd, uap->fd);
11315 AUDIT_ARG(cmd, uap->cmd);
11316 AUDIT_ARG(value32, uap->options);
11317
11318 /* Get the vnode for the file we are getting info on: */
11319 if ((error = file_vnode(uap->fd, &vp))) {
11320 return error;
11321 }
11322 fd = uap->fd;
11323 if ((error = vnode_getwithref(vp))) {
11324 file_drop(fd);
11325 return error;
11326 }
11327
11328 #if CONFIG_MACF
11329 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
11330 file_drop(fd);
11331 vnode_put(vp);
11332 return error;
11333 }
11334 #endif
11335
11336 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11337
11338 file_drop(fd);
11339
11340 /* validate vp; fsctl_internal() can drop iocount and reset vp to NULL */
11341 if (vp) {
11342 vnode_put(vp);
11343 }
11344
11345 return error;
11346 }
11347 /* end of fsctl system call */
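/*
 * Illustrative sketch (userspace, not part of this file's build): issuing
 * a volume sync through the fsctl(2) path handled above.  This assumes
 * the fsctl() wrapper and the FSIOC_SYNC_VOLUME / FSCTL_SYNC_WAIT
 * definitions exported by <sys/fsctl.h>.
 *
 *	#include <sys/fsctl.h>
 *
 *	static int
 *	sync_volume(const char *any_path_on_volume)
 *	{
 *		uint32_t flags = FSCTL_SYNC_WAIT;	// wait for completion
 *		return fsctl(any_path_on_volume, FSIOC_SYNC_VOLUME,
 *		    &flags, 0);
 *	}
 */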
11348
11349 /*
11350 * Retrieve the data of an extended attribute.
11351 */
11352 int
11353 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
11354 {
11355 vnode_t vp;
11356 struct nameidata nd;
11357 char attrname[XATTR_MAXNAMELEN + 1];
11358 vfs_context_t ctx = vfs_context_current();
11359 uio_t auio = NULL;
11360 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11361 size_t attrsize = 0;
11362 size_t namelen;
11363 u_int32_t nameiflags;
11364 int error;
11365 char uio_buf[UIO_SIZEOF(1)];
11366
11367 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11368 return EINVAL;
11369 }
11370
11371 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11372 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
11373 if ((error = namei(&nd))) {
11374 return error;
11375 }
11376 vp = nd.ni_vp;
11377 nameidone(&nd);
11378
11379 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11380 if (error != 0) {
11381 goto out;
11382 }
11383 if (xattr_protected(attrname)) {
11384 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
11385 error = EPERM;
11386 goto out;
11387 }
11388 }
11389 /*
11390 * the specific check for 0xffffffff is a hack to preserve
11391 * binary compatibility in K64 with applications that discovered
11392 * that passing in a buf pointer and a size of -1 resulted in
11393 * just the size of the indicated extended attribute being returned.
11394 * this isn't part of the documented behavior, but because of the
11395 * original implementation's check for "uap->size > 0", this behavior
11396 * was allowed. In K32 that check turned into a signed comparison
11397 * even though uap->size is unsigned... in K64, we blow by that
11398 * check because uap->size is unsigned and doesn't get sign smeared
11399 * in the munger for a 32 bit user app. we also need to add a
11400 * check to limit the maximum size of the buffer being passed in...
11401 * unfortunately, the underlying filesystems seem to just malloc
11402 * the requested size even if the actual extended attribute is tiny.
11403 * because that malloc is for kernel wired memory, we have to put a
11404 * sane limit on it.
11405 *
11406 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
11407 * U64 running on K64 will yield -1 (64 bits wide)
11408 * U32/U64 running on K32 will yield -1 (32 bits wide)
11409 */
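/*
 * Illustrative sketch (userspace, not part of this file's build): the
 * size-only query described above, assuming the getxattr(2) wrapper.
 * Passing a NULL value buffer with size 0 is the supported way to ask
 * for just the attribute size; the 0xffffffff / (size_t)-1 path below
 * exists only for the legacy callers noted in the comment.
 *
 *	// attribute name is hypothetical
 *	ssize_t attr_size = getxattr(path, "com.example.attr",
 *	    NULL, 0, 0, 0);
 */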
11410 if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
11411 goto no_uio;
11412 }
11413
11414 if (uap->value) {
11415 if (uap->size > (size_t)XATTR_MAXSIZE) {
11416 uap->size = XATTR_MAXSIZE;
11417 }
11418
11419 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11420 &uio_buf[0], sizeof(uio_buf));
11421 uio_addiov(auio, uap->value, uap->size);
11422 }
11423 no_uio:
11424 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
11425 out:
11426 vnode_put(vp);
11427
11428 if (auio) {
11429 *retval = uap->size - uio_resid(auio);
11430 } else {
11431 *retval = (user_ssize_t)attrsize;
11432 }
11433
11434 return error;
11435 }
11436
11437 /*
11438 * Retrieve the data of an extended attribute.
11439 */
11440 int
11441 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
11442 {
11443 vnode_t vp;
11444 char attrname[XATTR_MAXNAMELEN + 1];
11445 uio_t auio = NULL;
11446 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11447 size_t attrsize = 0;
11448 size_t namelen;
11449 int error;
11450 char uio_buf[UIO_SIZEOF(1)];
11451
11452 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11453 return EINVAL;
11454 }
11455
11456 if ((error = file_vnode(uap->fd, &vp))) {
11457 return error;
11458 }
11459 if ((error = vnode_getwithref(vp))) {
11460 file_drop(uap->fd);
11461 return error;
11462 }
11463 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11464 if (error != 0) {
11465 goto out;
11466 }
11467 if (xattr_protected(attrname)) {
11468 error = EPERM;
11469 goto out;
11470 }
11471 if (uap->value && uap->size > 0) {
11472 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11473 &uio_buf[0], sizeof(uio_buf));
11474 uio_addiov(auio, uap->value, uap->size);
11475 }
11476
11477 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
11478 out:
11479 (void)vnode_put(vp);
11480 file_drop(uap->fd);
11481
11482 if (auio) {
11483 *retval = uap->size - uio_resid(auio);
11484 } else {
11485 *retval = (user_ssize_t)attrsize;
11486 }
11487 return error;
11488 }
11489
11490 /*
11491 * Set the data of an extended attribute.
11492 */
11493 int
11494 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
11495 {
11496 vnode_t vp;
11497 struct nameidata nd;
11498 char attrname[XATTR_MAXNAMELEN + 1];
11499 vfs_context_t ctx = vfs_context_current();
11500 uio_t auio = NULL;
11501 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11502 size_t namelen;
11503 u_int32_t nameiflags;
11504 int error;
11505 char uio_buf[UIO_SIZEOF(1)];
11506
11507 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11508 return EINVAL;
11509 }
11510
11511 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11512 if (error != 0) {
11513 if (error == EPERM) {
11514 /* if the string won't fit in attrname, copyinstr emits EPERM */
11515 return ENAMETOOLONG;
11516 }
11517 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11518 return error;
11519 }
11520 if (xattr_protected(attrname)) {
11521 return EPERM;
11522 }
11523 if (uap->size != 0 && uap->value == 0) {
11524 return EINVAL;
11525 }
11526
11527 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11528 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
11529 if ((error = namei(&nd))) {
11530 return error;
11531 }
11532 vp = nd.ni_vp;
11533 nameidone(&nd);
11534
11535 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11536 &uio_buf[0], sizeof(uio_buf));
11537 uio_addiov(auio, uap->value, uap->size);
11538
11539 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
11540 #if CONFIG_FSE
11541 if (error == 0) {
11542 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11543 FSE_ARG_VNODE, vp,
11544 FSE_ARG_DONE);
11545 }
11546 #endif
11547 vnode_put(vp);
11548 *retval = 0;
11549 return error;
11550 }
11551
11552 /*
11553 * Set the data of an extended attribute.
11554 */
11555 int
11556 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
11557 {
11558 vnode_t vp;
11559 char attrname[XATTR_MAXNAMELEN + 1];
11560 uio_t auio = NULL;
11561 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11562 size_t namelen;
11563 int error;
11564 char uio_buf[UIO_SIZEOF(1)];
11565 #if CONFIG_FSE
11566 vfs_context_t ctx = vfs_context_current();
11567 #endif
11568
11569 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11570 return EINVAL;
11571 }
11572
11573 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11574 if (error != 0) {
11575 if (error == EPERM) {
11576 /* if the string won't fit in attrname, copyinstr emits EPERM */
11577 return ENAMETOOLONG;
11578 }
11579 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11580 return error;
11581 }
11582 if (xattr_protected(attrname)) {
11583 return EPERM;
11584 }
11585 if (uap->size != 0 && uap->value == 0) {
11586 return EINVAL;
11587 }
11588 if ((error = file_vnode(uap->fd, &vp))) {
11589 return error;
11590 }
11591 if ((error = vnode_getwithref(vp))) {
11592 file_drop(uap->fd);
11593 return error;
11594 }
11595 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11596 &uio_buf[0], sizeof(uio_buf));
11597 uio_addiov(auio, uap->value, uap->size);
11598
11599 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
11600 #if CONFIG_FSE
11601 if (error == 0) {
11602 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11603 FSE_ARG_VNODE, vp,
11604 FSE_ARG_DONE);
11605 }
11606 #endif
11607 vnode_put(vp);
11608 file_drop(uap->fd);
11609 *retval = 0;
11610 return error;
11611 }
11612
11613 /*
11614 * Remove an extended attribute.
11615 * XXX Code duplication here.
11616 */
11617 int
11618 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
11619 {
11620 vnode_t vp;
11621 struct nameidata nd;
11622 char attrname[XATTR_MAXNAMELEN + 1];
11623 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11624 vfs_context_t ctx = vfs_context_current();
11625 size_t namelen;
11626 u_int32_t nameiflags;
11627 int error;
11628
11629 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11630 return EINVAL;
11631 }
11632
11633 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11634 if (error != 0) {
11635 return error;
11636 }
11637 if (xattr_protected(attrname)) {
11638 return EPERM;
11639 }
11640 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11641 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
11642 if ((error = namei(&nd))) {
11643 return error;
11644 }
11645 vp = nd.ni_vp;
11646 nameidone(&nd);
11647
11648 error = vn_removexattr(vp, attrname, uap->options, ctx);
11649 #if CONFIG_FSE
11650 if (error == 0) {
11651 add_fsevent(FSE_XATTR_REMOVED, ctx,
11652 FSE_ARG_VNODE, vp,
11653 FSE_ARG_DONE);
11654 }
11655 #endif
11656 vnode_put(vp);
11657 *retval = 0;
11658 return error;
11659 }
11660
11661 /*
11662 * Remove an extended attribute.
11663 * XXX Code duplication here.
11664 */
11665 int
11666 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
11667 {
11668 vnode_t vp;
11669 char attrname[XATTR_MAXNAMELEN + 1];
11670 size_t namelen;
11671 int error;
11672 #if CONFIG_FSE
11673 vfs_context_t ctx = vfs_context_current();
11674 #endif
11675
11676 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11677 return EINVAL;
11678 }
11679
11680 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11681 if (error != 0) {
11682 return error;
11683 }
11684 if (xattr_protected(attrname)) {
11685 return EPERM;
11686 }
11687 if ((error = file_vnode(uap->fd, &vp))) {
11688 return error;
11689 }
11690 if ((error = vnode_getwithref(vp))) {
11691 file_drop(uap->fd);
11692 return error;
11693 }
11694
11695 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
11696 #if CONFIG_FSE
11697 if (error == 0) {
11698 add_fsevent(FSE_XATTR_REMOVED, ctx,
11699 FSE_ARG_VNODE, vp,
11700 FSE_ARG_DONE);
11701 }
11702 #endif
11703 vnode_put(vp);
11704 file_drop(uap->fd);
11705 *retval = 0;
11706 return error;
11707 }
11708
11709 /*
11710 * Retrieve the list of extended attribute names.
11711 * XXX Code duplication here.
11712 */
11713 int
11714 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
11715 {
11716 vnode_t vp;
11717 struct nameidata nd;
11718 vfs_context_t ctx = vfs_context_current();
11719 uio_t auio = NULL;
11720 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11721 size_t attrsize = 0;
11722 u_int32_t nameiflags;
11723 int error;
11724 char uio_buf[UIO_SIZEOF(1)];
11725
11726 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11727 return EINVAL;
11728 }
11729
11730 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11731 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
11732 if ((error = namei(&nd))) {
11733 return error;
11734 }
11735 vp = nd.ni_vp;
11736 nameidone(&nd);
11737 if (uap->namebuf != 0 && uap->bufsize > 0) {
11738 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
11739 &uio_buf[0], sizeof(uio_buf));
11740 uio_addiov(auio, uap->namebuf, uap->bufsize);
11741 }
11742
11743 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
11744
11745 vnode_put(vp);
11746 if (auio) {
11747 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11748 } else {
11749 *retval = (user_ssize_t)attrsize;
11750 }
11751 return error;
11752 }
11753
11754 /*
11755 * Retrieve the list of extended attribute names.
11756 * XXX Code duplication here.
11757 */
11758 int
11759 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11760 {
11761 vnode_t vp;
11762 uio_t auio = NULL;
11763 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11764 size_t attrsize = 0;
11765 int error;
11766 char uio_buf[UIO_SIZEOF(1)];
11767
11768 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11769 return EINVAL;
11770 }
11771
11772 if ((error = file_vnode(uap->fd, &vp))) {
11773 return error;
11774 }
11775 if ((error = vnode_getwithref(vp))) {
11776 file_drop(uap->fd);
11777 return error;
11778 }
11779 if (uap->namebuf != 0 && uap->bufsize > 0) {
11780 auio = uio_createwithbuffer(1, 0, spacetype,
11781 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11782 uio_addiov(auio, uap->namebuf, uap->bufsize);
11783 }
11784
11785 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11786
11787 vnode_put(vp);
11788 file_drop(uap->fd);
11789 if (auio) {
11790 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11791 } else {
11792 *retval = (user_ssize_t)attrsize;
11793 }
11794 return error;
11795 }
11796
11797 static int
11798 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
11799 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
11800 {
11801 int error;
11802 struct mount *mp = NULL;
11803 vnode_t vp;
11804 int length;
11805 int bpflags;
11806 /* maximum number of times to retry build_path */
11807 unsigned int retries = 0x10;
11808
11809 if (bufsize > PAGE_SIZE) {
11810 return EINVAL;
11811 }
11812
11813 if (buf == NULL) {
11814 return ENOMEM;
11815 }
11816
11817 retry:
11818 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11819 error = ENOTSUP; /* unexpected failure */
11820 return ENOTSUP;
11821 }
11822
11823 unionget:
11824 if (objid == 2) {
11825 struct vfs_attr vfsattr;
11826 int use_vfs_root = TRUE;
11827
11828 VFSATTR_INIT(&vfsattr);
11829 VFSATTR_WANTED(&vfsattr, f_capabilities);
11830 if (!(options & FSOPT_ISREALFSID) &&
11831 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
11832 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
11833 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
11834 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
11835 use_vfs_root = FALSE;
11836 }
11837 }
11838
11839 if (use_vfs_root) {
11840 error = VFS_ROOT(mp, &vp, ctx);
11841 } else {
11842 error = VFS_VGET(mp, objid, &vp, ctx);
11843 }
11844 } else {
11845 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11846 }
11847
11848 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11849 /*
11850 * If the fileid isn't found and we're in a union
11851 * mount volume, then see if the fileid is in the
11852 * mounted-on volume.
11853 */
11854 struct mount *tmp = mp;
11855 mp = vnode_mount(tmp->mnt_vnodecovered);
11856 vfs_unbusy(tmp);
11857 if (vfs_busy(mp, LK_NOWAIT) == 0) {
11858 goto unionget;
11859 }
11860 } else {
11861 vfs_unbusy(mp);
11862 }
11863
11864 if (error) {
11865 return error;
11866 }
11867
11868 #if CONFIG_MACF
11869 error = mac_vnode_check_fsgetpath(ctx, vp);
11870 if (error) {
11871 vnode_put(vp);
11872 return error;
11873 }
11874 #endif
11875
11876 /* Obtain the absolute path to this vnode. */
11877 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11878 if (options & FSOPT_NOFIRMLINKPATH) {
11879 bpflags |= BUILDPATH_NO_FIRMLINK;
11880 }
11881 bpflags |= BUILDPATH_CHECK_MOVED;
11882 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11883 vnode_put(vp);
11884
11885 if (error) {
11886 /* there was a race building the path, try a few more times */
11887 if (error == EAGAIN) {
11888 --retries;
11889 if (retries > 0) {
11890 goto retry;
11891 }
11892
11893 error = ENOENT;
11894 }
11895 goto out;
11896 }
11897
11898 AUDIT_ARG(text, buf);
11899
11900 if (kdebug_enable) {
11901 long dbg_parms[NUMPARMS];
11902 int dbg_namelen;
11903
11904 dbg_namelen = (int)sizeof(dbg_parms);
11905
11906 if (length < dbg_namelen) {
11907 memcpy((char *)dbg_parms, buf, length);
11908 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11909
11910 dbg_namelen = length;
11911 } else {
11912 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11913 }
11914
11915 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11916 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11917 }
11918
11919 *pathlen = (user_ssize_t)length; /* may be superseded by error */
11920
11921 out:
11922 return error;
11923 }
11924
11925 /*
11926 * Obtain the full pathname of a file system object by id.
11927 */
11928 static int
11929 fsgetpath_extended(user_addr_t buf, int bufsize, user_addr_t user_fsid, uint64_t objid,
11930 uint32_t options, user_ssize_t *retval)
11931 {
11932 vfs_context_t ctx = vfs_context_current();
11933 fsid_t fsid;
11934 char *realpath;
11935 int length;
11936 int error;
11937
11938 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
11939 return EINVAL;
11940 }
11941
11942 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
11943 return error;
11944 }
11945 AUDIT_ARG(value32, fsid.val[0]);
11946 AUDIT_ARG(value64, objid);
11947 /* Restrict output buffer size for now. */
11948
11949 if (bufsize > PAGE_SIZE || bufsize <= 0) {
11950 return EINVAL;
11951 }
11952 MALLOC(realpath, char *, bufsize, M_TEMP, M_WAITOK | M_ZERO);
11953 if (realpath == NULL) {
11954 return ENOMEM;
11955 }
11956
11957 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
11958 options, &length);
11959
11960 if (error) {
11961 goto out;
11962 }
11963
11964 error = copyout((caddr_t)realpath, buf, length);
11965
11966 *retval = (user_ssize_t)length; /* may be superseded by error */
11967 out:
11968 if (realpath) {
11969 FREE(realpath, M_TEMP);
11970 }
11971 return error;
11972 }
11973
11974 int
11975 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
11976 {
11977 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
11978 0, retval);
11979 }
11980
11981 int
11982 fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
11983 {
11984 return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
11985 uap->options, retval);
11986 }
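/*
 * Illustrative sketch (userspace, not part of this file's build): turning
 * an fsid / object-id pair back into a path with the fsgetpath(2)
 * wrapper, which is typically declared in <sys/fsgetpath.h>.  Deriving
 * the fsid from statfs(2) and the object id from stat(2) is one common
 * pattern and is shown here only as an assumption.
 *
 *	struct stat sb;
 *	struct statfs sfs;
 *	char path[PATH_MAX];
 *
 *	if (stat(orig_path, &sb) == 0 && statfs(orig_path, &sfs) == 0) {
 *		ssize_t len = fsgetpath(path, sizeof(path),
 *		    &sfs.f_fsid, (uint64_t)sb.st_ino);
 *		// len < 0 => errno set (e.g. ENOENT, ENOTSUP, EPERM)
 *	}
 */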
11987
11988 /*
11989 * Common routine to handle various flavors of statfs data heading out
11990 * to user space.
11991 *
11992 * Returns: 0 Success
11993 * EFAULT
11994 */
11995 static int
11996 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
11997 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
11998 boolean_t partial_copy)
11999 {
12000 int error;
12001 int my_size, copy_size;
12002
12003 if (is_64_bit) {
12004 struct user64_statfs sfs;
12005 my_size = copy_size = sizeof(sfs);
12006 bzero(&sfs, my_size);
12007 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12008 sfs.f_type = mp->mnt_vtable->vfc_typenum;
12009 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12010 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
12011 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
12012 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
12013 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
12014 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
12015 sfs.f_files = (user64_long_t)sfsp->f_files;
12016 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
12017 sfs.f_fsid = sfsp->f_fsid;
12018 sfs.f_owner = sfsp->f_owner;
12019 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12020 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12021 } else {
12022 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12023 }
12024 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12025 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12026
12027 if (partial_copy) {
12028 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12029 }
12030 error = copyout((caddr_t)&sfs, bufp, copy_size);
12031 } else {
12032 struct user32_statfs sfs;
12033
12034 my_size = copy_size = sizeof(sfs);
12035 bzero(&sfs, my_size);
12036
12037 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
12038 sfs.f_type = mp->mnt_vtable->vfc_typenum;
12039 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
12040
12041 /*
12042 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
12043 * have to fudge the numbers here in that case. We inflate the blocksize in order
12044 * to reflect the filesystem size as best we can.
12045 */
12046 if ((sfsp->f_blocks > INT_MAX)
12047 /* Hack for 4061702. I think the real fix is for Carbon to
12048 * look for some volume capability and not depend on hidden
12049 * semantics agreed between a FS and carbon.
12050 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
12051 * for Carbon to set bNoVolumeSizes volume attribute.
12052 * Without this the webdavfs files cannot be copied onto
12053 * disk as they look huge. This change should not affect
12054 * XSAN as it should not be setting these to -1.
12055 */
12056 && (sfsp->f_blocks != 0xffffffffffffffffULL)
12057 && (sfsp->f_bfree != 0xffffffffffffffffULL)
12058 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
12059 int shift;
12060
12061 /*
12062 * Work out how far we have to shift the block count down to make it fit.
12063 * Note that it's possible to have to shift so far that the resulting
12064 * blocksize would be unreportably large. At that point, we will clip
12065 * any values that don't fit.
12066 *
12067 * For safety's sake, we also ensure that f_iosize is never reported as
12068 * being smaller than f_bsize.
12069 */
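/*
 * Worked example (illustrative): with f_bsize = 4096 and f_blocks = 2^33,
 * shifts 0 through 2 still leave (f_blocks >> shift) above INT_MAX, so
 * the loop below stops at shift = 3.  We then report
 * f_bsize = 4096 << 3 = 32768 and f_blocks = 2^30, preserving the total
 * size (2^30 * 32768 == 2^33 * 4096) at the cost of coarser granularity.
 */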
12070 for (shift = 0; shift < 32; shift++) {
12071 if ((sfsp->f_blocks >> shift) <= INT_MAX) {
12072 break;
12073 }
12074 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
12075 break;
12076 }
12077 }
12078 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
12079 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
12080 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
12081 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
12082 #undef __SHIFT_OR_CLIP
12083 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
12084 sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
12085 } else {
12086 /* filesystem is small enough to be reported honestly */
12087 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
12088 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
12089 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
12090 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
12091 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
12092 }
12093 sfs.f_files = (user32_long_t)sfsp->f_files;
12094 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
12095 sfs.f_fsid = sfsp->f_fsid;
12096 sfs.f_owner = sfsp->f_owner;
12097 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
12098 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
12099 } else {
12100 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
12101 }
12102 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
12103 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
12104
12105 if (partial_copy) {
12106 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
12107 }
12108 error = copyout((caddr_t)&sfs, bufp, copy_size);
12109 }
12110
12111 if (sizep != NULL) {
12112 *sizep = my_size;
12113 }
12114 return error;
12115 }
12116
12117 /*
12118 * copy stat structure into user_stat structure.
12119 */
12120 void
12121 munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
12122 {
12123 bzero(usbp, sizeof(*usbp));
12124
12125 usbp->st_dev = sbp->st_dev;
12126 usbp->st_ino = sbp->st_ino;
12127 usbp->st_mode = sbp->st_mode;
12128 usbp->st_nlink = sbp->st_nlink;
12129 usbp->st_uid = sbp->st_uid;
12130 usbp->st_gid = sbp->st_gid;
12131 usbp->st_rdev = sbp->st_rdev;
12132 #ifndef _POSIX_C_SOURCE
12133 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12134 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12135 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12136 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12137 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12138 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12139 #else
12140 usbp->st_atime = sbp->st_atime;
12141 usbp->st_atimensec = sbp->st_atimensec;
12142 usbp->st_mtime = sbp->st_mtime;
12143 usbp->st_mtimensec = sbp->st_mtimensec;
12144 usbp->st_ctime = sbp->st_ctime;
12145 usbp->st_ctimensec = sbp->st_ctimensec;
12146 #endif
12147 usbp->st_size = sbp->st_size;
12148 usbp->st_blocks = sbp->st_blocks;
12149 usbp->st_blksize = sbp->st_blksize;
12150 usbp->st_flags = sbp->st_flags;
12151 usbp->st_gen = sbp->st_gen;
12152 usbp->st_lspare = sbp->st_lspare;
12153 usbp->st_qspare[0] = sbp->st_qspare[0];
12154 usbp->st_qspare[1] = sbp->st_qspare[1];
12155 }
12156
12157 void
12158 munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
12159 {
12160 bzero(usbp, sizeof(*usbp));
12161
12162 usbp->st_dev = sbp->st_dev;
12163 usbp->st_ino = sbp->st_ino;
12164 usbp->st_mode = sbp->st_mode;
12165 usbp->st_nlink = sbp->st_nlink;
12166 usbp->st_uid = sbp->st_uid;
12167 usbp->st_gid = sbp->st_gid;
12168 usbp->st_rdev = sbp->st_rdev;
12169 #ifndef _POSIX_C_SOURCE
12170 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12171 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12172 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12173 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12174 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12175 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12176 #else
12177 usbp->st_atime = sbp->st_atime;
12178 usbp->st_atimensec = sbp->st_atimensec;
12179 usbp->st_mtime = sbp->st_mtime;
12180 usbp->st_mtimensec = sbp->st_mtimensec;
12181 usbp->st_ctime = sbp->st_ctime;
12182 usbp->st_ctimensec = sbp->st_ctimensec;
12183 #endif
12184 usbp->st_size = sbp->st_size;
12185 usbp->st_blocks = sbp->st_blocks;
12186 usbp->st_blksize = sbp->st_blksize;
12187 usbp->st_flags = sbp->st_flags;
12188 usbp->st_gen = sbp->st_gen;
12189 usbp->st_lspare = sbp->st_lspare;
12190 usbp->st_qspare[0] = sbp->st_qspare[0];
12191 usbp->st_qspare[1] = sbp->st_qspare[1];
12192 }
12193
12194 /*
12195 * copy stat64 structure into user_stat64 structure.
12196 */
12197 void
12198 munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
12199 {
12200 bzero(usbp, sizeof(*usbp));
12201
12202 usbp->st_dev = sbp->st_dev;
12203 usbp->st_ino = sbp->st_ino;
12204 usbp->st_mode = sbp->st_mode;
12205 usbp->st_nlink = sbp->st_nlink;
12206 usbp->st_uid = sbp->st_uid;
12207 usbp->st_gid = sbp->st_gid;
12208 usbp->st_rdev = sbp->st_rdev;
12209 #ifndef _POSIX_C_SOURCE
12210 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12211 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12212 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12213 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12214 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12215 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12216 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
12217 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
12218 #else
12219 usbp->st_atime = sbp->st_atime;
12220 usbp->st_atimensec = sbp->st_atimensec;
12221 usbp->st_mtime = sbp->st_mtime;
12222 usbp->st_mtimensec = sbp->st_mtimensec;
12223 usbp->st_ctime = sbp->st_ctime;
12224 usbp->st_ctimensec = sbp->st_ctimensec;
12225 usbp->st_birthtime = sbp->st_birthtime;
12226 usbp->st_birthtimensec = sbp->st_birthtimensec;
12227 #endif
12228 usbp->st_size = sbp->st_size;
12229 usbp->st_blocks = sbp->st_blocks;
12230 usbp->st_blksize = sbp->st_blksize;
12231 usbp->st_flags = sbp->st_flags;
12232 usbp->st_gen = sbp->st_gen;
12233 usbp->st_lspare = sbp->st_lspare;
12234 usbp->st_qspare[0] = sbp->st_qspare[0];
12235 usbp->st_qspare[1] = sbp->st_qspare[1];
12236 }
12237
12238 void
12239 munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
12240 {
12241 bzero(usbp, sizeof(*usbp));
12242
12243 usbp->st_dev = sbp->st_dev;
12244 usbp->st_ino = sbp->st_ino;
12245 usbp->st_mode = sbp->st_mode;
12246 usbp->st_nlink = sbp->st_nlink;
12247 usbp->st_uid = sbp->st_uid;
12248 usbp->st_gid = sbp->st_gid;
12249 usbp->st_rdev = sbp->st_rdev;
12250 #ifndef _POSIX_C_SOURCE
12251 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
12252 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
12253 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
12254 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
12255 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
12256 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
12257 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
12258 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
12259 #else
12260 usbp->st_atime = sbp->st_atime;
12261 usbp->st_atimensec = sbp->st_atimensec;
12262 usbp->st_mtime = sbp->st_mtime;
12263 usbp->st_mtimensec = sbp->st_mtimensec;
12264 usbp->st_ctime = sbp->st_ctime;
12265 usbp->st_ctimensec = sbp->st_ctimensec;
12266 usbp->st_birthtime = sbp->st_birthtime;
12267 usbp->st_birthtimensec = sbp->st_birthtimensec;
12268 #endif
12269 usbp->st_size = sbp->st_size;
12270 usbp->st_blocks = sbp->st_blocks;
12271 usbp->st_blksize = sbp->st_blksize;
12272 usbp->st_flags = sbp->st_flags;
12273 usbp->st_gen = sbp->st_gen;
12274 usbp->st_lspare = sbp->st_lspare;
12275 usbp->st_qspare[0] = sbp->st_qspare[0];
12276 usbp->st_qspare[1] = sbp->st_qspare[1];
12277 }
12278
12279 /*
12280 * Purge the buffer cache to simulate cold starts
12281 */
12282 static int
12283 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
12284 {
12285 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
12286
12287 return VNODE_RETURNED;
12288 }
12289
12290 static int
12291 vfs_purge_callback(mount_t mp, __unused void * arg)
12292 {
12293 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
12294
12295 return VFS_RETURNED;
12296 }
12297
12298 int
12299 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
12300 {
12301 if (!kauth_cred_issuser(kauth_cred_get())) {
12302 return EPERM;
12303 }
12304
12305 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
12306
12307 return 0;
12308 }
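/*
 * Illustrative sketch only: the vfs_iterate()/vnode_iterate() callback
 * pattern used by vfs_purge() above can be applied to other whole-system
 * vnode walks.  The counting callbacks below are hypothetical and not
 * part of this file.
 */
#if 0
static int
example_vnode_count_callback(__unused struct vnode *vp, void *cargs)
{
	(*(uint64_t *)cargs)++;
	return VNODE_RETURNED;
}

static int
example_vfs_count_callback(mount_t mp, void *arg)
{
	vnode_iterate(mp, VNODE_ITERATE_ALL, example_vnode_count_callback, arg);
	return VFS_RETURNED;
}

static uint64_t
example_count_all_vnodes(void)
{
	uint64_t count = 0;

	vfs_iterate(0 /* flags */, example_vfs_count_callback, &count);
	return count;
}
#endif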
12309
12310 /*
12311 * Gets the vnode associated with the (unnamed) snapshot directory
12312 * for a filesystem. The snapshot directory vnode is returned with
12313 * an iocount on it.
12314 */
12315 int
12316 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
12317 {
12318 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
12319 }
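/*
 * Minimal usage sketch (hypothetical caller, illustrative only): the
 * snapshot directory vnode returned by vnode_get_snapdir() carries an
 * iocount that the caller must drop with vnode_put() when done.
 */
#if 0
static int
example_with_snapdir(vnode_t rvp, vfs_context_t ctx)
{
	vnode_t sdvp;
	int error;

	error = vnode_get_snapdir(rvp, &sdvp, ctx);
	if (error) {
		return error;
	}
	/* ... operate on sdvp (e.g. look up or enumerate snapshots) ... */
	vnode_put(sdvp);
	return 0;
}
#endif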
12320
12321 /*
12322 * Get the snapshot vnode.
12323 *
12324 * If successful, the call returns with an iocount on *rvpp and *sdvpp, and
12325 * the caller needs to call nameidone() on ndp.
12326 *
12327 * If the snapshot vnode exists, it is returned in ndp->ni_vp.
12328 *
12329 * If it returns with an error, *rvpp and *sdvpp are NULL and nameidone() is
12330 * not needed.
12331 */
12332 static int
12333 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
12334 user_addr_t name, struct nameidata *ndp, int32_t op,
12335 #if !CONFIG_TRIGGERS
12336 __unused
12337 #endif
12338 enum path_operation pathop,
12339 vfs_context_t ctx)
12340 {
12341 int error, i;
12342 caddr_t name_buf;
12343 size_t name_len;
12344 struct vfs_attr vfa;
12345
12346 *sdvpp = NULLVP;
12347 *rvpp = NULLVP;
12348
12349 error = vnode_getfromfd(ctx, dirfd, rvpp);
12350 if (error) {
12351 return error;
12352 }
12353
12354 if (!vnode_isvroot(*rvpp)) {
12355 error = EINVAL;
12356 goto out;
12357 }
12358
12359 /* Make sure the filesystem supports snapshots */
12360 VFSATTR_INIT(&vfa);
12361 VFSATTR_WANTED(&vfa, f_capabilities);
12362 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
12363 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
12364 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
12365 VOL_CAP_INT_SNAPSHOT)) ||
12366 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
12367 VOL_CAP_INT_SNAPSHOT))) {
12368 error = ENOTSUP;
12369 goto out;
12370 }
12371
12372 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
12373 if (error) {
12374 goto out;
12375 }
12376
12377 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12378 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12379 if (error) {
12380 goto out1;
12381 }
12382
12383 /*
12384 * Some sanity checks: the name can't be empty, ".", "..", or contain slashes.
12385 * (the length returned by copyinstr includes the terminating NUL)
12386 */
12387 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
12388 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
12389 error = EINVAL;
12390 goto out1;
12391 }
12392 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
12393 ;
12394 }
12395 if (i < (int)name_len) {
12396 error = EINVAL;
12397 goto out1;
12398 }
12399
12400 #if CONFIG_MACF
12401 if (op == CREATE) {
12402 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
12403 name_buf);
12404 } else if (op == DELETE) {
12405 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
12406 name_buf);
12407 }
12408 if (error) {
12409 goto out1;
12410 }
12411 #endif
12412
12413 /* Check if the snapshot already exists ... */
12414 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
12415 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
12416 ndp->ni_dvp = *sdvpp;
12417
12418 error = namei(ndp);
12419 out1:
12420 FREE(name_buf, M_TEMP);
12421 out:
12422 if (error) {
12423 if (*sdvpp) {
12424 vnode_put(*sdvpp);
12425 *sdvpp = NULLVP;
12426 }
12427 if (*rvpp) {
12428 vnode_put(*rvpp);
12429 *rvpp = NULLVP;
12430 }
12431 }
12432 return error;
12433 }
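/*
 * Illustrative sketch only: the snapshot-name sanity check performed in
 * vnode_get_snapshot() above (and repeated for the new name in
 * snapshot_rename() below) could be expressed as a small helper.  The
 * helper is hypothetical and not used by this file; name_len is the
 * length returned by copyinstr(), which includes the terminating NUL.
 */
#if 0
static int
example_snapshot_name_ok(const char *name_buf, size_t name_len)
{
	size_t i;

	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		return 0;
	}
	for (i = 0; i < name_len; i++) {
		if (name_buf[i] == '/') {
			return 0;
		}
	}
	return 1;
}
#endif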
12434
12435 /*
12436 * Create a filesystem snapshot (for filesystems that support snapshots).
12437 *
12438 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL):
12439 * we get to the (unnamed) snapshot directory vnode and create the vnode
12440 * for the snapshot in it.
12441 *
12442 * Restrictions:
12443 *
12444 * a) The passed-in snapshot name cannot contain slashes.
12445 * b) The name can't be "." or "..".
12446 *
12447 * Since this requires superuser privileges, vnode_authorize calls are not
12448 * made.
12449 */
12450 static int
12451 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
12452 vfs_context_t ctx)
12453 {
12454 vnode_t rvp, snapdvp;
12455 int error;
12456 struct nameidata namend;
12457
12458 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
12459 OP_LINK, ctx);
12460 if (error) {
12461 return error;
12462 }
12463
12464 if (namend.ni_vp) {
12465 vnode_put(namend.ni_vp);
12466 error = EEXIST;
12467 } else {
12468 struct vnode_attr va;
12469 vnode_t vp = NULLVP;
12470
12471 VATTR_INIT(&va);
12472 VATTR_SET(&va, va_type, VREG);
12473 VATTR_SET(&va, va_mode, 0);
12474
12475 error = vn_create(snapdvp, &vp, &namend, &va,
12476 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
12477 if (!error && vp) {
12478 vnode_put(vp);
12479 }
12480 }
12481
12482 nameidone(&namend);
12483 vnode_put(snapdvp);
12484 vnode_put(rvp);
12485 return error;
12486 }
12487
12488 /*
12489 * Delete a filesystem snapshot
12490 *
12491 * Get the vnodes for the unnamed snapshot directory and the snapshot, and
12492 * delete the snapshot.
12493 */
12494 static int
12495 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
12496 vfs_context_t ctx)
12497 {
12498 vnode_t rvp, snapdvp;
12499 int error;
12500 struct nameidata namend;
12501
12502 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
12503 OP_UNLINK, ctx);
12504 if (error) {
12505 goto out;
12506 }
12507
12508 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
12509 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
12510
12511 vnode_put(namend.ni_vp);
12512 nameidone(&namend);
12513 vnode_put(snapdvp);
12514 vnode_put(rvp);
12515 out:
12516 return error;
12517 }
12518
12519 /*
12520 * Revert a filesystem to a snapshot
12521 *
12522 * Marks the filesystem to revert to the given snapshot on next mount.
12523 */
12524 static int
12525 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
12526 vfs_context_t ctx)
12527 {
12528 int error;
12529 vnode_t rvp;
12530 mount_t mp;
12531 struct fs_snapshot_revert_args revert_data;
12532 struct componentname cnp;
12533 caddr_t name_buf;
12534 size_t name_len;
12535
12536 error = vnode_getfromfd(ctx, dirfd, &rvp);
12537 if (error) {
12538 return error;
12539 }
12540 mp = vnode_mount(rvp);
12541
12542 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12543 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12544 if (error) {
12545 FREE(name_buf, M_TEMP);
12546 vnode_put(rvp);
12547 return error;
12548 }
12549
12550 #if CONFIG_MACF
12551 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
12552 if (error) {
12553 FREE(name_buf, M_TEMP);
12554 vnode_put(rvp);
12555 return error;
12556 }
12557 #endif
12558
12559 /*
12560 * Grab mount_iterref so that we can release the vnode,
12561 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
12562 */
12563 error = mount_iterref(mp, 0);
12564 vnode_put(rvp);
12565 if (error) {
12566 FREE(name_buf, M_TEMP);
12567 return error;
12568 }
12569
12570 memset(&cnp, 0, sizeof(cnp));
12571 cnp.cn_pnbuf = (char *)name_buf;
12572 cnp.cn_nameiop = LOOKUP;
12573 cnp.cn_flags = ISLASTCN | HASBUF;
12574 cnp.cn_pnlen = MAXPATHLEN;
12575 cnp.cn_nameptr = cnp.cn_pnbuf;
12576 cnp.cn_namelen = (int)name_len;
12577 revert_data.sr_cnp = &cnp;
12578
12579 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
12580 mount_iterdrop(mp);
12581 FREE(name_buf, M_TEMP);
12582
12583 if (error) {
12584 /* If there was any error, try again using VNOP_IOCTL */
12585
12586 vnode_t snapdvp;
12587 struct nameidata namend;
12588
12589 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
12590 OP_LOOKUP, ctx);
12591 if (error) {
12592 return error;
12593 }
12594
12595
12596 error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
12597 0, ctx);
12598
12599 vnode_put(namend.ni_vp);
12600 nameidone(&namend);
12601 vnode_put(snapdvp);
12602 vnode_put(rvp);
12603 }
12604
12605 return error;
12606 }
12607
12608 /*
12609 * Rename a filesystem snapshot
12610 *
12611 * Get the vnodes for the unnamed snapshot directory and the snapshot, and
12612 * rename the snapshot. This is a very specialised (and simple) case of
12613 * rename(2) (which has to deal with a lot more complications). It differs
12614 * slightly from rename(2) in that EEXIST is returned if the new name exists.
12615 */
12616 static int
12617 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
12618 __unused uint32_t flags, vfs_context_t ctx)
12619 {
12620 vnode_t rvp, snapdvp;
12621 int error, i;
12622 caddr_t newname_buf;
12623 size_t name_len;
12624 vnode_t fvp;
12625 struct nameidata *fromnd, *tond;
12626 /* carving out a chunk for structs that are too big to be on the stack. */
12627 struct {
12628 struct nameidata from_node;
12629 struct nameidata to_node;
12630 } * __rename_data;
12631
12632 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
12633 fromnd = &__rename_data->from_node;
12634 tond = &__rename_data->to_node;
12635
12636 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
12637 OP_UNLINK, ctx);
12638 if (error) {
12639 goto out;
12640 }
12641 fvp = fromnd->ni_vp;
12642
12643 MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12644 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
12645 if (error) {
12646 goto out1;
12647 }
12648
12649 /*
12650 * Some sanity checks: the new name can't be empty, ".", "..", or
12651 * contain slashes.
12652 * (the length returned by copyinstr includes the terminating NUL)
12653 *
12654 * The FS rename VNOP is supposed to handle this, but we catch it
12655 * here ourselves as well.
12656 */
12657 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
12658 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
12659 error = EINVAL;
12660 goto out1;
12661 }
12662 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
12663 ;
12664 }
12665 if (i < (int)name_len) {
12666 error = EINVAL;
12667 goto out1;
12668 }
12669
12670 #if CONFIG_MACF
12671 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
12672 newname_buf);
12673 if (error) {
12674 goto out1;
12675 }
12676 #endif
12677
12678 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
12679 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
12680 tond->ni_dvp = snapdvp;
12681
12682 error = namei(tond);
12683 if (error) {
12684 goto out2;
12685 } else if (tond->ni_vp) {
12686 /*
12687 * snapshot rename behaves differently than rename(2) - if the
12688 * new name exists, EEXIST is returned.
12689 */
12690 vnode_put(tond->ni_vp);
12691 error = EEXIST;
12692 goto out2;
12693 }
12694
12695 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
12696 &tond->ni_cnd, ctx);
12697
12698 out2:
12699 nameidone(tond);
12700 out1:
12701 FREE(newname_buf, M_TEMP);
12702 vnode_put(fvp);
12703 vnode_put(snapdvp);
12704 vnode_put(rvp);
12705 nameidone(fromnd);
12706 out:
12707 FREE(__rename_data, M_TEMP);
12708 return error;
12709 }
12710
12711 /*
12712 * Mount a filesystem snapshot
12713 *
12714 * Get the vnodes for the unnamed snapshot directory and the snapshot, and
12715 * mount the snapshot.
12716 */
12717 static int
12718 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
12719 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
12720 {
12721 vnode_t rvp, snapdvp, snapvp, vp, pvp;
12722 int error;
12723 struct nameidata *snapndp, *dirndp;
12724 /* carving out a chunk for structs that are too big to be on the stack. */
12725 struct {
12726 struct nameidata snapnd;
12727 struct nameidata dirnd;
12728 } * __snapshot_mount_data;
12729
12730 MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
12731 M_TEMP, M_WAITOK);
12732 snapndp = &__snapshot_mount_data->snapnd;
12733 dirndp = &__snapshot_mount_data->dirnd;
12734
12735 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
12736 OP_LOOKUP, ctx);
12737 if (error) {
12738 goto out;
12739 }
12740
12741 snapvp = snapndp->ni_vp;
12742 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
12743 error = EIO;
12744 goto out1;
12745 }
12746
12747 /* Get the vnode to be covered */
12748 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
12749 UIO_USERSPACE, directory, ctx);
12750 error = namei(dirndp);
12751 if (error) {
12752 goto out1;
12753 }
12754
12755 vp = dirndp->ni_vp;
12756 pvp = dirndp->ni_dvp;
12757
12758 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
12759 error = EINVAL;
12760 } else {
12761 mount_t mp = vnode_mount(rvp);
12762 struct fs_snapshot_mount_args smnt_data;
12763
12764 smnt_data.sm_mp = mp;
12765 smnt_data.sm_cnp = &snapndp->ni_cnd;
12766 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
12767 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
12768 KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
12769 }
12770
12771 vnode_put(vp);
12772 vnode_put(pvp);
12773 nameidone(dirndp);
12774 out1:
12775 vnode_put(snapvp);
12776 vnode_put(snapdvp);
12777 vnode_put(rvp);
12778 nameidone(snapndp);
12779 out:
12780 FREE(__snapshot_mount_data, M_TEMP);
12781 return error;
12782 }
12783
12784 /*
12785 * Root from a snapshot of the filesystem
12786 *
12787 * Marks the filesystem to root from the given snapshot on next boot.
12788 */
12789 static int
12790 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
12791 vfs_context_t ctx)
12792 {
12793 int error;
12794 vnode_t rvp;
12795 mount_t mp;
12796 struct fs_snapshot_root_args root_data;
12797 struct componentname cnp;
12798 caddr_t name_buf;
12799 size_t name_len;
12800
12801 error = vnode_getfromfd(ctx, dirfd, &rvp);
12802 if (error) {
12803 return error;
12804 }
12805 mp = vnode_mount(rvp);
12806
12807 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12808 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12809 if (error) {
12810 FREE(name_buf, M_TEMP);
12811 vnode_put(rvp);
12812 return error;
12813 }
12814
12815 // XXX MAC checks ?
12816
12817 /*
12818 * Grab mount_iterref so that we can release the vnode,
12819 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
12820 */
12821 error = mount_iterref(mp, 0);
12822 vnode_put(rvp);
12823 if (error) {
12824 FREE(name_buf, M_TEMP);
12825 return error;
12826 }
12827
12828 memset(&cnp, 0, sizeof(cnp));
12829 cnp.cn_pnbuf = (char *)name_buf;
12830 cnp.cn_nameiop = LOOKUP;
12831 cnp.cn_flags = ISLASTCN | HASBUF;
12832 cnp.cn_pnlen = MAXPATHLEN;
12833 cnp.cn_nameptr = cnp.cn_pnbuf;
12834 cnp.cn_namelen = (int)name_len;
12835 root_data.sr_cnp = &cnp;
12836
12837 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
12838
12839 mount_iterdrop(mp);
12840 FREE(name_buf, M_TEMP);
12841
12842 return error;
12843 }
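/*
 * Illustrative sketch only: snapshot_revert() and snapshot_root() above
 * share the same setup -- copy in the snapshot name, take a mount
 * iteration reference so the directory vnode can be released early, wrap
 * the name in a LOOKUP componentname, and hand the op-specific args
 * struct (whose sr_cnp points at that componentname) to VFS_IOCTL().
 * The helper below, which builds the componentname, is hypothetical and
 * not used by this file; name_len is the length returned by copyinstr().
 */
#if 0
static void
example_snapshot_cnp_init(struct componentname *cnp, caddr_t name_buf,
    size_t name_len)
{
	memset(cnp, 0, sizeof(*cnp));
	cnp->cn_pnbuf = (char *)name_buf;
	cnp->cn_nameiop = LOOKUP;
	cnp->cn_flags = ISLASTCN | HASBUF;
	cnp->cn_pnlen = MAXPATHLEN;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	cnp->cn_namelen = (int)name_len;
}
#endif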
12844
12845 /*
12846 * FS snapshot operations dispatcher
12847 */
12848 int
12849 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
12850 __unused int32_t *retval)
12851 {
12852 int error;
12853 vfs_context_t ctx = vfs_context_current();
12854
12855 AUDIT_ARG(fd, uap->dirfd);
12856 AUDIT_ARG(value32, uap->op);
12857
12858 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
12859 if (error) {
12860 return error;
12861 }
12862
12863 /*
12864 * Enforce user authorization for snapshot modification operations
12865 */
12866 if ((uap->op != SNAPSHOT_OP_MOUNT) &&
12867 (uap->op != SNAPSHOT_OP_ROOT)) {
12868 vnode_t dvp = NULLVP;
12869 vnode_t devvp = NULLVP;
12870 mount_t mp;
12871
12872 error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
12873 if (error) {
12874 return error;
12875 }
12876 mp = vnode_mount(dvp);
12877 devvp = mp->mnt_devvp;
12878
12879 /* get an iocount on devvp */
12880 if (devvp == NULLVP) {
12881 error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
12882 /* for mounts which aren't block devices */
12883 if (error == ENOENT) {
12884 error = ENXIO;
12885 }
12886 } else {
12887 error = vnode_getwithref(devvp);
12888 }
12889
12890 if (error) {
12891 vnode_put(dvp);
12892 return error;
12893 }
12894
12895 if ((vfs_context_issuser(ctx) == 0) &&
12896 (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
12897 error = EPERM;
12898 }
12899 vnode_put(dvp);
12900 vnode_put(devvp);
12901
12902 if (error) {
12903 return error;
12904 }
12905 }
12906
12907 switch (uap->op) {
12908 case SNAPSHOT_OP_CREATE:
12909 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
12910 break;
12911 case SNAPSHOT_OP_DELETE:
12912 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
12913 break;
12914 case SNAPSHOT_OP_RENAME:
12915 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
12916 uap->flags, ctx);
12917 break;
12918 case SNAPSHOT_OP_MOUNT:
12919 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
12920 uap->data, uap->flags, ctx);
12921 break;
12922 case SNAPSHOT_OP_REVERT:
12923 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
12924 break;
12925 #if CONFIG_MNT_ROOTSNAP
12926 case SNAPSHOT_OP_ROOT:
12927 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
12928 break;
12929 #endif /* CONFIG_MNT_ROOTSNAP */
12930 default:
12931 error = ENOSYS;
12932 }
12933
12934 return error;
12935 }
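/*
 * Userspace view (illustrative only, not compiled here): the operations
 * dispatched above are normally reached through the wrappers declared in
 * <sys/snapshot.h> -- fs_snapshot_create(), fs_snapshot_delete(),
 * fs_snapshot_rename(), fs_snapshot_revert() and fs_snapshot_mount() --
 * assuming those wrappers are available on the target SDK and the caller
 * holds the required snapshot privilege.  The volume path and snapshot
 * name below are made up.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <sys/snapshot.h>
#include <unistd.h>

int
main(void)
{
	int dirfd = open("/Volumes/Data", O_RDONLY);

	if (dirfd < 0) {
		perror("open");
		return 1;
	}
	if (fs_snapshot_create(dirfd, "example.snapshot", 0) != 0) {
		perror("fs_snapshot_create");
	}
	if (fs_snapshot_delete(dirfd, "example.snapshot", 0) != 0) {
		perror("fs_snapshot_delete");
	}
	close(dirfd);
	return 0;
}
#endif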