/* bsd/vfs/vfs_syscalls.c — apple/xnu (xnu-7195.101.1) */
1 /*
2 * Copyright (c) 1995-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <kern/kalloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <os/atomic_private.h>
127 #include <pexpert/pexpert.h>
128 #include <IOKit/IOBSD.h>
129
130 // deps for MIG call
131 #include <kern/host.h>
132 #include <kern/ipc_misc.h>
133 #include <mach/host_priv.h>
134 #include <mach/vfs_nspace.h>
135 #include <os/log.h>
136
137 #include <nfs/nfs_conf.h>
138
139 #if ROUTEFS
140 #include <miscfs/routefs/routefs.h>
141 #endif /* ROUTEFS */
142
143 #if CONFIG_MACF
144 #include <security/mac.h>
145 #include <security/mac_framework.h>
146 #endif
147
148 #if CONFIG_FSE
149 #define GET_PATH(x) \
150 ((x) = get_pathbuff())
151 #define RELEASE_PATH(x) \
152 release_pathbuff(x)
153 #else
154 #define GET_PATH(x) \
155 ((x) = zalloc(ZV_NAMEI))
156 #define RELEASE_PATH(x) \
157 zfree(ZV_NAMEI, x)
158 #endif /* CONFIG_FSE */
159
160 #ifndef HFS_GET_BOOT_INFO
161 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
162 #endif
163
164 #ifndef HFS_SET_BOOT_INFO
165 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
166 #endif
167
168 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
169 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
170 #endif
171
172 /*
173 * If you need accounting for KM_FD_VN_DATA consider using
174 * ZONE_VIEW_DEFINE to define a zone view.
175 */
176 #define KM_FD_VN_DATA KHEAP_DEFAULT
177
178 extern void disk_conditioner_unmount(mount_t mp);
179
180 /* struct for checkdirs iteration */
181 struct cdirargs {
182 vnode_t olddp;
183 vnode_t newdp;
184 };
185 /* callback for checkdirs iteration */
186 static int checkdirs_callback(proc_t p, void * arg);
187
188 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
189 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
190 void enablequotas(struct mount *mp, vfs_context_t ctx);
191 static int getfsstat_callback(mount_t mp, void * arg);
192 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
193 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
194 static int sync_callback(mount_t, void *);
195 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
196 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
197 boolean_t partial_copy);
198 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
199 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
200 struct componentname *cnp, user_addr_t fsmountargs,
201 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
202 vfs_context_t ctx);
203 void vfs_notify_mount(vnode_t pdvp);
204
205 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
206
207 struct fd_vn_data * fg_vn_data_alloc(void);
208
209 /*
210 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
211 * Concurrent lookups (or lookups by ids) on hard links can cause the
212 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
213 * does) to return ENOENT as the path cannot be returned from the name cache
214 * alone. We have no option but to retry and hope to get one namei->reverse path
215 * generation done without an intervening lookup, lookup by id on the hard link
216 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
217 * which currently are the MAC hooks for rename, unlink and rmdir.
218 */
219 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
220
221 /* Max retry limit for rename due to vnode recycling. */
222 #define MAX_RENAME_ERECYCLE_RETRIES 1024
223
224 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
225 int unlink_flags);
226
227 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
228
229 #ifdef CONFIG_IMGSRC_ACCESS
230 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
231 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
232 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
233 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
234 static void mount_end_update(mount_t mp);
235 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
236 #endif /* CONFIG_IMGSRC_ACCESS */
237
238 #if CONFIG_LOCKERBOOT
239 int mount_locker_protoboot(const char *fsname, const char *mntpoint,
240 const char *pbdevpath);
241 #endif
242
243 //snapshot functions
244 #if CONFIG_MNT_ROOTSNAP
245 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
246 #else
247 static int __attribute__ ((noinline)) snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
248 #endif
249
250 __private_extern__
251 int sync_internal(void);
252
253 __private_extern__
254 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
255
256 static LCK_GRP_DECLARE(fd_vn_lck_grp, "fd_vnode_data");
257 static LCK_ATTR_DECLARE(fd_vn_lck_attr, 0, 0);
258
259 /* vars for sync mutex */
260 static LCK_GRP_DECLARE(sync_mtx_lck_grp, "sync thread");
261 static LCK_MTX_DECLARE(sync_mtx_lck, &sync_mtx_lck_grp);
262
263 extern lck_rw_t rootvnode_rw_lock;
264
265 /*
266 * incremented each time a mount or unmount operation occurs
267 * used to invalidate the cached value of the rootvp in the
268 * mount structure utilized by cache_lookup_path
269 */
270 uint32_t mount_generation = 0;
271
272 /* counts number of mount and unmount operations */
273 unsigned int vfs_nummntops = 0;
274
275 /* system-wide, per-boot unique mount ID */
276 static _Atomic uint64_t mount_unique_id = 1;
277
278 extern const struct fileops vnops;
279 #if CONFIG_APPLEDOUBLE
280 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
281 #endif /* CONFIG_APPLEDOUBLE */
282
283 /*
284 * Virtual File System System Calls
285 */
286
287 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
288 /*
289 * Private in-kernel mounting spi (NFS only, not exported)
290 */
291 __private_extern__
292 boolean_t
293 vfs_iskernelmount(mount_t mp)
294 {
295 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
296 }
297
/*
 * kernel_mount:
 *	In-kernel mount entry point (NFS/devfs/routefs only, not exported).
 *
 * Parameters:
 *	fstype		filesystem type name passed through to mount_common()
 *	pvp		parent of the covered vnode, or NULLVP to look it up
 *	vp		vnode to cover, or NULLVP to look it up from 'path'
 *	path		kernel-space path of the mount point
 *	data		filesystem-specific mount arguments (kernel address,
 *			cast to user_addr_t for mount_common())
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* internal flags
 *	ctx		caller's VFS context
 *
 * Returns:	0 on success, errno otherwise.
 *
 * Note: when vp is supplied by the caller, no namei() is performed and
 * the caller retains responsibility for the iocounts on vp/pvp; when it
 * is looked up here, the references taken by namei() are dropped before
 * returning.
 */
__private_extern__
int
kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/* Path lives in kernel space, hence UIO_SYSSPACE. */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Log lookup failures only for snapshot / by-role mounts. */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		/*
		 * Caller supplied the covered vnode; fake up just enough of
		 * the componentname (path buffer + length) for mount_common().
		 */
		char *pnbuf = CAST_DOWN(char *, path);

		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = (int)(strlen(pnbuf) + 1);
		did_namei = FALSE;
	}

	/* labelstr is NULL: kernel mounts never carry a MAC label string. */
	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, TRUE, ctx);

	if (did_namei) {
		/* Drop the iocounts taken by namei() above. */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
#endif /* CONFIG_NFS_CLIENT || DEVFS || ROUTEFS */
344
345 /*
346 * Mount a file system.
347 */
348 /* ARGSUSED */
349 int
350 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
351 {
352 struct __mac_mount_args muap;
353
354 muap.type = uap->type;
355 muap.path = uap->path;
356 muap.flags = uap->flags;
357 muap.data = uap->data;
358 muap.mac_p = USER_ADDR_NULL;
359 return __mac_mount(p, &muap, retval);
360 }
361
362 int
363 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
364 {
365 struct componentname cn;
366 vfs_context_t ctx = vfs_context_current();
367 size_t dummy = 0;
368 int error;
369 int flags = uap->flags;
370 char fstypename[MFSNAMELEN];
371 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
372 vnode_t pvp;
373 vnode_t vp;
374
375 AUDIT_ARG(fd, uap->fd);
376 AUDIT_ARG(fflags, flags);
377 /* fstypename will get audited by mount_common */
378
379 /* Sanity check the flags */
380 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
381 return ENOTSUP;
382 }
383
384 if (flags & MNT_UNION) {
385 return EPERM;
386 }
387
388 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
389 if (error) {
390 return error;
391 }
392
393 if ((error = file_vnode(uap->fd, &vp)) != 0) {
394 return error;
395 }
396
397 if ((error = vnode_getwithref(vp)) != 0) {
398 file_drop(uap->fd);
399 return error;
400 }
401
402 pvp = vnode_getparent(vp);
403 if (pvp == NULL) {
404 vnode_put(vp);
405 file_drop(uap->fd);
406 return EINVAL;
407 }
408
409 memset(&cn, 0, sizeof(struct componentname));
410 cn.cn_pnbuf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
411 cn.cn_pnlen = MAXPATHLEN;
412
413 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
414 zfree(ZV_NAMEI, cn.cn_pnbuf);
415 vnode_put(pvp);
416 vnode_put(vp);
417 file_drop(uap->fd);
418 return error;
419 }
420
421 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
422
423 zfree(ZV_NAMEI, cn.cn_pnbuf);
424 vnode_put(pvp);
425 vnode_put(vp);
426 file_drop(uap->fd);
427
428 return error;
429 }
430
/*
 * vfs_notify_mount:
 *	Announce a completed mount: signal a VQ_MOUNT vfs event (global,
 *	no specific mount) and post NOTE_WRITE on the parent directory of
 *	the covered vnode so kqueue watchers of that directory wake up.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
437
438 /*
439 * __mac_mount:
440 * Mount a file system taking into account MAC label behavior.
441 * See mount(2) man page for more information
442 *
443 * Parameters: p Process requesting the mount
444 * uap User argument descriptor (see below)
445 * retval (ignored)
446 *
447 * Indirect: uap->type Filesystem type
448 * uap->path Path to mount
449 * uap->data Mount arguments
450 * uap->mac_p MAC info
451 * uap->flags Mount flags
452 *
453 *
454 * Returns: 0 Success
455 * !0 Not success
456 */
457 boolean_t root_fs_upgrade_try = FALSE;
458
459 int
460 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
461 {
462 vnode_t pvp = NULL;
463 vnode_t vp = NULL;
464 int need_nameidone = 0;
465 vfs_context_t ctx = vfs_context_current();
466 char fstypename[MFSNAMELEN];
467 struct nameidata nd;
468 size_t dummy = 0;
469 char *labelstr = NULL;
470 size_t labelsz = 0;
471 int flags = uap->flags;
472 int error;
473 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
474 boolean_t is_64bit = IS_64BIT_PROCESS(p);
475 #else
476 #pragma unused(p)
477 #endif
478 /*
479 * Get the fs type name from user space
480 */
481 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
482 if (error) {
483 return error;
484 }
485
486 /*
487 * Get the vnode to be covered
488 */
489 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
490 UIO_USERSPACE, uap->path, ctx);
491 error = namei(&nd);
492 if (error) {
493 goto out;
494 }
495 need_nameidone = 1;
496 vp = nd.ni_vp;
497 pvp = nd.ni_dvp;
498
499 #ifdef CONFIG_IMGSRC_ACCESS
500 /* Mounting image source cannot be batched with other operations */
501 if (flags == MNT_IMGSRC_BY_INDEX) {
502 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
503 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
504 goto out;
505 }
506 #endif /* CONFIG_IMGSRC_ACCESS */
507
508 #if CONFIG_MACF
509 /*
510 * Get the label string (if any) from user space
511 */
512 if (uap->mac_p != USER_ADDR_NULL) {
513 struct user_mac mac;
514 size_t ulen = 0;
515
516 if (is_64bit) {
517 struct user64_mac mac64;
518 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
519 mac.m_buflen = (user_size_t)mac64.m_buflen;
520 mac.m_string = (user_addr_t)mac64.m_string;
521 } else {
522 struct user32_mac mac32;
523 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
524 mac.m_buflen = mac32.m_buflen;
525 mac.m_string = mac32.m_string;
526 }
527 if (error) {
528 goto out;
529 }
530 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
531 (mac.m_buflen < 2)) {
532 error = EINVAL;
533 goto out;
534 }
535 labelsz = mac.m_buflen;
536 labelstr = kheap_alloc(KHEAP_TEMP, labelsz, Z_WAITOK);
537 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
538 if (error) {
539 goto out;
540 }
541 AUDIT_ARG(mac_string, labelstr);
542 }
543 #endif /* CONFIG_MACF */
544
545 AUDIT_ARG(fflags, flags);
546
547 #if SECURE_KERNEL
548 if (flags & MNT_UNION) {
549 /* No union mounts on release kernels */
550 error = EPERM;
551 goto out;
552 }
553 #endif
554
555 if ((vp->v_flag & VROOT) &&
556 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
557 if (!(flags & MNT_UNION)) {
558 flags |= MNT_UPDATE;
559 } else {
560 /*
561 * For a union mount on '/', treat it as fresh
562 * mount instead of update.
563 * Otherwise, union mouting on '/' used to panic the
564 * system before, since mnt_vnodecovered was found to
565 * be NULL for '/' which is required for unionlookup
566 * after it gets ENOENT on union mount.
567 */
568 flags = (flags & ~(MNT_UPDATE));
569 }
570
571 #if SECURE_KERNEL
572 if ((flags & MNT_RDONLY) == 0) {
573 /* Release kernels are not allowed to mount "/" as rw */
574 error = EPERM;
575 goto out;
576 }
577 #endif
578 /*
579 * See 7392553 for more details on why this check exists.
580 * Suffice to say: If this check is ON and something tries
581 * to mount the rootFS RW, we'll turn off the codesign
582 * bitmap optimization.
583 */
584 #if CHECK_CS_VALIDATION_BITMAP
585 if ((flags & MNT_RDONLY) == 0) {
586 root_fs_upgrade_try = TRUE;
587 }
588 #endif
589 }
590
591 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
592 labelstr, FALSE, ctx);
593
594 out:
595
596 #if CONFIG_MACF
597 kheap_free(KHEAP_DEFAULT, labelstr, labelsz);
598 #endif /* CONFIG_MACF */
599
600 if (vp) {
601 vnode_put(vp);
602 }
603 if (pvp) {
604 vnode_put(pvp);
605 }
606 if (need_nameidone) {
607 nameidone(&nd);
608 }
609
610 return error;
611 }
612
613 /*
614 * common mount implementation (final stage of mounting)
615 *
616 * Arguments:
617 * fstypename file system type (ie it's vfs name)
618 * pvp parent of covered vnode
619 * vp covered vnode
620 * cnp component name (ie path) of covered vnode
621 * flags generic mount flags
622 * fsmountargs file system specific data
623 * labelstr optional MAC label
624 * kernelmount TRUE for mounts initiated from inside the kernel
625 * ctx caller's context
626 */
627 static int
628 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
629 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
630 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
631 {
632 #if !CONFIG_MACF
633 #pragma unused(labelstr)
634 #endif
635 struct vnode *devvp = NULLVP;
636 struct vnode *device_vnode = NULLVP;
637 #if CONFIG_MACF
638 struct vnode *rvp;
639 #endif
640 struct mount *mp;
641 struct vfstable *vfsp = (struct vfstable *)0;
642 struct proc *p = vfs_context_proc(ctx);
643 int error, flag = 0;
644 bool flag_set = false;
645 user_addr_t devpath = USER_ADDR_NULL;
646 int ronly = 0;
647 int mntalloc = 0;
648 boolean_t vfsp_ref = FALSE;
649 boolean_t is_rwlock_locked = FALSE;
650 boolean_t did_rele = FALSE;
651 boolean_t have_usecount = FALSE;
652 boolean_t did_set_lmount = FALSE;
653
654 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM || CONFIG_BASESYSTEMROOT
655 /* Check for mutually-exclusive flag bits */
656 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_VOLBYROLE_MASK | KERNEL_MOUNT_BASESYSTEMROOT));
657 int bitcount = 0;
658 while (checkflags != 0) {
659 checkflags &= (checkflags - 1);
660 bitcount++;
661 }
662
663 if (bitcount > 1) {
664 //not allowed to request multiple mount-by-role flags
665 error = EINVAL;
666 goto out1;
667 }
668 #endif
669
670 /*
671 * Process an update for an existing mount
672 */
673 if (flags & MNT_UPDATE) {
674 if ((vp->v_flag & VROOT) == 0) {
675 error = EINVAL;
676 goto out1;
677 }
678 mp = vp->v_mount;
679
680 /* if unmount or mount in progress, return error */
681 mount_lock_spin(mp);
682 if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
683 mount_unlock(mp);
684 error = EBUSY;
685 goto out1;
686 }
687 mp->mnt_lflag |= MNT_LMOUNT;
688 did_set_lmount = TRUE;
689 mount_unlock(mp);
690 lck_rw_lock_exclusive(&mp->mnt_rwlock);
691 is_rwlock_locked = TRUE;
692 /*
693 * We only allow the filesystem to be reloaded if it
694 * is currently mounted read-only.
695 */
696 if ((flags & MNT_RELOAD) &&
697 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
698 error = ENOTSUP;
699 goto out1;
700 }
701
702 /*
703 * If content protection is enabled, update mounts are not
704 * allowed to turn it off.
705 */
706 if ((mp->mnt_flag & MNT_CPROTECT) &&
707 ((flags & MNT_CPROTECT) == 0)) {
708 error = EINVAL;
709 goto out1;
710 }
711
712 /*
713 * can't turn off MNT_REMOVABLE either but it may be an unexpected
714 * failure to return an error for this so we'll just silently
715 * add it if it is not passed in.
716 */
717 if ((mp->mnt_flag & MNT_REMOVABLE) &&
718 ((flags & MNT_REMOVABLE) == 0)) {
719 flags |= MNT_REMOVABLE;
720 }
721
722 /* Can't downgrade the backer of the root FS */
723 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
724 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
725 error = ENOTSUP;
726 goto out1;
727 }
728
729 /*
730 * Only root, or the user that did the original mount is
731 * permitted to update it.
732 */
733 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
734 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
735 goto out1;
736 }
737 #if CONFIG_MACF
738 error = mac_mount_check_remount(ctx, mp);
739 if (error != 0) {
740 goto out1;
741 }
742 #endif
743 /*
744 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
745 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
746 */
747 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
748 flags |= MNT_NOSUID | MNT_NODEV;
749 if (mp->mnt_flag & MNT_NOEXEC) {
750 flags |= MNT_NOEXEC;
751 }
752 }
753 flag = mp->mnt_flag;
754 flag_set = true;
755
756
757
758 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
759
760 vfsp = mp->mnt_vtable;
761 goto update;
762 } // MNT_UPDATE
763
764 /*
765 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
766 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
767 */
768 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
769 flags |= MNT_NOSUID | MNT_NODEV;
770 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
771 flags |= MNT_NOEXEC;
772 }
773 }
774
775 /* XXXAUDIT: Should we capture the type on the error path as well? */
776 AUDIT_ARG(text, fstypename);
777 mount_list_lock();
778 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
779 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
780 vfsp->vfc_refcount++;
781 vfsp_ref = TRUE;
782 break;
783 }
784 }
785 mount_list_unlock();
786 if (vfsp == NULL) {
787 error = ENODEV;
788 goto out1;
789 }
790
791 /*
792 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
793 * except in ROSV configs and for the initial BaseSystem root.
794 */
795 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
796 ((internal_flags & KERNEL_MOUNT_VOLBYROLE_MASK) == 0) &&
797 ((internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) == 0)) {
798 error = EINVAL; /* unsupported request */
799 goto out1;
800 }
801
802 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
803 if (error != 0) {
804 goto out1;
805 }
806
807 /*
808 * Allocate and initialize the filesystem (mount_t)
809 */
810 mp = zalloc_flags(mount_zone, Z_WAITOK | Z_ZERO);
811 mntalloc = 1;
812
813 /* Initialize the default IO constraints */
814 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
815 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
816 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
817 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
818 mp->mnt_devblocksize = DEV_BSIZE;
819 mp->mnt_alignmentmask = PAGE_MASK;
820 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
821 mp->mnt_ioscale = 1;
822 mp->mnt_ioflags = 0;
823 mp->mnt_realrootvp = NULLVP;
824 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
825
826 mp->mnt_lflag |= MNT_LMOUNT;
827 did_set_lmount = TRUE;
828
829 TAILQ_INIT(&mp->mnt_vnodelist);
830 TAILQ_INIT(&mp->mnt_workerqueue);
831 TAILQ_INIT(&mp->mnt_newvnodes);
832 mount_lock_init(mp);
833 lck_rw_lock_exclusive(&mp->mnt_rwlock);
834 is_rwlock_locked = TRUE;
835 mp->mnt_op = vfsp->vfc_vfsops;
836 mp->mnt_vtable = vfsp;
837 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
838 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
839 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
840 do {
841 int pathlen = MAXPATHLEN;
842
843 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
844 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
845 }
846 } while (0);
847 mp->mnt_vnodecovered = vp;
848 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
849 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
850 mp->mnt_devbsdunit = 0;
851 mp->mnt_mount_id = os_atomic_inc_orig(&mount_unique_id, relaxed);
852
853 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
854 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
855
856 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
857 if (kernelmount) {
858 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
859 }
860 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
861 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
862 }
#endif /* CONFIG_NFS_CLIENT || DEVFS || ROUTEFS */
864
865 if (KERNEL_MOUNT_DEVFS & internal_flags) {
866 // kernel mounted devfs
867 mp->mnt_kern_flag |= MNTK_SYSTEM;
868 }
869
870 update:
871
872 /*
873 * Set the mount level flags.
874 */
875 if (flags & MNT_RDONLY) {
876 mp->mnt_flag |= MNT_RDONLY;
877 } else if (mp->mnt_flag & MNT_RDONLY) {
878 // disallow read/write upgrades of file systems that
879 // had the TYPENAME_OVERRIDE feature set.
880 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
881 error = EPERM;
882 goto out1;
883 }
884 mp->mnt_kern_flag |= MNTK_WANTRDWR;
885 }
886 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
887 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
888 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
889 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
890 MNT_QUARANTINE | MNT_CPROTECT);
891
892 #if SECURE_KERNEL
893 #if !CONFIG_MNT_SUID
894 /*
895 * On release builds of iOS based platforms, always enforce NOSUID on
896 * all mounts. We do this here because we can catch update mounts as well as
897 * non-update mounts in this case.
898 */
899 mp->mnt_flag |= (MNT_NOSUID);
900 #endif
901 #endif
902
903 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
904 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
905 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
906 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
907 MNT_QUARANTINE | MNT_CPROTECT);
908
909 #if CONFIG_MACF
910 if (flags & MNT_MULTILABEL) {
911 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
912 error = EINVAL;
913 goto out1;
914 }
915 mp->mnt_flag |= MNT_MULTILABEL;
916 }
917 #endif
918 /*
919 * Process device path for local file systems if requested.
920 *
921 * Snapshot and mount-by-role mounts do not use this path; they are
922 * passing other opaque data in the device path field.
923 *
924 * Basesystemroot mounts pass a device path to be resolved here,
925 * but it's just a char * already inside the kernel, which
926 * kernel_mount() shoved into a user_addr_t to call us. So for such
927 * mounts we must skip copyin (both of the address and of the string
928 * (in NDINIT).
929 */
930 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
931 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VOLBYROLE_MASK))) {
932 boolean_t do_copyin_devpath = true;
933 #if CONFIG_BASESYSTEMROOT
934 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
// KERNEL_MOUNT_BASESYSTEMROOT implies subtle behavior worth noting:
936 // We have been passed fsmountargs, which is typed as a user_addr_t,
937 // but is actually a char ** pointing to a (kernelspace) string.
938 // We manually unpack it with a series of casts and dereferences
939 // that reverses what was done just above us on the stack in
940 // imageboot_pivot_image().
941 // After retrieving the path to the dev node (which we will NDINIT
942 // in a moment), we pass NULL fsmountargs on to the filesystem.
943 _Static_assert(sizeof(char **) == sizeof(fsmountargs), "fsmountargs should fit a (kernel) address");
944 char **devnamepp = (char **)fsmountargs;
945 char *devnamep = *devnamepp;
946 devpath = CAST_USER_ADDR_T(devnamep);
947 do_copyin_devpath = false;
948 fsmountargs = USER_ADDR_NULL;
949
950 //Now that we have a mp, denote that this mount is for the basesystem.
951 mp->mnt_supl_kern_flag |= MNTK_SUPL_BASESYSTEM;
952 }
953 #endif // CONFIG_BASESYSTEMROOT
954
955 if (do_copyin_devpath) {
956 if (vfs_context_is64bit(ctx)) {
957 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
958 goto out1;
959 }
960 fsmountargs += sizeof(devpath);
961 } else {
962 user32_addr_t tmp;
963 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
964 goto out1;
965 }
966 /* munge into LP64 addr */
967 devpath = CAST_USER_ADDR_T(tmp);
968 fsmountargs += sizeof(tmp);
969 }
970 }
971
972 /* Lookup device and authorize access to it */
973 if ((devpath)) {
974 struct nameidata nd;
975
976 enum uio_seg seg = UIO_USERSPACE;
977 #if CONFIG_BASESYSTEMROOT
978 if (internal_flags & KERNEL_MOUNT_BASESYSTEMROOT) {
979 seg = UIO_SYSSPACE;
980 }
981 #endif // CONFIG_BASESYSTEMROOT
982
983 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, seg, devpath, ctx);
984 if ((error = namei(&nd))) {
985 goto out1;
986 }
987
988 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
989 devvp = nd.ni_vp;
990
991 nameidone(&nd);
992
993 if (devvp->v_type != VBLK) {
994 error = ENOTBLK;
995 goto out2;
996 }
997 if (major(devvp->v_rdev) >= nblkdev) {
998 error = ENXIO;
999 goto out2;
1000 }
1001 /*
1002 * If mount by non-root, then verify that user has necessary
1003 * permissions on the device.
1004 */
1005 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
1006 mode_t accessmode = KAUTH_VNODE_READ_DATA;
1007
1008 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1009 accessmode |= KAUTH_VNODE_WRITE_DATA;
1010 }
1011 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
1012 goto out2;
1013 }
1014 }
1015 }
1016 /* On first mount, preflight and open device */
1017 if (devpath && ((flags & MNT_UPDATE) == 0)) {
1018 if ((error = vnode_ref(devvp))) {
1019 goto out2;
1020 }
1021 /*
1022 * Disallow multiple mounts of the same device.
1023 * Disallow mounting of a device that is currently in use
1024 * (except for root, which might share swap device for miniroot).
1025 * Flush out any old buffers remaining from a previous use.
1026 */
1027 if ((error = vfs_mountedon(devvp))) {
1028 goto out3;
1029 }
1030
1031 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
1032 error = EBUSY;
1033 goto out3;
1034 }
1035 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
1036 error = ENOTBLK;
1037 goto out3;
1038 }
1039 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
1040 goto out3;
1041 }
1042
1043 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
1044 #if CONFIG_MACF
1045 error = mac_vnode_check_open(ctx,
1046 devvp,
1047 ronly ? FREAD : FREAD | FWRITE);
1048 if (error) {
1049 goto out3;
1050 }
1051 #endif /* MAC */
1052 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
1053 goto out3;
1054 }
1055
1056 mp->mnt_devvp = devvp;
1057 device_vnode = devvp;
1058 } else if ((mp->mnt_flag & MNT_RDONLY) &&
1059 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
1060 (device_vnode = mp->mnt_devvp)) {
1061 dev_t dev;
1062 int maj;
1063 /*
1064 * If upgrade to read-write by non-root, then verify
1065 * that user has necessary permissions on the device.
1066 */
1067 vnode_getalways(device_vnode);
1068
1069 if (suser(vfs_context_ucred(ctx), NULL) &&
1070 (error = vnode_authorize(device_vnode, NULL,
1071 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1072 ctx)) != 0) {
1073 vnode_put(device_vnode);
1074 goto out2;
1075 }
1076
1077 /* Tell the device that we're upgrading */
1078 dev = (dev_t)device_vnode->v_rdev;
1079 maj = major(dev);
1080
1081 if ((u_int)maj >= (u_int)nblkdev) {
1082 panic("Volume mounted on a device with invalid major number.");
1083 }
1084
1085 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1086 vnode_put(device_vnode);
1087 device_vnode = NULLVP;
1088 if (error != 0) {
1089 goto out2;
1090 }
1091 }
1092 } // localargs && !(snapshot | data | vm)
1093
1094 #if CONFIG_MACF
1095 if ((flags & MNT_UPDATE) == 0) {
1096 mac_mount_label_init(mp);
1097 mac_mount_label_associate(ctx, mp);
1098 }
1099 if (labelstr) {
1100 if ((flags & MNT_UPDATE) != 0) {
1101 error = mac_mount_check_label_update(ctx, mp);
1102 if (error != 0) {
1103 goto out3;
1104 }
1105 }
1106 }
1107 #endif
1108 /*
1109 * Mount the filesystem. We already asserted that internal_flags
1110 * cannot have more than one mount-by-role bit set.
1111 */
1112 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1113 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1114 (caddr_t)fsmountargs, 0, ctx);
1115 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1116 #if CONFIG_ROSV_STARTUP
1117 struct mount *origin_mp = (struct mount*)fsmountargs;
1118 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1119 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1120 if (error) {
1121 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1122 } else {
1123 /* Mark volume associated with system volume */
1124 mp->mnt_kern_flag |= MNTK_SYSTEM;
1125
1126 /* Attempt to acquire the mnt_devvp and set it up */
1127 struct vnode *mp_devvp = NULL;
1128 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1129 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1130 0, &mp_devvp, vfs_context_kernel());
1131 if (!lerr) {
1132 mp->mnt_devvp = mp_devvp;
1133 //vnode_lookup took an iocount, need to drop it.
1134 vnode_put(mp_devvp);
1135 // now set `device_vnode` to the devvp that was acquired.
1136 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1137 // note that though the iocount above was dropped, the mount acquires
1138 // an implicit reference against the device.
1139 device_vnode = mp_devvp;
1140 }
1141 }
1142 }
1143 #else
1144 error = EINVAL;
1145 #endif
1146 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1147 #if CONFIG_MOUNT_VM
1148 struct mount *origin_mp = (struct mount*)fsmountargs;
1149 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1150 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1151 if (error) {
1152 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1153 } else {
1154 /* Mark volume associated with system volume and a swap mount */
1155 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1156 /* Attempt to acquire the mnt_devvp and set it up */
1157 struct vnode *mp_devvp = NULL;
1158 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1159 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1160 0, &mp_devvp, vfs_context_kernel());
1161 if (!lerr) {
1162 mp->mnt_devvp = mp_devvp;
1163 //vnode_lookup took an iocount, need to drop it.
1164 vnode_put(mp_devvp);
1165
1166 // now set `device_vnode` to the devvp that was acquired.
1167 // note that though the iocount above was dropped, the mount acquires
1168 // an implicit reference against the device.
1169 device_vnode = mp_devvp;
1170 }
1171 }
1172 }
1173 #else
1174 error = EINVAL;
1175 #endif
1176 } else if ((internal_flags & KERNEL_MOUNT_PREBOOTVOL) || (internal_flags & KERNEL_MOUNT_RECOVERYVOL)) {
1177 #if CONFIG_MOUNT_PREBOOTRECOVERY
1178 struct mount *origin_mp = (struct mount*)fsmountargs;
1179 uint32_t mount_role = 0;
1180 if (internal_flags & KERNEL_MOUNT_PREBOOTVOL) {
1181 mount_role = VFS_PREBOOT_ROLE;
1182 } else if (internal_flags & KERNEL_MOUNT_RECOVERYVOL) {
1183 mount_role = VFS_RECOVERY_ROLE;
1184 }
1185
1186 if (mount_role != 0) {
1187 fs_role_mount_args_t frma = {origin_mp, mount_role};
1188 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1189 if (error) {
1190 printf("MOUNT-BY-ROLE (%d) failed! (%d)", mount_role, error);
1191 } else {
1192 // NOT YET - need to qualify how this interacts with shutdown, ERP/ERB, etc
1193 /* Mark volume associated with system volume */
1194 //mp->mnt_kern_flag |= MNTK_SYSTEM;
1195 /* Attempt to acquire the mnt_devvp and set it up */
1196 struct vnode *mp_devvp = NULL;
1197 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1198 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1199 0, &mp_devvp, vfs_context_kernel());
1200 if (!lerr) {
1201 mp->mnt_devvp = mp_devvp;
1202 //vnode_lookup took an iocount, need to drop it.
1203 vnode_put(mp_devvp);
1204
1205 // now set `device_vnode` to the devvp that was acquired.
1206 // note that though the iocount above was dropped, the mount acquires
1207 // an implicit reference against the device.
1208 device_vnode = mp_devvp;
1209 }
1210 }
1211 }
1212 } else {
1213 printf("MOUNT-BY-ROLE (%d) failed - ROLE UNRECOGNIZED! (%d)", mount_role, error);
1214 error = EINVAL;
1215 }
1216 #else
1217 error = EINVAL;
1218 #endif
1219 } else {
1220 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1221 }
1222
1223 if (flags & MNT_UPDATE) {
1224 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1225 mp->mnt_flag &= ~MNT_RDONLY;
1226 }
1227 mp->mnt_flag &= ~
1228 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1229 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1230 if (error) {
1231 mp->mnt_flag = flag; /* restore flag value */
1232 }
1233 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1234 lck_rw_done(&mp->mnt_rwlock);
1235 is_rwlock_locked = FALSE;
1236 if (!error) {
1237 enablequotas(mp, ctx);
1238 }
1239 goto exit;
1240 }
1241
1242 /*
1243 * Put the new filesystem on the mount list after root.
1244 */
1245 if (error == 0) {
1246 struct vfs_attr vfsattr;
1247 #if CONFIG_MACF
1248 error = mac_mount_check_mount_late(ctx, mp);
1249 if (error != 0) {
1250 goto out4;
1251 }
1252
1253 if (vfs_flags(mp) & MNT_MULTILABEL) {
1254 error = VFS_ROOT(mp, &rvp, ctx);
1255 if (error) {
1256 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1257 goto out4;
1258 }
1259 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1260 /*
1261 * drop reference provided by VFS_ROOT
1262 */
1263 vnode_put(rvp);
1264
1265 if (error) {
1266 goto out4;
1267 }
1268 }
1269 #endif /* MAC */
1270
1271 vnode_lock_spin(vp);
1272 CLR(vp->v_flag, VMOUNT);
1273 vp->v_mountedhere = mp;
1274 vnode_unlock(vp);
1275
1276 /*
1277 * taking the name_cache_lock exclusively will
1278 * insure that everyone is out of the fast path who
1279 * might be trying to use a now stale copy of
1280 * vp->v_mountedhere->mnt_realrootvp
1281 * bumping mount_generation causes the cached values
1282 * to be invalidated
1283 */
1284 name_cache_lock();
1285 mount_generation++;
1286 name_cache_unlock();
1287
1288 error = vnode_ref(vp);
1289 if (error != 0) {
1290 goto out4;
1291 }
1292
1293 have_usecount = TRUE;
1294
1295 error = checkdirs(vp, ctx);
1296 if (error != 0) {
1297 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1298 goto out4;
1299 }
1300 /*
1301 * there is no cleanup code here so I have made it void
1302 * we need to revisit this
1303 */
1304 (void)VFS_START(mp, 0, ctx);
1305
1306 if (mount_list_add(mp) != 0) {
1307 /*
1308 * The system is shutting down trying to umount
1309 * everything, so fail with a plausible errno.
1310 */
1311 error = EBUSY;
1312 goto out4;
1313 }
1314 lck_rw_done(&mp->mnt_rwlock);
1315 is_rwlock_locked = FALSE;
1316
1317 /* Check if this mounted file system supports EAs or named streams. */
1318 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1319 VFSATTR_INIT(&vfsattr);
1320 VFSATTR_WANTED(&vfsattr, f_capabilities);
1321 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1322 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1323 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1324 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1325 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1326 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1327 }
1328 #if NAMEDSTREAMS
1329 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1330 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1331 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1332 }
1333 #endif
1334 /* Check if this file system supports path from id lookups. */
1335 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1336 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1337 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1338 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1339 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1340 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1341 }
1342
1343 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1344 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1345 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1346 }
1347 }
1348 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1349 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1350 }
1351 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1352 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1353 }
1354 /* increment the operations count */
1355 OSAddAtomic(1, &vfs_nummntops);
1356 enablequotas(mp, ctx);
1357
1358 if (device_vnode) {
1359 device_vnode->v_specflags |= SI_MOUNTEDON;
1360
1361 /*
1362 * cache the IO attributes for the underlying physical media...
1363 * an error return indicates the underlying driver doesn't
1364 * support all the queries necessary... however, reasonable
1365 * defaults will have been set, so no reason to bail or care
1366 */
1367 vfs_init_io_attributes(device_vnode, mp);
1368 }
1369
1370 /* Now that mount is setup, notify the listeners */
1371 vfs_notify_mount(pvp);
1372 IOBSDMountChange(mp, kIOMountChangeMount);
1373 } else {
1374 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1375 if (mp->mnt_vnodelist.tqh_first != NULL) {
1376 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1377 mp->mnt_vtable->vfc_name, error);
1378 }
1379
1380 vnode_lock_spin(vp);
1381 CLR(vp->v_flag, VMOUNT);
1382 vnode_unlock(vp);
1383 mount_list_lock();
1384 mp->mnt_vtable->vfc_refcount--;
1385 mount_list_unlock();
1386
1387 if (device_vnode) {
1388 vnode_rele(device_vnode);
1389 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1390 }
1391 lck_rw_done(&mp->mnt_rwlock);
1392 is_rwlock_locked = FALSE;
1393
1394 /*
1395 * if we get here, we have a mount structure that needs to be freed,
1396 * but since the coveredvp hasn't yet been updated to point at it,
1397 * no need to worry about other threads holding a crossref on this mp
1398 * so it's ok to just free it
1399 */
1400 mount_lock_destroy(mp);
1401 #if CONFIG_MACF
1402 mac_mount_label_destroy(mp);
1403 #endif
1404 zfree(mount_zone, mp);
1405 did_set_lmount = false;
1406 }
1407 exit:
1408 /*
1409 * drop I/O count on the device vp if there was one
1410 */
1411 if (devpath && devvp) {
1412 vnode_put(devvp);
1413 }
1414
1415 if (did_set_lmount) {
1416 mount_lock_spin(mp);
1417 mp->mnt_lflag &= ~MNT_LMOUNT;
1418 mount_unlock(mp);
1419 }
1420
1421 return error;
1422
1423 /* Error condition exits */
1424 out4:
1425 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1426
1427 /*
1428 * If the mount has been placed on the covered vp,
1429 * it may have been discovered by now, so we have
1430 * to treat this just like an unmount
1431 */
1432 mount_lock_spin(mp);
1433 mp->mnt_lflag |= MNT_LDEAD;
1434 mount_unlock(mp);
1435
1436 if (device_vnode != NULLVP) {
1437 vnode_rele(device_vnode);
1438 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1439 ctx);
1440 did_rele = TRUE;
1441 }
1442
1443 vnode_lock_spin(vp);
1444
1445 mp->mnt_crossref++;
1446 vp->v_mountedhere = (mount_t) 0;
1447
1448 vnode_unlock(vp);
1449
1450 if (have_usecount) {
1451 vnode_rele(vp);
1452 }
1453 out3:
1454 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1455 vnode_rele(devvp);
1456 }
1457 out2:
1458 if (devpath && devvp) {
1459 vnode_put(devvp);
1460 }
1461 out1:
1462 /* Release mnt_rwlock only when it was taken */
1463 if (is_rwlock_locked == TRUE) {
1464 if (flag_set) {
1465 mp->mnt_flag = flag; /* restore mnt_flag value */
1466 }
1467 lck_rw_done(&mp->mnt_rwlock);
1468 }
1469
1470 if (did_set_lmount) {
1471 mount_lock_spin(mp);
1472 mp->mnt_lflag &= ~MNT_LMOUNT;
1473 mount_unlock(mp);
1474 }
1475
1476 if (mntalloc) {
1477 if (mp->mnt_crossref) {
1478 mount_dropcrossref(mp, vp, 0);
1479 } else {
1480 mount_lock_destroy(mp);
1481 #if CONFIG_MACF
1482 mac_mount_label_destroy(mp);
1483 #endif
1484 zfree(mount_zone, mp);
1485 }
1486 }
1487 if (vfsp_ref) {
1488 mount_list_lock();
1489 vfsp->vfc_refcount--;
1490 mount_list_unlock();
1491 }
1492
1493 return error;
1494 }
1495
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT on the covered vnode to reserve it for this mount.
 *
 * Parameters:
 *	vp		candidate covered vnode (must be a directory)
 *	ctx		caller's VFS context (supplies credentials)
 *	cnp		pathname component, passed to the MAC mount check
 *	fsname		filesystem name, passed to the MAC mount check
 *	skip_auth	when TRUE, skip the directory-ownership check
 *			(kernel-initiated mounts)
 *
 * Returns: 0 with VMOUNT set on vp; EPERM, ENOTDIR, EBUSY, or an errno
 * from fsync / buffer invalidation / MAC on failure (VMOUNT not set).
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push dirty data for the soon-to-be-covered vnode to disk. */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* Toss stale buffers so nothing refers to vp while it is covered. */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Only directories can serve as mount points. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * A competing mount has already covered this vnode: both the
	 * in-progress marker (VMOUNT) and the covering mount
	 * (v_mountedhere) are set.
	 */
	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		goto out;
	}
#endif

	/* Reserve the vnode for this mount attempt. */
	vnode_lock_spin(vp);
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

out:
	return error;
}
1557
1558 #if CONFIG_IMGSRC_ACCESS
1559
1560 #define DEBUG_IMGSRC 0
1561
1562 #if DEBUG_IMGSRC
1563 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1564 #else
1565 #define IMGSRC_DEBUG(args...) do { } while(0)
1566 #endif
1567
1568 static int
1569 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1570 {
1571 struct nameidata nd;
1572 vnode_t vp, realdevvp;
1573 mode_t accessmode;
1574 int error;
1575 enum uio_seg uio = UIO_USERSPACE;
1576
1577 if (ctx == vfs_context_kernel()) {
1578 uio = UIO_SYSSPACE;
1579 }
1580
1581 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
1582 if ((error = namei(&nd))) {
1583 IMGSRC_DEBUG("namei() failed with %d\n", error);
1584 return error;
1585 }
1586
1587 vp = nd.ni_vp;
1588
1589 if (!vnode_isblk(vp)) {
1590 IMGSRC_DEBUG("Not block device.\n");
1591 error = ENOTBLK;
1592 goto out;
1593 }
1594
1595 realdevvp = mp->mnt_devvp;
1596 if (realdevvp == NULLVP) {
1597 IMGSRC_DEBUG("No device backs the mount.\n");
1598 error = ENXIO;
1599 goto out;
1600 }
1601
1602 error = vnode_getwithref(realdevvp);
1603 if (error != 0) {
1604 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1605 goto out;
1606 }
1607
1608 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1609 IMGSRC_DEBUG("Wrong dev_t.\n");
1610 error = ENXIO;
1611 goto out1;
1612 }
1613
1614 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1615
1616 /*
1617 * If mount by non-root, then verify that user has necessary
1618 * permissions on the device.
1619 */
1620 if (!vfs_context_issuser(ctx)) {
1621 accessmode = KAUTH_VNODE_READ_DATA;
1622 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1623 accessmode |= KAUTH_VNODE_WRITE_DATA;
1624 }
1625 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1626 IMGSRC_DEBUG("Access denied.\n");
1627 goto out1;
1628 }
1629 }
1630
1631 *devvpp = vp;
1632
1633 out1:
1634 vnode_put(realdevvp);
1635
1636 out:
1637 nameidone(&nd);
1638
1639 if (error) {
1640 vnode_put(vp);
1641 }
1642
1643 return error;
1644 }
1645
/*
 * Clear VMOUNT, set v_mountedhere and mnt_vnodecovered, ref the vnode,
 * and call checkdirs().
 *
 * On success vp holds a usecount on behalf of the mount and covers mp.
 * On failure mnt_vnodecovered is reset to NULLVP; the checkdirs()
 * failure path also drops the usecount taken here.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Publish the covering mount and drop the in-progress marker. */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Persistent usecount on the covered vnode for the mount's lifetime. */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1695
/*
 * Undo place_mount_and_checkdirs(): drop the usecount taken on the
 * covered vnode, clear its v_mountedhere, and detach mp from it.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1706
/*
 * Prepare a mount for update: fail if an unmount or another mount is in
 * progress, take the mount rwlock exclusively, then authorize the
 * update (MNT_RELOAD only on read-only mounts; only root or the user
 * who originally mounted; MAC remount check).
 *
 * On success the rwlock is held exclusively and the caller must release
 * it via mount_end_update().  On failure the rwlock is dropped here.
 */
static int
mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
{
	int error;

	/* unmount in progress return error */
	mount_lock_spin(mp);
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		mount_unlock(mp);
		return EBUSY;
	}
	mount_unlock(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/*
	 * We only allow the filesystem to be reloaded if it
	 * is currently mounted read-only.
	 */
	if ((flags & MNT_RELOAD) &&
	    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to update it.
	 */
	if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
	    (!vfs_context_issuser(ctx))) {
		error = EPERM;
		goto out;
	}
#if CONFIG_MACF
	error = mac_mount_check_remount(ctx, mp);
	if (error != 0) {
		goto out;
	}
#endif

out:
	if (error) {
		lck_rw_done(&mp->mnt_rwlock);
	}

	return error;
}
1754
/*
 * Release the exclusive mount rwlock taken by mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1760
1761 static int
1762 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1763 {
1764 vnode_t vp;
1765
1766 if (height >= MAX_IMAGEBOOT_NESTING) {
1767 return EINVAL;
1768 }
1769
1770 vp = imgsrc_rootvnodes[height];
1771 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1772 *rvpp = vp;
1773 return 0;
1774 } else {
1775 return ENOENT;
1776 }
1777 }
1778
1779 static int
1780 relocate_imageboot_source(vnode_t pvp, vnode_t vp,
1781 struct componentname *cnp, const char *fsname, vfs_context_t ctx,
1782 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1783 {
1784 int error;
1785 mount_t mp;
1786 boolean_t placed = FALSE;
1787 struct vfstable *vfsp;
1788 user_addr_t devpath;
1789 char *old_mntonname;
1790 vnode_t rvp;
1791 vnode_t devvp;
1792 uint32_t height;
1793 uint32_t flags;
1794
1795 /* If we didn't imageboot, nothing to move */
1796 if (imgsrc_rootvnodes[0] == NULLVP) {
1797 return EINVAL;
1798 }
1799
1800 /* Only root can do this */
1801 if (!vfs_context_issuser(ctx)) {
1802 return EPERM;
1803 }
1804
1805 IMGSRC_DEBUG("looking for root vnode.\n");
1806
1807 /*
1808 * Get root vnode of filesystem we're moving.
1809 */
1810 if (by_index) {
1811 if (is64bit) {
1812 struct user64_mnt_imgsrc_args mia64;
1813 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1814 if (error != 0) {
1815 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1816 return error;
1817 }
1818
1819 height = mia64.mi_height;
1820 flags = mia64.mi_flags;
1821 devpath = (user_addr_t)mia64.mi_devpath;
1822 } else {
1823 struct user32_mnt_imgsrc_args mia32;
1824 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1825 if (error != 0) {
1826 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1827 return error;
1828 }
1829
1830 height = mia32.mi_height;
1831 flags = mia32.mi_flags;
1832 devpath = mia32.mi_devpath;
1833 }
1834 } else {
1835 /*
1836 * For binary compatibility--assumes one level of nesting.
1837 */
1838 if (is64bit) {
1839 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
1840 return error;
1841 }
1842 } else {
1843 user32_addr_t tmp;
1844 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
1845 return error;
1846 }
1847
1848 /* munge into LP64 addr */
1849 devpath = CAST_USER_ADDR_T(tmp);
1850 }
1851
1852 height = 0;
1853 flags = 0;
1854 }
1855
1856 if (flags != 0) {
1857 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1858 return EINVAL;
1859 }
1860
1861 error = get_imgsrc_rootvnode(height, &rvp);
1862 if (error != 0) {
1863 IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
1864 return error;
1865 }
1866
1867 IMGSRC_DEBUG("got old root vnode\n");
1868
1869 old_mntonname = zalloc_flags(ZV_NAMEI, Z_WAITOK);
1870
1871 /* Can only move once */
1872 mp = vnode_mount(rvp);
1873 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1874 IMGSRC_DEBUG("Already moved.\n");
1875 error = EBUSY;
1876 goto out0;
1877 }
1878
1879 IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
1880 IMGSRC_DEBUG("Starting updated.\n");
1881
1882 /* Get exclusive rwlock on mount, authorize update on mp */
1883 error = mount_begin_update(mp, ctx, 0);
1884 if (error != 0) {
1885 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1886 goto out0;
1887 }
1888
1889 /*
1890 * It can only be moved once. Flag is set under the rwlock,
1891 * so we're now safe to proceed.
1892 */
1893 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1894 IMGSRC_DEBUG("Already moved [2]\n");
1895 goto out1;
1896 }
1897
1898 IMGSRC_DEBUG("Preparing coveredvp.\n");
1899
1900 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1901 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1902 if (error != 0) {
1903 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1904 goto out1;
1905 }
1906
1907 IMGSRC_DEBUG("Covered vp OK.\n");
1908
1909 /* Sanity check the name caller has provided */
1910 vfsp = mp->mnt_vtable;
1911 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1912 IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
1913 vfsp->vfc_name, fsname);
1914 error = EINVAL;
1915 goto out2;
1916 }
1917
1918 /* Check the device vnode and update mount-from name, for local filesystems */
1919 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1920 IMGSRC_DEBUG("Local, doing device validation.\n");
1921
1922 if (devpath != USER_ADDR_NULL) {
1923 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1924 if (error) {
1925 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1926 goto out2;
1927 }
1928
1929 vnode_put(devvp);
1930 }
1931 }
1932
1933 /*
1934 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1935 * and increment the name cache's mount generation
1936 */
1937
1938 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1939 error = place_mount_and_checkdirs(mp, vp, ctx);
1940 if (error != 0) {
1941 goto out2;
1942 }
1943
1944 placed = TRUE;
1945
1946 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1947 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1948
1949 /* Forbid future moves */
1950 mount_lock(mp);
1951 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1952 mount_unlock(mp);
1953
1954 /* Finally, add to mount list, completely ready to go */
1955 if (mount_list_add(mp) != 0) {
1956 /*
1957 * The system is shutting down trying to umount
1958 * everything, so fail with a plausible errno.
1959 */
1960 error = EBUSY;
1961 goto out3;
1962 }
1963
1964 mount_end_update(mp);
1965 vnode_put(rvp);
1966 zfree(ZV_NAMEI, old_mntonname);
1967
1968 vfs_notify_mount(pvp);
1969
1970 return 0;
1971 out3:
1972 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1973
1974 mount_lock(mp);
1975 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1976 mount_unlock(mp);
1977
1978 out2:
1979 /*
1980 * Placing the mp on the vnode clears VMOUNT,
1981 * so cleanup is different after that point
1982 */
1983 if (placed) {
1984 /* Rele the vp, clear VMOUNT and v_mountedhere */
1985 undo_place_on_covered_vp(mp, vp);
1986 } else {
1987 vnode_lock_spin(vp);
1988 CLR(vp->v_flag, VMOUNT);
1989 vnode_unlock(vp);
1990 }
1991 out1:
1992 mount_end_update(mp);
1993
1994 out0:
1995 vnode_put(rvp);
1996 zfree(ZV_NAMEI, old_mntonname);
1997 return error;
1998 }
1999
2000 #if CONFIG_LOCKERBOOT
/*
 * Kernel-internal entry point for locker boot: look up `mntpoint` and
 * relocate the imageboot source filesystem `fsname` onto it, using
 * `pbdevpath` as the backing protoboot device path.
 *
 * The imgsrc args live on the kernel stack and are passed through
 * CAST_USER_ADDR_T under the kernel vfs context, so the copyin() in
 * relocate_imageboot_source() reads kernel memory (uio seg is switched
 * to UIO_SYSSPACE downstream for the device path lookup).
 */
__private_extern__
int
mount_locker_protoboot(const char *fsname, const char *mntpoint,
    const char *pbdevpath)
{
	int error = -1;
	struct nameidata nd;
	boolean_t cleanup_nd = FALSE;
	vfs_context_t ctx = vfs_context_kernel();
	boolean_t is64 = TRUE;
	boolean_t by_index = TRUE;
	struct user64_mnt_imgsrc_args mia64 = {
		.mi_height = 0,
		.mi_flags = 0,
		.mi_devpath = CAST_USER_ADDR_T(pbdevpath),
	};
	user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);

	/* WANTPARENT: relocate_imageboot_source needs both dvp and vp. */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
	error = namei(&nd);
	if (error) {
		IMGSRC_DEBUG("namei: %d\n", error);
		goto out;
	}

	cleanup_nd = TRUE;
	error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
	    &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);

out:
	if (cleanup_nd) {
		/* Preserve the relocation result across the vnode_put() calls. */
		int stashed = error;

		error = vnode_put(nd.ni_vp);
		if (error) {
			panic("vnode_put() returned non-zero: %d", error);
		}

		if (nd.ni_dvp) {
			error = vnode_put(nd.ni_dvp);
			if (error) {
				panic("vnode_put() returned non-zero: %d", error);
			}
		}
		nameidone(&nd);

		error = stashed;
	}
	return error;
}
2052 #endif /* CONFIG_LOCKERBOOT */
2053 #endif /* CONFIG_IMGSRC_ACCESS */
2054
/*
 * Enable disk quotas on a freshly (re)mounted HFS filesystem.
 *
 * For each quota type, the presence of an option file
 * (QUOTAOPSNAME.<type>) in the mount's root directory triggers a
 * Q_QUOTAON quotactl with the corresponding quota data file
 * (QUOTAFILENAME.<type>).  All errors are deliberately ignored:
 * quota setup must not interfere with completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
2088
2089
/*
 * Per-process callback for checkdirs(): if this process's current or
 * root directory is the just-covered vnode (olddp), retarget it to the
 * root of the newly mounted filesystem (newdp), moving usecounts
 * accordingly.  Always returns PROC_RETURNED so iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs * cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp;
	/* new_* start as "unused"; NULLed out when actually installed. */
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	/* old_* record which old vnodes were displaced and need a rele. */
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work.  If not, we will get out fast.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp == NULL ||
	    (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp)) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work.  Always take two refs
	 * because we might need that many.  We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	proc_dirs_lock_exclusive(p);
	/*
	 * Now do the work.  Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp != NULL) {
		if (fdp->fd_cdir == olddp) {
			old_cvp = olddp;
			fdp->fd_cdir = newdp;
			new_cvp = NULL;		/* this ref was consumed */
		}
		if (fdp->fd_rdir == olddp) {
			old_rvp = olddp;
			fdp->fd_rdir = newdp;
			new_rvp = NULL;		/* this ref was consumed */
		}
	}
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2174
2175
2176
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point, and update
 * the global rootvnode if the covered vnode was the system root.
 *
 * Returns 0 on success (or when no process could be using olddp),
 * or the error from VFS_ROOT() if the new mount's root vnode cannot
 * be obtained.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Usecount of 1 means only our caller references olddp: no work. */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the root vnode of the file system just mounted over olddp. */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/* If olddp was the system root, swing rootvnode over to newdp. */
	if (rootvnode == olddp) {
		vnode_ref(newdp);	/* take the long-term ref before publishing */
		lck_rw_lock_exclusive(&rootvnode_rw_lock);
		tvp = rootvnode;
		rootvnode = newdp;
		lck_rw_unlock_exclusive(&rootvnode_rw_lock);
		vnode_rele(tvp);	/* drop the ref held by the old rootvnode */
	}

	vnode_put(newdp);	/* drop the iocount from VFS_ROOT() */
	return 0;
}
2219
2220 /*
2221 * Unmount a file system.
2222 *
2223 * Note: unmount takes a path to the vnode mounted on as argument,
2224 * not special file (as before).
2225 */
2226 /* ARGSUSED */
2227 int
2228 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
2229 {
2230 vnode_t vp;
2231 struct mount *mp;
2232 int error;
2233 struct nameidata nd;
2234 vfs_context_t ctx = vfs_context_current();
2235
2236 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
2237 UIO_USERSPACE, uap->path, ctx);
2238 error = namei(&nd);
2239 if (error) {
2240 return error;
2241 }
2242 vp = nd.ni_vp;
2243 mp = vp->v_mount;
2244 nameidone(&nd);
2245
2246 #if CONFIG_MACF
2247 error = mac_mount_check_umount(ctx, mp);
2248 if (error != 0) {
2249 vnode_put(vp);
2250 return error;
2251 }
2252 #endif
2253 /*
2254 * Must be the root of the filesystem
2255 */
2256 if ((vp->v_flag & VROOT) == 0) {
2257 vnode_put(vp);
2258 return EINVAL;
2259 }
2260 mount_ref(mp, 0);
2261 vnode_put(vp);
2262 /* safedounmount consumes the mount ref */
2263 return safedounmount(mp, uap->flags, ctx);
2264 }
2265
2266 int
2267 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2268 {
2269 mount_t mp;
2270
2271 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2272 if (mp == (mount_t)0) {
2273 return ENOENT;
2274 }
2275 mount_ref(mp, 0);
2276 mount_iterdrop(mp);
2277 /* safedounmount consumes the mount ref */
2278 return safedounmount(mp, flags, ctx);
2279 }
2280
/* Entitlement that lets non-root processes unmount volumes mounted by others. */
#define ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT \
	"com.apple.private.vfs.role-account-unmount"

/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * Performs policy checks (responsiveness, ownership/entitlement, root and
 * system volumes, imageboot backing store) before handing off to
 * dounmount().  On any early failure the mount ref is dropped here;
 * on success dounmount() takes over ownership of the ref.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization in two cases:
	 * - If the process running the unmount has ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT.
	 *   This entitlement allows non-root processes unmount volumes mounted by
	 *   other processes.
	 * - If the mount is tagged as permissive and this is not a forced-unmount
	 *   attempt.
	 */
	if (!IOTaskHasEntitlement(current_task(), ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) &&
	    (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0)))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system, or other volumes
	 * associated with it (for example, the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		/* Log only for associated system volumes, not the root itself. */
		if (!(mp->mnt_flag & MNT_ROOTFS)) {
			printf("attempt to unmount a system mount (%s), will return EBUSY\n",
			    mp->mnt_vfsstat.f_mntonname);
		}
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/*
	 * If the mount is providing the root filesystem's disk image
	 * (i.e. imageboot), don't allow unmounting
	 */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* All checks passed: dounmount() consumes the mount ref. */
	return dounmount(mp, flags, 1, ctx);

out:
	mount_drop(mp, 0);
	return error;
}
2351
/*
 * Do the actual file system unmount.
 *
 * Parameters:
 *	mp	mount to unmount
 *	flags	MNT_FORCE / MNT_NOBLOCK / MNT_LNOSUB etc.
 *	withref	non-zero if the caller passed in a mount ref to consume
 *	ctx	calling context
 *
 * Sequence: mark the mount as unmounting (MNTK_UNMOUNT/MNT_LUNMOUNT),
 * optionally force-unmount submounts, flush vnodes, call VFS_UNMOUNT(),
 * then tear down the covered-vnode linkage and the mount structure.
 * On failure the in-progress flags are cleared and the error returned.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & (MNT_LUNMOUNT | MNT_LMOUNT)) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Keep the caller from hanging on unresponsive remote file systems. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			/* A non-forced unmount must successfully sync first. */
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	/* Flush all vnodes on this mount (forcibly when FORCECLOSE is set). */
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure there are no one in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* Let iterators back in and clear the in-progress state. */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	/* Drop/reacquire the rwlock around removing mp from the mount list. */
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* Common exit: mount lock is held here on both success and failure. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* Notify watchers of the parent directory. */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* Root mounts have no covered vnode: free mp directly. */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			zfree(mount_zone, mp);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2639
/*
 * Unmount any mounts in this filesystem.
 *
 * Collects the fsids of every mount nested (directly or transitively)
 * under 'mp' into a temporary array, then unmounts them in reverse
 * (deepest-first) order.  Errors from the individual unmounts are
 * deliberately ignored: failed submounts are left dangling.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;	/* m indexes the last fsid recorded */
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* Z_NOWAIT because we hold mount_list_lock; bail if it fails. */
	fsids = kheap_alloc(KHEAP_TEMP, fsids_sz, Z_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* Swap the iteration hold for a mount ref, consumed by dounmount(). */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	kheap_free(KHEAP_TEMP, fsids, fsids_sz);
}
2700
/*
 * Drop one cross reference on 'mp' taken against covered vnode 'dp'.
 *
 * When the last crossref goes away and 'dp' no longer points at this
 * mount, the mount structure itself is destroyed and freed.  If
 * 'need_put' is set, the caller's iocount on 'dp' is released here
 * (while the vnode lock is still held).
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		/* Unlock before tearing down the mount: mp is unreachable now. */
		vnode_unlock(dp);

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		zfree(mount_zone, mp);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_unlock(dp);
}
2729
2730
2731 /*
2732 * Sync each mounted filesystem.
2733 */
#if DIAGNOSTIC
int syncprt = 0;        /* when set, dump buffer stats after a sync (see sync()) */
#endif

int print_vmpage_stat = 0;      /* when set, log dirty-page counts after a sync */
2739
2740 /*
2741 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2742 * mounted read-write with the passed waitfor value.
2743 *
2744 * Parameters: mp mount-point descriptor per mounted file-system instance.
2745 * arg user argument (please see below)
2746 *
2747 * User argument is a pointer to 32 bit unsigned integer which describes the
2748 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
2749 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2750 * waitfor value.
2751 *
2752 * Returns: VFS_RETURNED
2753 */
2754 static int
2755 sync_callback(mount_t mp, void *arg)
2756 {
2757 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2758 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2759 unsigned waitfor = MNT_NOWAIT;
2760
2761 if (arg) {
2762 waitfor = *(uint32_t*)arg;
2763 }
2764
2765 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
2766 if (waitfor != MNT_WAIT &&
2767 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2768 waitfor != MNT_NOWAIT &&
2769 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2770 waitfor != MNT_DWAIT &&
2771 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2772 panic("Passed inappropriate waitfor %u to "
2773 "sync_callback()", waitfor);
2774 }
2775
2776 mp->mnt_flag &= ~MNT_ASYNC;
2777 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2778 if (asyncflag) {
2779 mp->mnt_flag |= MNT_ASYNC;
2780 }
2781 }
2782
2783 return VFS_RETURNED;
2784 }
2785
2786 /* ARGSUSED */
2787 int
2788 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2789 {
2790 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2791
2792 if (print_vmpage_stat) {
2793 vm_countdirtypages();
2794 }
2795
2796 #if DIAGNOSTIC
2797 if (syncprt) {
2798 vfs_bufstats();
2799 }
2800 #endif /* DIAGNOSTIC */
2801 return 0;
2802 }
2803
/*
 * Media-selection argument for sync_internal_callback(): lets the sync
 * thread flush reliable (local, non-virtual-device) media separately
 * from unreliable media.
 */
typedef enum {
	SYNC_ALL = 0,
	SYNC_ONLY_RELIABLE_MEDIA = 1,
	SYNC_ONLY_UNRELIABLE_MEDIA = 2
} sync_type_t;
2809
2810 static int
2811 sync_internal_callback(mount_t mp, void *arg)
2812 {
2813 if (arg) {
2814 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2815 (mp->mnt_flag & MNT_LOCAL);
2816 sync_type_t sync_type = *((sync_type_t *)arg);
2817
2818 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2819 return VFS_RETURNED;
2820 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2821 return VFS_RETURNED;
2822 }
2823 }
2824
2825 (void)sync_callback(mp, NULL);
2826
2827 return VFS_RETURNED;
2828 }
2829
/* State shared between sync_internal() and sync_thread(); protected by sync_mtx_lck. */
int sync_thread_state = 0;
/* Upper bound (seconds) on how long sync_internal() waits for the sync thread. */
int sync_timeout_seconds = 5;

#define SYNC_THREAD_RUN         0x0001  /* work is pending for sync_thread */
#define SYNC_THREAD_RUNNING     0x0002  /* a sync_thread instance is active */

#if CONFIG_PHYS_WRITE_ACCT
thread_t pm_sync_thread;        /* identifies the running sync thread */
#endif /* CONFIG_PHYS_WRITE_ACCT */
2839
/*
 * Kernel thread started by sync_internal(): repeatedly syncs all mounts
 * (reliable media first, then unreliable) as long as SYNC_THREAD_RUN
 * keeps being re-posted, then wakes any waiters and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = current_thread();
#endif /* CONFIG_PHYS_WRITE_ACCT */

	lck_mtx_lock(&sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		/* Drop the lock while doing the (possibly slow) sync passes. */
		lck_mtx_unlock(&sync_mtx_lck);

		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(&sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
#if CONFIG_PHYS_WRITE_ACCT
	pm_sync_thread = NULL;
#endif /* CONFIG_PHYS_WRITE_ACCT */
	lck_mtx_unlock(&sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2883
/* Last time a sync timeout was logged; rate-limits the message in sync_internal(). */
struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2885
/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout seconds.
 *
 * Posts SYNC_THREAD_RUN and, if no sync thread is active, starts one;
 * then waits (bounded by sync_timeout_seconds) for it to finish the
 * pass.  Always returns 0 — a timeout is only logged, not reported.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(&sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		/* Claim RUNNING before dropping the lock inside thread start. */
		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(&sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP releases sync_mtx_lck when msleep returns. */
	error = msleep((caddr_t)&sync_thread_state, &sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		/* Log at most once every two minutes. */
		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2932
2933 /*
2934 * Change filesystem quotas.
2935 */
2936 #if QUOTA
/*
 * quotactl: manipulate file system quotas.
 *
 * Resolves uap->path to a mount, copies in any subcommand-specific input
 * (quota file path for Q_QUOTAON, dqblk for Q_SETQUOTA/Q_SETUSE), calls
 * VFS_QUOTACTL(), then copies out results for Q_GETQUOTA/Q_QUOTASTAT.
 */
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* Hold the mount with a ref; the vnode itself is no longer needed. */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = zalloc(ZV_NAMEI);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit user dqblk must be munged to the kernel layout. */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* Copy out results / free temporary buffers per subcommand. */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			zfree(ZV_NAMEI, datap);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

	mount_drop(mp, 0);
	return error;
}
3035 #else
/* Quota support compiled out: the syscall always fails with EOPNOTSUPP. */
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
3041 #endif /* QUOTA */
3042
3043 /*
3044 * Get filesystem statistics.
3045 *
3046 * Returns: 0 Success
3047 * namei:???
3048 * vfs_update_vfsstat:???
3049 * munge_statfs:EFAULT
3050 */
3051 /* ARGSUSED */
3052 int
3053 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
3054 {
3055 struct mount *mp;
3056 struct vfsstatfs *sp;
3057 int error;
3058 struct nameidata nd;
3059 vfs_context_t ctx = vfs_context_current();
3060 vnode_t vp;
3061
3062 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
3063 UIO_USERSPACE, uap->path, ctx);
3064 error = namei(&nd);
3065 if (error != 0) {
3066 return error;
3067 }
3068 vp = nd.ni_vp;
3069 mp = vp->v_mount;
3070 sp = &mp->mnt_vfsstat;
3071 nameidone(&nd);
3072
3073 #if CONFIG_MACF
3074 error = mac_mount_check_stat(ctx, mp);
3075 if (error != 0) {
3076 vnode_put(vp);
3077 return error;
3078 }
3079 #endif
3080
3081 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
3082 if (error != 0) {
3083 vnode_put(vp);
3084 return error;
3085 }
3086
3087 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
3088 vnode_put(vp);
3089 return error;
3090 }
3091
3092 /*
3093 * Get filesystem statistics.
3094 */
3095 /* ARGSUSED */
/*
 * fstatfs: get filesystem statistics for the file system containing the
 * vnode referenced by file descriptor uap->fd.  Copies a (32/64-bit
 * munged) struct statfs to uap->buf.
 */
int
fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* Refresh cached statistics before copying them out. */
	sp = &mp->mnt_vfsstat;
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);

out:
	/* Release the fd hold and the vnode iocount taken above. */
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3144
3145 void
3146 vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
3147 {
3148 struct vfsstatfs *vsfs = &mp->mnt_vfsstat;
3149
3150 bzero(sfs, sizeof(*sfs));
3151
3152 sfs->f_bsize = vsfs->f_bsize;
3153 sfs->f_iosize = (int32_t)vsfs->f_iosize;
3154 sfs->f_blocks = vsfs->f_blocks;
3155 sfs->f_bfree = vsfs->f_bfree;
3156 sfs->f_bavail = vsfs->f_bavail;
3157 sfs->f_files = vsfs->f_files;
3158 sfs->f_ffree = vsfs->f_ffree;
3159 sfs->f_fsid = vsfs->f_fsid;
3160 sfs->f_owner = vsfs->f_owner;
3161 sfs->f_type = mp->mnt_vtable->vfc_typenum;
3162 sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3163 sfs->f_fssubtype = vsfs->f_fssubtype;
3164 sfs->f_flags_ext = (mp->mnt_kern_flag & MNTK_SYSTEMDATA) ? MNT_EXT_ROOT_DATA_VOL : 0;
3165 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3166 strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
3167 } else {
3168 strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
3169 }
3170 strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
3171 strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
3172 }
3173
3174 /*
3175 * Get file system statistics in 64-bit mode
3176 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata *ndp;
	struct statfs64 *sfsp;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;
	/*
	 * nameidata and statfs64 are both large; overlay them in a single
	 * temporary allocation.  Safe because the nameidata is finished
	 * (nameidone) before the statfs64 part is written.
	 */
	union {
		struct nameidata nd;
		struct statfs64 sfs;
	} *__nameidata_statfs64;

	__nameidata_statfs64 = kheap_alloc(KHEAP_TEMP, sizeof(*__nameidata_statfs64),
	    Z_WAITOK);
	ndp = &__nameidata_statfs64->nd;

	NDINIT(ndp, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(ndp);
	if (error != 0) {
		goto out;
	}
	vp = ndp->ni_vp;
	mp = vp->v_mount;
	nameidone(ndp);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}
#endif

	/* Refresh cached statistics before copying them out. */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		goto out;
	}

	sfsp = &__nameidata_statfs64->sfs;
	vfs_get_statfs64(mp, sfsp);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a seperate data volume mountpoint */
		strlcpy(&sfsp->f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(sfsp, uap->buf, sizeof(*sfsp));
	vnode_put(vp);

out:
	kheap_free(KHEAP_TEMP, __nameidata_statfs64, sizeof(*__nameidata_statfs64));

	return error;
}
3234
3235 /*
3236 * Get file system statistics in 64-bit mode
3237 */
/*
 * fstatfs64: 64-bit statfs for the file system containing the vnode
 * referenced by file descriptor uap->fd.
 */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct statfs64 sfs;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* Refresh cached statistics before copying them out. */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEMDATA) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a seperate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));

out:
	/* Release the fd hold and the vnode iocount taken above. */
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3291
/* Accumulator passed to getfsstat_callback() during vfs_iterate(). */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer cursor for statfs records */
	user_addr_t *mp;        /* optional per-mount MAC label destinations */
	int count;              /* number of mounts seen so far */
	int maxcount;           /* capacity of the user buffer, in entries */
	int flags;              /* caller's MNT_WAIT/MNT_NOWAIT/MNT_DWAIT flags */
	int error;              /* first error encountered, if any */
};
3300
3301
/*
 * Per-mount callback for __mac_getfsstat(): copy one mount's statistics
 * (and optionally its MAC label) out to the user buffer tracked in
 * 'arg', advancing the cursor.  Returns VFS_RETURNED_DONE to stop the
 * iteration on a hard error, VFS_RETURNED otherwise.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	/* Only copy out while there is buffer space; count is updated regardless. */
	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			/* Skip dead/failing mounts but keep iterating. */
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3355
3356 /*
3357 * Get statistics on all filesystems.
3358 */
3359 int
3360 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3361 {
3362 struct __mac_getfsstat_args muap;
3363
3364 muap.buf = uap->buf;
3365 muap.bufsize = uap->bufsize;
3366 muap.mac = USER_ADDR_NULL;
3367 muap.macsize = 0;
3368 muap.flags = uap->flags;
3369
3370 return __mac_getfsstat(p, &muap, retval);
3371 }
3372
/*
 * __mac_getfsstat: Get MAC-related file system statistics
 *
 * Parameters:    p                        (ignored)
 *                uap                      User argument descriptor (see below)
 *                retval                   Count of file system statistics (N stats)
 *
 * Indirect:      uap->bufsize             Buffer size
 *                uap->macsize             MAC info size
 *                uap->buf                 Buffer where information will be returned
 *                uap->mac                 MAC info
 *                uap->flags               File system flags
 *
 *
 * Returns:       0                        Success
 *                !0                       Not success
 *
 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* Reject user-supplied sizes too large to reason about safely. */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* Buffer capacity in entries depends on the caller's statfs layout. */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* The MAC-label pointer array must have one slot per statfs slot. */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		mp0 = kheap_alloc(KHEAP_TEMP, macsize, Z_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			kheap_free(KHEAP_TEMP, mp0, macsize);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		mp = kheap_alloc(KHEAP_TEMP, count * sizeof(user_addr_t), Z_WAITOK);
		if (mp == NULL) {
			kheap_free(KHEAP_TEMP, mp0, macsize);
			return ENOMEM;
		}

		/* Widen 32-bit user pointers; 64-bit ones are already user_addr_t sized. */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		kheap_free(KHEAP_TEMP, mp0, macsize);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = (int)maxcount;


	/* Walk every mount, including ones in the middle of unmounting. */
	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		kheap_free(KHEAP_TEMP, mp, count * sizeof(user_addr_t));
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* If the buffer filled up, report capacity; otherwise the true count. */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3484
3485 static int
3486 getfsstat64_callback(mount_t mp, void * arg)
3487 {
3488 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3489 struct vfsstatfs *sp;
3490 struct statfs64 sfs;
3491 int error;
3492
3493 if (fstp->sfsp && fstp->count < fstp->maxcount) {
3494 #if CONFIG_MACF
3495 error = mac_mount_check_stat(vfs_context_current(), mp);
3496 if (error != 0) {
3497 fstp->error = error;
3498 return VFS_RETURNED_DONE;
3499 }
3500 #endif
3501 sp = &mp->mnt_vfsstat;
3502 /*
3503 * If MNT_NOWAIT is specified, do not refresh the fsstat
3504 * cache. MNT_WAIT overrides MNT_NOWAIT.
3505 *
3506 * We treat MNT_DWAIT as MNT_WAIT for all instances of
3507 * getfsstat, since the constants are out of the same
3508 * namespace.
3509 */
3510 if ((mp->mnt_lflag & MNT_LDEAD) ||
3511 ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3512 (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
3513 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
3514 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3515 return VFS_RETURNED;
3516 }
3517
3518 vfs_get_statfs64(mp, &sfs);
3519 error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
3520 if (error) {
3521 fstp->error = error;
3522 return VFS_RETURNED_DONE;
3523 }
3524 fstp->sfsp += sizeof(sfs);
3525 }
3526 fstp->count++;
3527 return VFS_RETURNED;
3528 }
3529
3530 /*
3531 * Get statistics on all file systems in 64 bit mode.
3532 */
3533 int
3534 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3535 {
3536 user_addr_t sfsp;
3537 int count, maxcount;
3538 struct getfsstat_struct fst;
3539
3540 maxcount = uap->bufsize / sizeof(struct statfs64);
3541
3542 sfsp = uap->buf;
3543 count = 0;
3544
3545 fst.sfsp = sfsp;
3546 fst.flags = uap->flags;
3547 fst.count = 0;
3548 fst.error = 0;
3549 fst.maxcount = maxcount;
3550
3551 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3552
3553 if (fst.error) {
3554 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3555 return fst.error;
3556 }
3557
3558 if (fst.sfsp && fst.count > fst.maxcount) {
3559 *retval = fst.maxcount;
3560 } else {
3561 *retval = fst.count;
3562 }
3563
3564 return 0;
3565 }
3566
3567 /*
3568 * gets the associated vnode with the file descriptor passed.
3569 * as input
3570 *
3571 * INPUT
3572 * ctx - vfs context of caller
3573 * fd - file descriptor for which vnode is required.
3574 * vpp - Pointer to pointer to vnode to be returned.
3575 *
3576 * The vnode is returned with an iocount so any vnode obtained
3577 * by this call needs a vnode_put
3578 *
3579 */
3580 int
3581 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3582 {
3583 int error;
3584 vnode_t vp;
3585 struct fileproc *fp;
3586 proc_t p = vfs_context_proc(ctx);
3587
3588 *vpp = NULLVP;
3589
3590 error = fp_getfvp(p, fd, &fp, &vp);
3591 if (error) {
3592 return error;
3593 }
3594
3595 error = vnode_getwithref(vp);
3596 if (error) {
3597 (void)fp_drop(p, fd, fp, 0);
3598 return error;
3599 }
3600
3601 (void)fp_drop(p, fd, fp, 0);
3602 *vpp = vp;
3603 return error;
3604 }
3605
/*
 * Wrapper function around namei to start lookup from a directory
 * specified by a file descriptor ni_dirfd.
 *
 * In addition to all the errors returned by namei, this call can
 * return ENOTDIR if the file descriptor does not refer to a directory.
 * and EBADF if the file descriptor is not valid.
 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only anchor the lookup at dirfd when a real fd was given, this is
	 * not a continued lookup, and the caller hasn't already supplied a
	 * starting directory vnode.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Absolute paths ignore dirfd (per POSIX *at() semantics). */
		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Hand namei the starting directory via USEDVP; the
			 * flag is cleared again so the nameidata can be
			 * reused, and the iocount from vnode_getfromfd is
			 * dropped only after namei is done with dvp_at.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	return namei(ndp);
}
3657
/*
 * Change current working directory to a given file descriptor.
 *
 * Shared by fchdir(2) (per_thread == 0, changes fd_cdir) and
 * __pthread_fchdir(2) (per_thread != 0, changes the calling thread's
 * uu_cdir).  The new directory ends up holding a usecount (vnode_ref);
 * the previous one is released with vnode_rele.
 */
/* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;
	struct mount *mp;
	int error, should_put = 1;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the target directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If something is mounted on the directory, descend to the root of
	 * the covering filesystem (repeatedly, for stacked mounts).
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Convert our transient iocount into a long-lived usecount. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);
	should_put = 0;

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(vp);
			error = ENOENT;
			goto out;
		}
	} else {
		/* Swap fd_cdir under the same locks namei uses to read it. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

out:
	if (should_put) {
		vnode_put(vp);
	}
	file_drop(uap->fd);

	return error;
}
3775
3776 int
3777 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3778 {
3779 return common_fchdir(p, uap, 0);
3780 }
3781
3782 int
3783 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3784 {
3785 return common_fchdir(p, (void *)uap, 1);
3786 }
3787
3788
/*
 * Change current working directory (".").
 *
 * Looks up the directory described by ndp, takes a usecount on it, and
 * installs it as either the process cwd (fd_cdir) or the calling thread's
 * private cwd (uu_cdir), releasing the previous one.
 *
 * Returns: 0 Success
 * change_dir:ENOTDIR
 * change_dir:???
 * vnode_ref:ENOENT No such file or directory
 */
/* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	vnode_t tvp;

	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Keep the directory alive long-term via a usecount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		/* Swap fd_cdir under the same locks namei uses to read it. */
		proc_dirs_lock_exclusive(p);
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
		proc_dirs_unlock_exclusive(p);
	}

	/* Release the usecount on the previous working directory, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3844
3845
3846 /*
3847 * Change current working directory (".").
3848 *
3849 * Returns: 0 Success
3850 * chdir_internal:ENOTDIR
3851 * chdir_internal:ENOENT No such file or directory
3852 * chdir_internal:???
3853 */
3854 /* ARGSUSED */
3855 static int
3856 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3857 {
3858 struct nameidata nd;
3859 vfs_context_t ctx = vfs_context_current();
3860
3861 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3862 UIO_USERSPACE, uap->path, ctx);
3863
3864 return chdir_internal(p, ctx, &nd, per_thread);
3865 }
3866
3867
3868 /*
3869 * chdir
3870 *
3871 * Change current working directory (".") for the entire process
3872 *
3873 * Parameters: p Process requesting the call
3874 * uap User argument descriptor (see below)
3875 * retval (ignored)
3876 *
3877 * Indirect parameters: uap->path Directory path
3878 *
3879 * Returns: 0 Success
3880 * common_chdir: ENOTDIR
3881 * common_chdir: ENOENT No such file or directory
3882 * common_chdir: ???
3883 *
3884 */
3885 int
3886 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3887 {
3888 return common_chdir(p, (void *)uap, 0);
3889 }
3890
3891 /*
3892 * __pthread_chdir
3893 *
3894 * Change current working directory (".") for a single thread
3895 *
3896 * Parameters: p Process requesting the call
3897 * uap User argument descriptor (see below)
3898 * retval (ignored)
3899 *
3900 * Indirect parameters: uap->path Directory path
3901 *
3902 * Returns: 0 Success
3903 * common_chdir: ENOTDIR
3904 * common_chdir: ENOENT No such file or directory
3905 * common_chdir: ???
3906 *
3907 */
3908 int
3909 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3910 {
3911 return common_chdir(p, (void *)uap, 1);
3912 }
3913
3914
/*
 * Change notion of root (``/'') directory.
 *
 * Requires superuser (and passes MAC policy checks).  The new root holds
 * a usecount; the previous fd_rdir, if any, is released.
 */
/* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;
	vfs_context_t ctx = vfs_context_current();

	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* change_dir validates VDIR, MAC chdir policy, and search access. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Trade the lookup iocount for a long-lived usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/*
	 * This lock provides the guarantee that as long as you hold the lock
	 * fdp->fd_rdir has a usecount on it. This is used to take an iocount
	 * on a referenced vnode in namei when determining the rootvnode for
	 * a process.
	 */
	/* needed for synchronization with lookup */
	proc_dirs_lock_exclusive(p);
	/* needed for setting the flag and other activities on the fd itself */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdp->fd_flags |= FD_CHROOT;
	proc_fdunlock(p);
	proc_dirs_unlock_exclusive(p);

	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3976
#define PATHSTATICBUFLEN 256
#define PIVOT_ROOT_ENTITLEMENT \
	"com.apple.private.vfs.pivot-root"

#if defined(XNU_TARGET_OS_OSX)
/*
 * pivot_root(2): atomically switch the system root filesystem.
 *
 * Restricted to launchd (pid 1) holding PIVOT_ROOT_ENTITLEMENT, and the
 * incoming filesystem must pass kernel root authentication.  Both paths
 * are copied in via a small stack buffer first, falling back to a
 * MAXPATHLEN heap buffer only when the path is longer.
 */
int
pivot_root(proc_t p, struct pivot_root_args *uap, __unused int *retval)
{
	int error;
	char new_rootfs_path_before[PATHSTATICBUFLEN] = {0};
	char old_rootfs_path_after[PATHSTATICBUFLEN] = {0};
	char *new_rootfs_path_before_buf = NULL;
	char *old_rootfs_path_after_buf = NULL;
	char *incoming = NULL;
	char *outgoing = NULL;
	vnode_t incoming_rootvp = NULLVP;
	size_t bytes_copied;

	/*
	 * XXX : Additional restrictions needed
	 * - perhaps callable only once.
	 */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	/*
	 * pivot_root can be executed by launchd only.
	 * Enforce entitlement.
	 */
	if ((p->p_pid != 1) || !IOTaskHasEntitlement(current_task(), PIVOT_ROOT_ENTITLEMENT)) {
		return EPERM;
	}

	/* Short paths use the stack buffer; ENAMETOOLONG retries on the heap. */
	error = copyinstr(uap->new_rootfs_path_before, &new_rootfs_path_before[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		new_rootfs_path_before_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->new_rootfs_path_before, new_rootfs_path_before_buf, MAXPATHLEN, &bytes_copied);
	}

	if (error) {
		goto out;
	}

	error = copyinstr(uap->old_rootfs_path_after, &old_rootfs_path_after[0], PATHSTATICBUFLEN, &bytes_copied);
	if (error == ENAMETOOLONG) {
		old_rootfs_path_after_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
		error = copyinstr(uap->old_rootfs_path_after, old_rootfs_path_after_buf, MAXPATHLEN, &bytes_copied);
	}
	if (error) {
		goto out;
	}

	if (new_rootfs_path_before_buf) {
		incoming = new_rootfs_path_before_buf;
	} else {
		incoming = &new_rootfs_path_before[0];
	}

	if (old_rootfs_path_after_buf) {
		outgoing = old_rootfs_path_after_buf;
	} else {
		outgoing = &old_rootfs_path_after[0];
	}

	/*
	 * The proposed incoming FS MUST be authenticated (i.e. not a chunklist DMG).
	 * Userland is not allowed to pivot to an image.
	 */
	error = vnode_lookup(incoming, 0, &incoming_rootvp, vfs_context_kernel());
	if (error) {
		goto out;
	}
	error = VNOP_IOCTL(incoming_rootvp, FSIOC_KERNEL_ROOTAUTH, NULL, 0, vfs_context_kernel());
	if (error) {
		goto out;
	}

	error = vfs_switch_root(incoming, outgoing, VFSSR_VIRTUALDEV_PROHIBITED);

out:
	if (incoming_rootvp != NULLVP) {
		vnode_put(incoming_rootvp);
		incoming_rootvp = NULLVP;
	}

	if (old_rootfs_path_after_buf) {
		zfree(ZV_NAMEI, old_rootfs_path_after_buf);
	}

	if (new_rootfs_path_before_buf) {
		zfree(ZV_NAMEI, new_rootfs_path_before_buf);
	}

	return error;
}
#else
/* pivot_root is unsupported off macOS: report ENOSYS via nosys(). */
int
pivot_root(proc_t p, __unused struct pivot_root_args *uap, int *retval)
{
	return nosys(p, NULL, retval);
}
#endif /* XNU_TARGET_OS_OSX */
4080
4081 /*
4082 * Common routine for chroot and chdir.
4083 *
4084 * Returns: 0 Success
4085 * ENOTDIR Not a directory
4086 * namei:??? [anything namei can return]
4087 * vnode_authorize:??? [anything vnode_authorize can return]
4088 */
4089 static int
4090 change_dir(struct nameidata *ndp, vfs_context_t ctx)
4091 {
4092 vnode_t vp;
4093 int error;
4094
4095 if ((error = namei(ndp))) {
4096 return error;
4097 }
4098 nameidone(ndp);
4099 vp = ndp->ni_vp;
4100
4101 if (vp->v_type != VDIR) {
4102 vnode_put(vp);
4103 return ENOTDIR;
4104 }
4105
4106 #if CONFIG_MACF
4107 error = mac_vnode_check_chdir(ctx, vp);
4108 if (error) {
4109 vnode_put(vp);
4110 return error;
4111 }
4112 #endif
4113
4114 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
4115 if (error) {
4116 vnode_put(vp);
4117 return error;
4118 }
4119
4120 return error;
4121 }
4122
/*
 * Allocate (zero-filled) the per-fd vnode data used for directories and
 * initialize its mutex.  Freed with fg_vn_data_free().
 * (Previous comment incorrectly said "Free".)
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	/* NOTE(review): result is not NULL-checked before lck_mtx_init; presumably
	 * Z_WAITOK allocations of this size cannot fail — confirm. */
	fvdata = kheap_alloc(KM_FD_VN_DATA, sizeof(struct fd_vn_data),
	    Z_WAITOK | Z_ZERO);
	lck_mtx_init(&fvdata->fv_lock, &fd_vn_lck_grp, &fd_vn_lck_attr);
	return fvdata;
}
4137
4138 /*
4139 * Free the vnode data (for directories) associated with the file glob.
4140 */
4141 void
4142 fg_vn_data_free(void *fgvndata)
4143 {
4144 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
4145
4146 kheap_free(KHEAP_DATA_BUFFERS, fvdata->fv_buf, fvdata->fv_bufallocsiz);
4147 lck_mtx_destroy(&fvdata->fv_lock, &fd_vn_lck_grp);
4148 kheap_free(KM_FD_VN_DATA, fvdata, sizeof(struct fd_vn_data));
4149 }
4150
/*
 * Check permissions, allocate an open file structure,
 * and call the device open routine if any.
 *
 * Returns:	0			Success
 *	EINVAL
 *	EINTR
 *	falloc:ENFILE
 *	falloc:EMFILE
 *	falloc:ENOMEM
 *	vn_open_auth:???
 *	dupfdopen:???
 *	VNOP_ADVLOCK:???
 *	vnode_setsize:???
 *
 * XXX Need to implement uid, gid
 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
    int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct vfs_context context;

	oflags = uflags;

	/* O_RDWR|O_WRONLY together (== O_ACCMODE) is malformed. */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	/* Convert open(2) flags to in-kernel fflags; strip crypto bits for now. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a file descriptor slot up front. */
	if ((error = falloc_withalloc(p,
	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
		return error;
	}
	/* Encode the reserved fd for the fdopen() back-channel below. */
	uu->uu_dupfd = -indx - 1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {        /* XXX from fdopen */
			/* /dev/fd open: duplicate the existing descriptor instead. */
			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
				fp_drop(p, indx, NULL, 0);
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	fp->fp_glob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->fp_glob->fg_ops = &vnops;
	fp->fp_glob->fg_data = (caddr_t)vp;

	/* Apply any open-time advisory lock (O_EXLOCK/O_SHLOCK). */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
		};

		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->fp_glob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->fp_glob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		/* Remember we hold the lock so 'bad:' (and close) can undo it. */
		fp->fp_glob->fg_flag |= FWASLOCKED;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->fp_glob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->fp_glob->fg_vn_data = NULL;
	}

	/* Drop the lookup iocount; the fileglob keeps the vnode referenced. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->fp_glob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	proc_fdlock(p);
	if (flags & O_CLOEXEC) {
		*fdflags(p, indx) |= UF_EXCLOSE;
	}
	if (flags & O_CLOFORK) {
		*fdflags(p, indx) |= UF_FORKCLOSE;
	}
	/* Publish the descriptor: it becomes visible to the process here. */
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* Heuristically opt this file's page cache in/out of secluded memory. */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->fp_glob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
#if __arm64__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
#elif __arm__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
#else
/* not implemented... */
#endif
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
			    !strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Undo: release the advisory lock (if taken), close, and free the fd. */
	context = *vfs_context_current();
	context.vc_ucred = fp->fp_glob->fg_cred;

	if ((fp->fp_glob->fg_flag & FWASLOCKED) &&
	    (FILEGLOB_DTYPE(fp->fp_glob) == DTYPE_VNODE)) {
		struct flock lf = {
			.l_whence = SEEK_SET,
			.l_type = F_UNLCK,
		};

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->fp_glob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->fp_glob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4385
/*
 * While most of the *at syscall handlers can call nameiat() which
 * is a wrapper around namei, the use of namei and initialisation
 * of nameidata are far removed and in different functions - namei
 * gets called in vn_open_auth for open1. So we'll just do here what
 * nameiat() does.
 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
    int dirfd)
{
	/* Anchor at dirfd only for a real fd and no caller-provided start dir. */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first byte of the path to detect absolute paths. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		/* Absolute paths ignore dirfd (per POSIX *at() semantics). */
		if (c != '/') {
			vnode_t dvp_at;

			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Pass the start directory to the lookup via USEDVP;
			 * the iocount from vnode_getfromfd is held across
			 * open1() and dropped afterwards.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
			    retval);
			vnode_put(dvp_at);
			return error;
		}
	}

	return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
}
4436
/*
 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
 *
 * Parameters:    p                       Process requesting the open
 *                uap                     User argument descriptor (see below)
 *                retval                  Pointer to an area to receive the
 *                                        return value from the system call
 *
 * Indirect:      uap->path               Path to open (same as 'open')
 *                uap->flags              Flags to open (same as 'open'
 *                uap->uid                UID to set, if creating
 *                uap->gid                GID to set, if creating
 *                uap->mode               File mode, if creating (same as 'open')
 *                uap->xsecurity          ACL to set, if creating
 *
 * Returns:       0                       Success
 *                !0                      errno value
 *
 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
 *
 * XXX: We should enumerate the possible errno values here, and where
 * in the code they originated.
 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* Copy in the caller's ACL, if one was supplied. */
	xsecdst = NULL;
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask and strip the sticky bit from create mode. */
	cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    fileproc_alloc_init, NULL, retval);
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4502
4503 /*
4504 * Go through the data-protected atomically controlled open (2)
4505 *
4506 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4507 */
4508 int
4509 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4510 {
4511 int flags = uap->flags;
4512 int class = uap->class;
4513 int dpflags = uap->dpflags;
4514
4515 /*
4516 * Follow the same path as normal open(2)
4517 * Look up the item if it exists, and acquire the vnode.
4518 */
4519 struct filedesc *fdp = p->p_fd;
4520 struct vnode_attr va;
4521 struct nameidata nd;
4522 int cmode;
4523 int error;
4524
4525 VATTR_INIT(&va);
4526 /* Mask off all but regular access permissions */
4527 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4528 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4529
4530 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4531 uap->path, vfs_context_current());
4532
4533 /*
4534 * Initialize the extra fields in vnode_attr to pass down our
4535 * extra fields.
4536 * 1. target cprotect class.
4537 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4538 */
4539 if (flags & O_CREAT) {
4540 /* lower level kernel code validates that the class is valid before applying it. */
4541 if (class != PROTECTION_CLASS_DEFAULT) {
4542 /*
4543 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4544 * file behave the same as open (2)
4545 */
4546 VATTR_SET(&va, va_dataprotect_class, class);
4547 }
4548 }
4549
4550 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4551 if (flags & (O_RDWR | O_WRONLY)) {
4552 /* Not allowed to write raw encrypted bytes */
4553 return EINVAL;
4554 }
4555 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4556 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4557 }
4558 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4559 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4560 }
4561 }
4562
4563 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4564 fileproc_alloc_init, NULL, retval);
4565
4566 return error;
4567 }
4568
4569 static int
4570 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4571 int fd, enum uio_seg segflg, int *retval)
4572 {
4573 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4574 struct {
4575 struct vnode_attr va;
4576 struct nameidata nd;
4577 } *__open_data;
4578 struct vnode_attr *vap;
4579 struct nameidata *ndp;
4580 int cmode;
4581 int error;
4582
4583 __open_data = kheap_alloc(KHEAP_TEMP, sizeof(*__open_data), Z_WAITOK);
4584 vap = &__open_data->va;
4585 ndp = &__open_data->nd;
4586
4587 VATTR_INIT(vap);
4588 /* Mask off all but regular access permissions */
4589 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4590 VATTR_SET(vap, va_mode, cmode & ACCESSPERMS);
4591
4592 NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4593 segflg, path, ctx);
4594
4595 error = open1at(ctx, ndp, flags, vap, fileproc_alloc_init, NULL,
4596 retval, fd);
4597
4598 kheap_free(KHEAP_TEMP, __open_data, sizeof(*__open_data));
4599
4600 return error;
4601 }
4602
4603 int
4604 open(proc_t p, struct open_args *uap, int32_t *retval)
4605 {
4606 __pthread_testcancel(1);
4607 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4608 }
4609
4610 int
4611 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4612 int32_t *retval)
4613 {
4614 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4615 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4616 }
4617
4618 int
4619 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4620 int32_t *retval)
4621 {
4622 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4623 uap->mode, uap->fd, UIO_USERSPACE, retval);
4624 }
4625
4626 int
4627 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4628 {
4629 __pthread_testcancel(1);
4630 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4631 }
4632
4633 /*
4634 * openbyid_np: open a file given a file system id and a file system object id
4635 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
4636 * file systems that don't support object ids it is a node id (uint64_t).
4637 *
4638 * Parameters: p Process requesting the open
4639 * uap User argument descriptor (see below)
4640 * retval Pointer to an area to receive the
 * return value from the system call
4642 *
4643 * Indirect: uap->path Path to open (same as 'open')
4644 *
4645 * uap->fsid id of target file system
4646 * uap->objid id of target file system object
4647 * uap->flags Flags to open (same as 'open')
4648 *
4649 * Returns: 0 Success
4650 * !0 errno value
4651 *
4652 *
 * XXX: We should enumerate the possible errno values here, and where
4654 * in the code they originated.
4655 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
    fsid_t fsid;
    uint64_t objid;
    int error;
    char *buf = NULL;           /* heap path buffer; owned by this function */
    int buflen = MAXPATHLEN;
    int pathlen = 0;
    vfs_context_t ctx = vfs_context_current();

    /* Open-by-id is a privileged operation. */
    if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
        return error;
    }

    if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
        return error;
    }

    /* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
    if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
        return error;
    }

    AUDIT_ARG(value32, fsid.val[0]);
    AUDIT_ARG(value64, objid);

    /*
     * Resolve a path from (fsid, objid).  If the buffer is too small the
     * VFS returns ENOSPC; free it and retry with one more MAXPATHLEN.
     * NOTE(review): the retry loop has no upper bound on buflen — relies
     * on the filesystem eventually fitting or failing differently.
     */
    do {
        buf = kheap_alloc(KHEAP_TEMP, buflen + 1, Z_WAITOK);
        if (buf == NULL) {
            return ENOMEM;
        }

        error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
            buf, FSOPT_ISREALFSID, &pathlen);

        if (error) {
            kheap_free(KHEAP_TEMP, buf, buflen + 1);
            buf = NULL;
        }
    } while (error == ENOSPC && (buflen += MAXPATHLEN));

    if (error) {
        return error;
    }

    /* Ensure NUL termination; the extra byte was reserved at allocation. */
    buf[pathlen] = 0;

    /* The resolved path is a kernel buffer, hence UIO_SYSSPACE. */
    error = openat_internal(
        ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

    kheap_free(KHEAP_TEMP, buf, buflen + 1);

    return error;
}
4712
4713
4714 /*
4715 * Create a special file.
4716 */
4717 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4718
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
    struct vnode_attr va;
    vfs_context_t ctx = vfs_context_current();
    int error;
    struct nameidata nd;
    vnode_t vp, dvp;

    VATTR_INIT(&va);
    VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
    VATTR_SET(&va, va_rdev, uap->dev);

    /* If it's a mknod() of a FIFO, call mkfifo1() instead */
    if ((uap->mode & S_IFMT) == S_IFIFO) {
        return mkfifo1(ctx, uap->path, &va);
    }

    AUDIT_ARG(mode, (mode_t)uap->mode);
    AUDIT_ARG(value32, uap->dev);

    /* Creating device nodes requires super-user credentials. */
    if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
        return error;
    }
    NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
        UIO_USERSPACE, uap->path, ctx);
    error = namei(&nd);
    if (error) {
        return error;
    }
    dvp = nd.ni_dvp;
    vp = nd.ni_vp;

    /* Target must not already exist. */
    if (vp != NULL) {
        error = EEXIST;
        goto out;
    }

    /* Only character and block special files are valid here (FIFO handled above). */
    switch (uap->mode & S_IFMT) {
    case S_IFCHR:
        VATTR_SET(&va, va_type, VCHR);
        break;
    case S_IFBLK:
        VATTR_SET(&va, va_type, VBLK);
        break;
    default:
        error = EINVAL;
        goto out;
    }

#if CONFIG_MACF
    error = mac_vnode_check_create(ctx,
        nd.ni_dvp, &nd.ni_cnd, &va);
    if (error) {
        goto out;
    }
#endif

    if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
        goto out;
    }

    if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
        goto out;
    }

    if (vp) {
        int update_flags = 0;

        // Make sure the name & parent pointers are hooked up
        if (vp->v_name == NULL) {
            update_flags |= VNODE_UPDATE_NAME;
        }
        if (vp->v_parent == NULLVP) {
            update_flags |= VNODE_UPDATE_PARENT;
        }

        if (update_flags) {
            vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
        }

#if CONFIG_FSE
        add_fsevent(FSE_CREATE_FILE, ctx,
            FSE_ARG_VNODE, vp,
            FSE_ARG_DONE);
#endif
    }

out:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);

    if (vp) {
        vnode_put(vp);
    }
    vnode_put(dvp);

    return error;
}
4821
4822 /*
4823 * Create a named pipe.
4824 *
4825 * Returns: 0 Success
4826 * EEXIST
4827 * namei:???
4828 * vnode_authorize:???
4829 * vn_create:???
4830 */
/*
 * Common FIFO-creation path shared by mkfifo(2), mkfifo_extended() and
 * mknod() of S_IFIFO.  'vap' supplies the attributes to create with;
 * 'upath' is always interpreted as a user-space address (UIO_USERSPACE).
 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
    vnode_t vp, dvp;
    int error;
    struct nameidata nd;

    NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
        UIO_USERSPACE, upath, ctx);
    error = namei(&nd);
    if (error) {
        return error;
    }
    dvp = nd.ni_dvp;
    vp = nd.ni_vp;

    /* check that this is a new file and authorize addition */
    if (vp != NULL) {
        error = EEXIST;
        goto out;
    }
    VATTR_SET(vap, va_type, VFIFO);

    if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
        goto out;
    }

    error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);

    if (vp) {
        vnode_put(vp);
    }
    vnode_put(dvp);

    return error;
}
4873
4874
4875 /*
4876 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4877 *
4878 * Parameters: p Process requesting the open
4879 * uap User argument descriptor (see below)
4880 * retval (Ignored)
4881 *
4882 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4883 * uap->uid UID to set
4884 * uap->gid GID to set
4885 * uap->mode File mode to set (same as 'mkfifo')
4886 * uap->xsecurity ACL to set, if creating
4887 *
4888 * Returns: 0 Success
4889 * !0 errno value
4890 *
4891 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4892 *
 * XXX: We should enumerate the possible errno values here, and where
4894 * in the code they originated.
4895 */
int
mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
{
    int ciferror;
    kauth_filesec_t xsecdst;    /* copied-in ACL; freed before return */
    struct vnode_attr va;

    AUDIT_ARG(owner, uap->uid, uap->gid);

    /* Copy in the caller-supplied filesec (ACL), if any. */
    xsecdst = KAUTH_FILESEC_NONE;
    if (uap->xsecurity != USER_ADDR_NULL) {
        if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
            return ciferror;
        }
    }

    /* Build attributes: umask-masked mode, plus optional uid/gid/ACL. */
    VATTR_INIT(&va);
    VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
    if (uap->uid != KAUTH_UID_NONE) {
        VATTR_SET(&va, va_uid, uap->uid);
    }
    if (uap->gid != KAUTH_GID_NONE) {
        VATTR_SET(&va, va_gid, uap->gid);
    }
    if (xsecdst != KAUTH_FILESEC_NONE) {
        VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
    }

    ciferror = mkfifo1(vfs_context_current(), uap->path, &va);

    if (xsecdst != KAUTH_FILESEC_NONE) {
        kauth_filesec_free(xsecdst);
    }
    return ciferror;
}
4931
4932 /* ARGSUSED */
4933 int
4934 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4935 {
4936 struct vnode_attr va;
4937
4938 VATTR_INIT(&va);
4939 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4940
4941 return mkfifo1(vfs_context_current(), uap->path, &va);
4942 }
4943
4944 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4945 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4946 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4947
/*
 * Build "<path of dvp>/<leafname>" into 'path' (capacity _len), falling back
 * to ancestor directories or the mount point if the exact path cannot be
 * obtained.  *truncated_path is set when the result is incomplete.
 * Returns the length of the string INCLUDING the terminating NUL.
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
    int ret, len = _len;

    *truncated_path = 0;

    if (firmlink) {
        ret = vn_getpath(dvp, path, &len);
    } else {
        ret = vn_getpath_no_firmlink(dvp, path, &len);
    }
    if (ret == 0 && len < (MAXPATHLEN - 1)) {
        if (leafname) {
            /* len counts the NUL here, so path[len - 1] is the NUL slot. */
            path[len - 1] = '/';
            len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
            if (len > MAXPATHLEN) {
                char *ptr;

                // the string got truncated!
                *truncated_path = 1;
                ptr = strrchr(path, '/');
                if (ptr) {
                    *ptr = '\0'; // chop off the string at the last directory component
                }
                len = (int)strlen(path) + 1;
            }
        }
    } else if (ret == 0) {
        /* Got a path, but no room left to append the leaf name. */
        *truncated_path = 1;
    } else if (ret != 0) {
        struct vnode *mydvp = dvp;

        if (ret != ENOSPC) {
            printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
                dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
        }
        *truncated_path = 1;

        /* Walk up the parent chain until some ancestor's path fits. */
        do {
            if (mydvp->v_parent != NULL) {
                mydvp = mydvp->v_parent;
            } else if (mydvp->v_mount) {
                strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
                break;
            } else {
                // no parent and no mount point?  only thing is to punt and say "/" changed
                strlcpy(path, "/", _len);
                len = 2;
                mydvp = NULL;
            }

            if (mydvp == NULL) {
                break;
            }

            len = _len;
            if (firmlink) {
                ret = vn_getpath(mydvp, path, &len);
            } else {
                ret = vn_getpath_no_firmlink(mydvp, path, &len);
            }
        } while (ret == ENOSPC);
    }

    return len;
}
5015
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
    /* Firmlink-following variant. */
    const int follow_firmlinks = 1;

    return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5021
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
    /* Variant that does not traverse firmlinks. */
    const int follow_firmlinks = 0;

    return safe_getpath_new(dvp, leafname, path, _len, truncated_path, follow_firmlinks);
}
5027
5028 /*
5029 * Make a hard file link.
5030 *
5031 * Returns: 0 Success
5032 * EPERM
5033 * EEXIST
5034 * EXDEV
5035 * namei:???
5036 * vnode_authorize:???
5037 * VNOP_LINK:???
5038 */
5039 /* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
    vnode_t vp, pvp, dvp, lvp;
    struct nameidata nd;
    int follow;
    int error;
#if CONFIG_FSE
    fse_info finfo;
#endif
    int need_event, has_listeners, need_kpath2;
    char *target_path = NULL;
    int truncated = 0;

    vp = dvp = lvp = NULLVP;

    /* look up the object we are linking to */
    follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
    NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
        segflg, path, ctx);

    error = nameiat(&nd, fd1);
    if (error) {
        return error;
    }
    vp = nd.ni_vp;

    /* First lookup is done; nd is reused below for the link name. */
    nameidone(&nd);

    /*
     * Normally, linking to directories is not supported.
     * However, some file systems may have limited support.
     */
    if (vp->v_type == VDIR) {
        if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
            error = EPERM;   /* POSIX */
            goto out;
        }

        /* Linking to a directory requires ownership. */
        if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
            struct vnode_attr dva;

            VATTR_INIT(&dva);
            VATTR_WANTED(&dva, va_uid);
            if (vnode_getattr(vp, &dva, ctx) != 0 ||
                !VATTR_IS_SUPPORTED(&dva, va_uid) ||
                (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
                error = EACCES;
                goto out;
            }
        }
    }

    /* lookup the target node (reinitialize nd fields for a CREATE lookup) */
#if CONFIG_TRIGGERS
    nd.ni_op = OP_LINK;
#endif
    nd.ni_cnd.cn_nameiop = CREATE;
    nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
    nd.ni_dirp = link;
    error = nameiat(&nd, fd2);
    if (error != 0) {
        goto out;
    }
    dvp = nd.ni_dvp;
    lvp = nd.ni_vp;

#if CONFIG_MACF
    if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
        goto out2;
    }
#endif

    /* or to anything that kauth doesn't want us to (eg. immutable items) */
    if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
        goto out2;
    }

    /* target node must not exist */
    if (lvp != NULLVP) {
        error = EEXIST;
        goto out2;
    }
    /* cannot link across mountpoints */
    if (vnode_mount(vp) != vnode_mount(dvp)) {
        error = EXDEV;
        goto out2;
    }

    /* authorize creation of the target node */
    if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
        goto out2;
    }

    /* and finally make the link */
    error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
    if (error) {
        goto out2;
    }

#if CONFIG_MACF
    (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
    need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
    need_event = 0;
#endif
    has_listeners = kauth_authorize_fileop_has_listeners();

    need_kpath2 = 0;
#if CONFIG_AUDIT
    if (AUDIT_RECORD_EXISTS()) {
        need_kpath2 = 1;
    }
#endif

    /* Post-link notifications: audit path, kauth listeners, fsevents. */
    if (need_event || has_listeners || need_kpath2) {
        char *link_to_path = NULL;
        int len, link_name_len;

        /* build the path to the new link file */
        GET_PATH(target_path);

        len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);

        AUDIT_ARG(kpath, target_path, ARG_KPATH2);

        if (has_listeners) {
            /* build the path to file we are linking to */
            GET_PATH(link_to_path);

            link_name_len = MAXPATHLEN;
            if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
                /*
                 * Call out to allow 3rd party notification of rename.
                 * Ignore result of kauth_authorize_fileop call.
                 */
                kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
                    (uintptr_t)link_to_path,
                    (uintptr_t)target_path);
            }
            if (link_to_path != NULL) {
                RELEASE_PATH(link_to_path);
            }
        }
#if CONFIG_FSE
        if (need_event) {
            /* construct fsevent */
            if (get_fse_info(vp, &finfo, ctx) == 0) {
                if (truncated) {
                    finfo.mode |= FSE_TRUNCATED_PATH;
                }

                // build the path to the destination of the link
                add_fsevent(FSE_CREATE_FILE, ctx,
                    FSE_ARG_STRING, len, target_path,
                    FSE_ARG_FINFO, &finfo,
                    FSE_ARG_DONE);
            }

            pvp = vp->v_parent;
            // need an iocount on pvp in this case
            if (pvp && pvp != dvp) {
                error = vnode_get(pvp);
                if (error) {
                    /* best effort: skip the parent event, don't fail the link */
                    pvp = NULLVP;
                    error = 0;
                }
            }
            if (pvp) {
                add_fsevent(FSE_STAT_CHANGED, ctx,
                    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
            }
            if (pvp && pvp != dvp) {
                vnode_put(pvp);
            }
        }
#endif
    }
out2:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);
    if (target_path != NULL) {
        RELEASE_PATH(target_path);
    }
out:
    if (lvp) {
        vnode_put(lvp);
    }
    if (dvp) {
        vnode_put(dvp);
    }
    vnode_put(vp);
    return error;
}
5242
5243 int
5244 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
5245 {
5246 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
5247 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
5248 }
5249
5250 int
5251 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5252 {
5253 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5254 return EINVAL;
5255 }
5256
5257 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5258 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5259 }
5260
5261 /*
5262 * Make a symbolic link.
5263 *
5264 * We could add support for ACLs here too...
5265 */
5266 /* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
    struct vnode_attr va;
    char *path;             /* link *contents*; heap copy iff user-space */
    int error;
    struct nameidata nd;
    vnode_t vp, dvp;
    size_t dummy = 0;
    proc_t p;

    error = 0;
    /* Copy the link contents into a kernel buffer if they came from user space. */
    if (UIO_SEG_IS_USER_SPACE(segflg)) {
        path = zalloc(ZV_NAMEI);
        error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
    } else {
        path = (char *)path_data;
    }
    if (error) {
        goto out;
    }
    AUDIT_ARG(text, path);  /* This is the link string */

    NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
        segflg, link, ctx);

    error = nameiat(&nd, fd);
    if (error) {
        goto out;
    }
    dvp = nd.ni_dvp;
    vp = nd.ni_vp;

    p = vfs_context_proc(ctx);
    VATTR_INIT(&va);
    VATTR_SET(&va, va_type, VLNK);
    VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);

#if CONFIG_MACF
    error = mac_vnode_check_create(ctx,
        dvp, &nd.ni_cnd, &va);
#endif
    if (error != 0) {
        goto skipit;
    }

    /* link name must not already exist */
    if (vp != NULL) {
        error = EEXIST;
        goto skipit;
    }

    /* authorize */
    if (error == 0) {
        error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
    }
    /* get default ownership, etc. */
    if (error == 0) {
        error = vnode_authattr_new(dvp, &va, 0, ctx);
    }
    if (error == 0) {
        error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
    }

    /* do fallback attribute handling */
    if (error == 0 && vp) {
        error = vnode_setattr_fallback(vp, &va, ctx);
    }

#if CONFIG_MACF
    if (error == 0 && vp) {
        error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
    }
#endif

    if (error == 0) {
        int update_flags = 0;

        /*
         * Check if a new vnode was created, else try to get one
         * (VNOP_SYMLINK is not required to hand back the new vnode).
         */
        if (vp == NULL) {
            nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
            nd.ni_op = OP_LOOKUP;
#endif
            nd.ni_cnd.cn_flags = 0;
            error = nameiat(&nd, fd);
            vp = nd.ni_vp;

            if (vp == NULL) {
                goto skipit;
            }
        }

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
        /* call out to allow 3rd party notification of rename.
         * Ignore result of kauth_authorize_fileop call.
         */
        if (kauth_authorize_fileop_has_listeners() &&
            namei(&nd) == 0) {
            char *new_link_path = NULL;
            int len;

            /* build the path to the new link file */
            new_link_path = get_pathbuff();
            len = MAXPATHLEN;
            vn_getpath(dvp, new_link_path, &len);
            if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
                new_link_path[len - 1] = '/';
                strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
            }

            kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
                (uintptr_t)path, (uintptr_t)new_link_path);
            if (new_link_path != NULL) {
                release_pathbuff(new_link_path);
            }
        }
#endif
        // Make sure the name & parent pointers are hooked up
        if (vp->v_name == NULL) {
            update_flags |= VNODE_UPDATE_NAME;
        }
        if (vp->v_parent == NULLVP) {
            update_flags |= VNODE_UPDATE_PARENT;
        }

        if (update_flags) {
            vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
        }

#if CONFIG_FSE
        add_fsevent(FSE_CREATE_FILE, ctx,
            FSE_ARG_VNODE, vp,
            FSE_ARG_DONE);
#endif
    }

skipit:
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);

    if (vp) {
        vnode_put(vp);
    }
    vnode_put(dvp);
out:
    /* Free the kernel copy of the link contents, if we made one. */
    if (path && (path != (char *)path_data)) {
        zfree(ZV_NAMEI, path);
    }

    return error;
}
5422
5423 int
5424 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5425 {
5426 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5427 uap->link, UIO_USERSPACE);
5428 }
5429
5430 int
5431 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5432 __unused int32_t *retval)
5433 {
5434 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5435 uap->path2, UIO_USERSPACE);
5436 }
5437
5438 /*
5439 * Delete a whiteout from the filesystem.
5440 * No longer supported.
5441 */
5442 int
5443 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
5444 {
5445 return ENOTSUP;
5446 }
5447
5448 /*
5449 * Delete a name from the filesystem.
5450 */
5451 /* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
    struct nameidata nd;
    vnode_t vp, dvp;
    int error;
    struct componentname *cnp;
    char *path = NULL;              /* pathbuff for listeners/audit */
    char *no_firmlink_path = NULL;  /* pathbuff for fsevents */
    int len_path = 0;
    int len_no_firmlink_path = 0;
#if CONFIG_FSE
    fse_info finfo;
    struct vnode_attr va;
#endif
    int flags;
    int need_event;
    int has_listeners;
    int truncated_path;
    int truncated_no_firmlink_path;
    int batched;                    /* nonzero => compound remove VNOP */
    struct vnode_attr *vap;
    int do_retry;
    int retry_count = 0;
    int cn_flags;

    cn_flags = LOCKPARENT;
    if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
        cn_flags |= AUDITVNPATH1;
    }
    /* If a starting dvp is passed, it trumps any fd passed. */
    if (start_dvp) {
        cn_flags |= USEDVP;
    }

#if NAMEDRSRCFORK
    /* unlink or delete is allowed on rsrc forks and named streams */
    cn_flags |= CN_ALLOWRSRCFORK;
#endif

retry:
    /* Per-attempt state is reset here; we may loop on racy ENOENT. */
    do_retry = 0;
    flags = 0;
    need_event = 0;
    has_listeners = 0;
    truncated_path = 0;
    truncated_no_firmlink_path = 0;
    vap = NULL;

    NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

    nd.ni_dvp = start_dvp;
    nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
    cnp = &nd.ni_cnd;

continue_lookup:
    error = nameiat(&nd, fd);
    if (error) {
        return error;
    }

    dvp = nd.ni_dvp;
    vp = nd.ni_vp;


    /* With Carbon delete semantics, busy files cannot be deleted */
    if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
        flags |= VNODE_REMOVE_NODELETEBUSY;
    }

    /* Skip any potential upcalls if told to. */
    if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
        flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
    }

    if (vp) {
        batched = vnode_compound_remove_available(vp);
        /*
         * The root of a mounted filesystem cannot be deleted.
         */
        if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
            error = EBUSY;
            goto out;
        }

#if DEVELOPMENT || DEBUG
        /*
         * XXX VSWAP: Check for entitlements or special flag here
         * so we can restrict access appropriately.
         */
#else /* DEVELOPMENT || DEBUG */

        if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
            error = EPERM;
            goto out;
        }
#endif /* DEVELOPMENT || DEBUG */

        if (!batched) {
            error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
            if (error) {
                if (error == ENOENT) {
                    /* racy hardlink lookups may hit the name cache; redrive */
                    if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                        do_retry = 1;
                        retry_count++;
                    }
                }
                goto out;
            }
        }
    } else {
        /* no vp: lookup deferred into the compound remove VNOP */
        batched = 1;

        if (!vnode_compound_remove_available(dvp)) {
            panic("No vp, but no compound remove?");
        }
    }

#if CONFIG_FSE
    need_event = need_fsevent(FSE_DELETE, dvp);
    if (need_event) {
        if (!batched) {
            if ((vp->v_flag & VISHARDLINK) == 0) {
                /* XXX need to get these data in batched VNOP */
                get_fse_info(vp, &finfo, ctx);
            }
        } else {
            error = vfs_get_notify_attributes(&va);
            if (error) {
                goto out;
            }

            vap = &va;
        }
    }
#endif
    has_listeners = kauth_authorize_fileop_has_listeners();
    if (need_event || has_listeners) {
        if (path == NULL) {
            GET_PATH(path);
        }
        len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
        if (no_firmlink_path == NULL) {
            GET_PATH(no_firmlink_path);
        }
        len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
    }

#if NAMEDRSRCFORK
    if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
        error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
    } else
#endif
    {
        error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
        vp = nd.ni_vp;
        if (error == EKEEPLOOKING) {
            /* filesystem wants the lookup continued (compound VNOP) */
            if (!batched) {
                panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
            }

            if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
                panic("EKEEPLOOKING, but continue flag not set?");
            }

            if (vnode_isdir(vp)) {
                error = EISDIR;
                goto out;
            }
            goto continue_lookup;
        } else if (error == ENOENT && batched) {
            if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
                /*
                 * For compound VNOPs, the authorization callback may
                 * return ENOENT in case of racing hardlink lookups
                 * hitting the name cache, redrive the lookup.
                 */
                do_retry = 1;
                retry_count += 1;
                goto out;
            }
        }
    }

    /*
     * Call out to allow 3rd party notification of delete.
     * Ignore result of kauth_authorize_fileop call.
     */
    if (!error) {
        if (has_listeners) {
            kauth_authorize_fileop(vfs_context_ucred(ctx),
                KAUTH_FILEOP_DELETE,
                (uintptr_t)vp,
                (uintptr_t)path);
        }

        if (vp->v_flag & VISHARDLINK) {
            //
            // if a hardlink gets deleted we want to blow away the
            // v_parent link because the path that got us to this
            // instance of the link is no longer valid. this will
            // force the next call to get the path to ask the file
            // system instead of just following the v_parent link.
            //
            vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
        }

#if CONFIG_FSE
        if (need_event) {
            if (vp->v_flag & VISHARDLINK) {
                get_fse_info(vp, &finfo, ctx);
            } else if (vap) {
                vnode_get_fse_info_from_vap(vp, &finfo, vap);
            }
            if (truncated_path) {
                finfo.mode |= FSE_TRUNCATED_PATH;
            }
            add_fsevent(FSE_DELETE, ctx,
                FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
                FSE_ARG_FINFO, &finfo,
                FSE_ARG_DONE);
        }
#endif
    }

out:
    if (path != NULL) {
        RELEASE_PATH(path);
        path = NULL;
    }

    if (no_firmlink_path != NULL) {
        RELEASE_PATH(no_firmlink_path);
        no_firmlink_path = NULL;
    }
#if NAMEDRSRCFORK
    /* recycle the deleted rsrc fork vnode to force a reclaim, which
     * will cause its shadow file to go away if necessary.
     */
    if (vp && (vnode_isnamedstream(vp)) &&
        (vp->v_parent != NULLVP) &&
        vnode_isshadow(vp)) {
        vnode_recycle(vp);
    }
#endif
    /*
     * nameidone has to happen before we vnode_put(dvp)
     * since it may need to release the fs_nodelock on the dvp
     */
    nameidone(&nd);
    vnode_put(dvp);
    if (vp) {
        vnode_put(vp);
    }

    if (do_retry) {
        goto retry;
    }

    return error;
}
5714
5715 int
5716 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
5717 enum uio_seg segflg, int unlink_flags)
5718 {
5719 return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
5720 unlink_flags);
5721 }
5722
5723 /*
5724 * Delete a name from the filesystem using Carbon semantics.
5725 */
5726 int
5727 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5728 {
5729 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5730 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5731 }
5732
5733 /*
5734 * Delete a name from the filesystem using POSIX semantics.
5735 */
5736 int
5737 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5738 {
5739 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5740 uap->path, UIO_USERSPACE, 0);
5741 }
5742
5743 int
5744 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5745 {
5746 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5747 return EINVAL;
5748 }
5749
5750 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5751 int unlink_flags = 0;
5752
5753 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5754 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5755 }
5756 return rmdirat_internal(vfs_context_current(), uap->fd,
5757 uap->path, UIO_USERSPACE, unlink_flags);
5758 } else {
5759 return unlinkat_internal(vfs_context_current(), uap->fd,
5760 NULLVP, uap->path, UIO_USERSPACE, 0);
5761 }
5762 }
5763
5764 /*
5765 * Reposition read/write file offset.
5766 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
    struct fileproc *fp;
    vnode_t vp;
    struct vfs_context *ctx;
    off_t offset = uap->offset, file_size;
    int error;

    if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
        /* non-vnode file types (e.g. sockets) map to ESPIPE */
        if (error == ENOTSUP) {
            return ESPIPE;
        }
        return error;
    }
    /* FIFOs are not seekable. */
    if (vnode_isfifo(vp)) {
        file_drop(uap->fd);
        return ESPIPE;
    }


    ctx = vfs_context_current();
#if CONFIG_MACF
    /* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly. */
    if (uap->whence == L_INCR && uap->offset == 0) {
        error = mac_file_check_get_offset(vfs_context_ucred(ctx),
            fp->fp_glob);
    } else {
        error = mac_file_check_change_offset(vfs_context_ucred(ctx),
            fp->fp_glob);
    }
    if (error) {
        file_drop(uap->fd);
        return error;
    }
#endif
    if ((error = vnode_getwithref(vp))) {
        file_drop(uap->fd);
        return error;
    }

    switch (uap->whence) {
    case L_INCR:
        offset += fp->fp_glob->fg_offset;
        break;
    case L_XTND:
        if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
            break;
        }
        offset += file_size;
        break;
    case L_SET:
        break;
    case SEEK_HOLE:
        error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
        break;
    case SEEK_DATA:
        error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
        break;
    default:
        error = EINVAL;
    }
    if (error == 0) {
        if (uap->offset > 0 && offset < 0) {
            /* Incremented/relative move past max size */
            error = EOVERFLOW;
        } else {
            /*
             * Allow negative offsets on character devices, per
             * POSIX 1003.1-2001.  Most likely for writing disk
             * labels.
             */
            if (offset < 0 && vp->v_type != VCHR) {
                /* Decremented/relative move before start */
                error = EINVAL;
            } else {
                /* Success */
                fp->fp_glob->fg_offset = offset;
                *retval = fp->fp_glob->fg_offset;
            }
        }
    }

    /*
     * An lseek can affect whether data is "available to read."  Use
     * hint of NOTE_NONE so no EVFILT_VNODE events fire
     */
    post_event_if_success(vp, error, NOTE_NONE);
    (void)vnode_put(vp);
    file_drop(uap->fd);
    return error;
}
5858
5859
5860 /*
5861 * Check access permissions.
5862 *
5863 * Returns: 0 Success
5864 * vnode_authorize:???
5865 */
5866 static int
5867 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5868 {
5869 kauth_action_t action;
5870 int error;
5871
5872 /*
5873 * If just the regular access bits, convert them to something
5874 * that vnode_authorize will understand.
5875 */
5876 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5877 action = 0;
5878 if (uflags & R_OK) {
5879 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5880 }
5881 if (uflags & W_OK) {
5882 if (vnode_isdir(vp)) {
5883 action |= KAUTH_VNODE_ADD_FILE |
5884 KAUTH_VNODE_ADD_SUBDIRECTORY;
5885 /* might want delete rights here too */
5886 } else {
5887 action |= KAUTH_VNODE_WRITE_DATA;
5888 }
5889 }
5890 if (uflags & X_OK) {
5891 if (vnode_isdir(vp)) {
5892 action |= KAUTH_VNODE_SEARCH;
5893 } else {
5894 action |= KAUTH_VNODE_EXECUTE;
5895 }
5896 }
5897 } else {
5898 /* take advantage of definition of uflags */
5899 action = uflags >> 8;
5900 }
5901
5902 #if CONFIG_MACF
5903 error = mac_vnode_check_access(ctx, vp, uflags);
5904 if (error) {
5905 return error;
5906 }
5907 #endif /* MAC */
5908
5909 /* action == 0 means only check for existence */
5910 if (action != 0) {
5911 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5912 } else {
5913 error = 0;
5914 }
5915
5916 return error;
5917 }
5918
5919
5920
5921 /*
5922 * access_extended: Check access permissions in bulk.
5923 *
5924 * Description: uap->entries Pointer to an array of accessx
5925 * descriptor structs, plus one or
5926 * more NULL terminated strings (see
5927 * "Notes" section below).
5928 * uap->size Size of the area pointed to by
5929 * uap->entries.
5930 * uap->results Pointer to the results array.
5931 *
5932 * Returns: 0 Success
5933 * ENOMEM Insufficient memory
5934 * EINVAL Invalid arguments
5935 * namei:EFAULT Bad address
5936 * namei:ENAMETOOLONG Filename too long
5937 * namei:ENOENT No such file or directory
5938 * namei:ELOOP Too many levels of symbolic links
5939 * namei:EBADF Bad file descriptor
5940 * namei:ENOTDIR Not a directory
5941 * namei:???
5942 * access1:
5943 *
5944 * Implicit returns:
5945 * uap->results Array contents modified
5946 *
5947 * Notes: The uap->entries are structured as an arbitrary length array
5948 * of accessx descriptors, followed by one or more NULL terminated
5949 * strings
5950 *
5951 * struct accessx_descriptor[0]
5952 * ...
5953 * struct accessx_descriptor[n]
5954 * char name_data[0];
5955 *
5956 * We determine the entry count by walking the buffer containing
5957 * the uap->entries argument descriptor. For each descriptor we
5958 * see, the valid values for the offset ad_name_offset will be
5959 * in the byte range:
5960 *
5961 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5962 * to
5963 * [ uap->entries + uap->size - 2 ]
5964 *
5965 * since we must have at least one string, and the string must
5966 * be at least one character plus the NULL terminator in length.
5967 *
5968 * XXX: Need to support the check-as uid argument
5969 */
int
access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
{
	struct accessx_descriptor *input = NULL;
	errno_t *result = NULL;
	errno_t error = 0;
	int wantdelete = 0;
	size_t desc_max, desc_actual;
	unsigned int i, j;
	struct vfs_context context;
	struct nameidata nd;
	int niopts;
	vnode_t vp = NULL;
	vnode_t dvp = NULL;
#define ACCESSX_MAX_DESCR_ON_STACK 10
	struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];

	/* NULL until the real credential is taken; checked at `out'. */
	context.vc_ucred = NULL;

	/*
	 * Validate parameters; if valid, copy the descriptor array and string
	 * arguments into local memory.  Before proceeding, the following
	 * conditions must have been met:
	 *
	 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
	 * o There must be sufficient room in the request for at least one
	 *   descriptor and a one byte NUL terminated string.
	 * o The allocation of local storage must not fail.
	 */
	if (uap->size > ACCESSX_MAX_TABLESIZE) {
		return ENOMEM;
	}
	if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
		return EINVAL;
	}
	/* Small requests use the stack buffer and skip the heap allocation. */
	if (uap->size <= sizeof(stack_input)) {
		input = stack_input;
	} else {
		input = kheap_alloc(KHEAP_DATA_BUFFERS, uap->size, Z_WAITOK);
		if (input == NULL) {
			error = ENOMEM;
			goto out;
		}
	}
	error = copyin(uap->entries, input, uap->size);
	if (error) {
		goto out;
	}

	AUDIT_ARG(opaque, input, uap->size);

	/*
	 * Force NUL termination of the copyin buffer to avoid nami() running
	 * off the end.  If the caller passes us bogus data, they may get a
	 * bogus result.
	 */
	((char *)input)[uap->size - 1] = 0;

	/*
	 * Access is defined as checking against the process' real identity,
	 * even if operations are checking the effective identity.  This
	 * requires that we use a local vfs context.
	 */
	context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	context.vc_thread = current_thread();

	/*
	 * Find out how many entries we have, so we can allocate the result
	 * array by walking the list and adjusting the count downward by the
	 * earliest string offset we see.
	 */
	desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
	desc_actual = desc_max;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * Take the offset to the name string for this entry and
		 * convert to an input array index, which would be one off
		 * the end of the array if this entry was the lowest-addressed
		 * name string.
		 */
		j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);

		/*
		 * An offset greater than the max allowable offset is an error.
		 * It is also an error for any valid entry to point
		 * to a location prior to the end of the current entry, if
		 * it's not a reference to the string of the previous entry.
		 */
		if (j > desc_max || (j != 0 && j <= i)) {
			error = EINVAL;
			goto out;
		}

		/* Also do not let ad_name_offset point to something beyond the size of the input */
		if (input[i].ad_name_offset >= uap->size) {
			error = EINVAL;
			goto out;
		}

		/*
		 * An offset of 0 means use the previous descriptor's offset;
		 * this is used to chain multiple requests for the same file
		 * to avoid multiple lookups.
		 */
		if (j == 0) {
			/* This is not valid for the first entry */
			if (i == 0) {
				error = EINVAL;
				goto out;
			}
			continue;
		}

		/*
		 * If the offset of the string for this descriptor is before
		 * what we believe is the current actual last descriptor,
		 * then we need to adjust our estimate downward; this permits
		 * the string table following the last descriptor to be out
		 * of order relative to the descriptor list.
		 */
		if (j < desc_actual) {
			desc_actual = j;
		}
	}

	/*
	 * We limit the actual number of descriptors we are willing to process
	 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
	 * requested does not exceed this limit,
	 */
	if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
		error = ENOMEM;
		goto out;
	}
	/* One errno_t slot per descriptor; freed with the same size below. */
	result = kheap_alloc(KHEAP_DATA_BUFFERS, desc_actual * sizeof(errno_t),
	    Z_WAITOK | Z_ZERO);
	if (result == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Do the work by iterating over the descriptor entries we know to
	 * at least appear to contain valid data.
	 */
	error = 0;
	for (i = 0; i < desc_actual; i++) {
		/*
		 * If the ad_name_offset is 0, then we use the previous
		 * results to make the check; otherwise, we are looking up
		 * a new file name.
		 */
		if (input[i].ad_name_offset != 0) {
			/* discard old vnodes */
			if (vp) {
				vnode_put(vp);
				vp = NULL;
			}
			if (dvp) {
				vnode_put(dvp);
				dvp = NULL;
			}

			/*
			 * Scan forward in the descriptor list to see if we
			 * need the parent vnode.  We will need it if we are
			 * deleting, since we must have rights to remove
			 * entries in the parent directory, as well as the
			 * rights to delete the object itself.
			 */
			wantdelete = input[i].ad_flags & _DELETE_OK;
			for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
				if (input[j].ad_flags & _DELETE_OK) {
					wantdelete = 1;
				}
			}

			niopts = FOLLOW | AUDITVNPATH1;

			/* need parent for vnode_authorize for deletion test */
			if (wantdelete) {
				niopts |= WANTPARENT;
			}

			/* do the lookup */
			NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
			    CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
			    &context);
			error = namei(&nd);
			if (!error) {
				vp = nd.ni_vp;
				if (wantdelete) {
					dvp = nd.ni_dvp;
				}
			}
			nameidone(&nd);
		}

		/*
		 * Handle lookup errors.  These four errors are reported
		 * per-entry; anything else aborts the whole call.
		 */
		switch (error) {
		case ENOENT:
		case EACCES:
		case EPERM:
		case ENOTDIR:
			result[i] = error;
			break;
		case 0:
			/* run this access check */
			result[i] = access1(vp, dvp, input[i].ad_flags, &context);
			break;
		default:
			/* fatal lookup error */

			goto out;
		}
	}

	AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);

	/* copy out results */
	error = copyout(result, uap->results, desc_actual * sizeof(errno_t));

out:
	if (input && input != stack_input) {
		kheap_free(KHEAP_DATA_BUFFERS, input, uap->size);
	}
	if (result) {
		kheap_free(KHEAP_DATA_BUFFERS, result, desc_actual * sizeof(errno_t));
	}
	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	if (IS_VALID_CRED(context.vc_ucred)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6212
6213
6214 /*
6215 * Returns: 0 Success
6216 * namei:EFAULT Bad address
6217 * namei:ENAMETOOLONG Filename too long
6218 * namei:ENOENT No such file or directory
6219 * namei:ELOOP Too many levels of symbolic links
6220 * namei:EBADF Bad file descriptor
6221 * namei:ENOTDIR Not a directory
6222 * namei:???
6223 * access1:
6224 */
/*
 * Common implementation for access(2) and faccessat(2): resolve `path'
 * relative to `fd' and run the access1() permission check on the result.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;
#if NAMEDRSRCFORK
	int is_namedstream = 0;
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* Takes a credential reference; dropped at `out'. */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* Borrowed from the caller's context; not unref'ed below. */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* The parent was only taken (WANTPARENT) for the deletion test. */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6303
6304 int
6305 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6306 {
6307 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6308 uap->path, uap->flags, 0, UIO_USERSPACE);
6309 }
6310
6311 int
6312 faccessat(__unused proc_t p, struct faccessat_args *uap,
6313 __unused int32_t *retval)
6314 {
6315 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6316 return EINVAL;
6317 }
6318
6319 return faccessat_internal(vfs_context_current(), uap->fd,
6320 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6321 }
6322
6323 /*
6324 * Returns: 0 Success
6325 * EFAULT
6326 * copyout:EFAULT
6327 * namei:???
6328 * vn_stat:???
6329 */
6330 static int
6331 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6332 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6333 enum uio_seg segflg, int fd, int flag)
6334 {
6335 struct nameidata nd;
6336 int follow;
6337 union {
6338 struct stat sb;
6339 struct stat64 sb64;
6340 } source = {};
6341 union {
6342 struct user64_stat user64_sb;
6343 struct user32_stat user32_sb;
6344 struct user64_stat64 user64_sb64;
6345 struct user32_stat64 user32_sb64;
6346 } dest = {};
6347 caddr_t sbp;
6348 int error, my_size;
6349 kauth_filesec_t fsec;
6350 size_t xsecurity_bufsize;
6351 void * statptr;
6352 struct fileproc *fp = NULL;
6353 int needsrealdev = 0;
6354
6355 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6356 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6357 segflg, path, ctx);
6358
6359 #if NAMEDRSRCFORK
6360 int is_namedstream = 0;
6361 /* stat calls are allowed for resource forks. */
6362 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6363 #endif
6364
6365 if (flag & AT_FDONLY) {
6366 vnode_t fvp;
6367
6368 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6369 if (error) {
6370 return error;
6371 }
6372 if ((error = vnode_getwithref(fvp))) {
6373 file_drop(fd);
6374 return error;
6375 }
6376 nd.ni_vp = fvp;
6377 } else {
6378 error = nameiat(&nd, fd);
6379 if (error) {
6380 return error;
6381 }
6382 }
6383 fsec = KAUTH_FILESEC_NONE;
6384
6385 statptr = (void *)&source;
6386
6387 #if NAMEDRSRCFORK
6388 /* Grab reference on the shadow stream file vnode to
6389 * force an inactive on release which will mark it
6390 * for recycle.
6391 */
6392 if (vnode_isnamedstream(nd.ni_vp) &&
6393 (nd.ni_vp->v_parent != NULLVP) &&
6394 vnode_isshadow(nd.ni_vp)) {
6395 is_namedstream = 1;
6396 vnode_ref(nd.ni_vp);
6397 }
6398 #endif
6399
6400 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6401 if (fp && (xsecurity == USER_ADDR_NULL)) {
6402 /*
6403 * If the caller has the file open, and is not
6404 * requesting extended security information, we are
6405 * going to let them get the basic stat information.
6406 */
6407 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6408 fp->fp_glob->fg_cred);
6409 } else {
6410 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6411 isstat64, needsrealdev, ctx);
6412 }
6413
6414 #if NAMEDRSRCFORK
6415 if (is_namedstream) {
6416 vnode_rele(nd.ni_vp);
6417 }
6418 #endif
6419 vnode_put(nd.ni_vp);
6420 nameidone(&nd);
6421 if (fp) {
6422 file_drop(fd);
6423 fp = NULL;
6424 }
6425
6426 if (error) {
6427 return error;
6428 }
6429 /* Zap spare fields */
6430 if (isstat64 != 0) {
6431 source.sb64.st_lspare = 0;
6432 source.sb64.st_qspare[0] = 0LL;
6433 source.sb64.st_qspare[1] = 0LL;
6434 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6435 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6436 my_size = sizeof(dest.user64_sb64);
6437 sbp = (caddr_t)&dest.user64_sb64;
6438 } else {
6439 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6440 my_size = sizeof(dest.user32_sb64);
6441 sbp = (caddr_t)&dest.user32_sb64;
6442 }
6443 /*
6444 * Check if we raced (post lookup) against the last unlink of a file.
6445 */
6446 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6447 source.sb64.st_nlink = 1;
6448 }
6449 } else {
6450 source.sb.st_lspare = 0;
6451 source.sb.st_qspare[0] = 0LL;
6452 source.sb.st_qspare[1] = 0LL;
6453 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6454 munge_user64_stat(&source.sb, &dest.user64_sb);
6455 my_size = sizeof(dest.user64_sb);
6456 sbp = (caddr_t)&dest.user64_sb;
6457 } else {
6458 munge_user32_stat(&source.sb, &dest.user32_sb);
6459 my_size = sizeof(dest.user32_sb);
6460 sbp = (caddr_t)&dest.user32_sb;
6461 }
6462
6463 /*
6464 * Check if we raced (post lookup) against the last unlink of a file.
6465 */
6466 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6467 source.sb.st_nlink = 1;
6468 }
6469 }
6470 if ((error = copyout(sbp, ub, my_size)) != 0) {
6471 goto out;
6472 }
6473
6474 /* caller wants extended security information? */
6475 if (xsecurity != USER_ADDR_NULL) {
6476 /* did we get any? */
6477 if (fsec == KAUTH_FILESEC_NONE) {
6478 if (susize(xsecurity_size, 0) != 0) {
6479 error = EFAULT;
6480 goto out;
6481 }
6482 } else {
6483 /* find the user buffer size */
6484 xsecurity_bufsize = fusize(xsecurity_size);
6485
6486 /* copy out the actual data size */
6487 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6488 error = EFAULT;
6489 goto out;
6490 }
6491
6492 /* if the caller supplied enough room, copy out to it */
6493 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6494 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6495 }
6496 }
6497 }
6498 out:
6499 if (fsec != KAUTH_FILESEC_NONE) {
6500 kauth_filesec_free(fsec);
6501 }
6502 return error;
6503 }
6504
6505 /*
6506 * stat_extended: Get file status; with extended security (ACL).
6507 *
6508 * Parameters: p (ignored)
6509 * uap User argument descriptor (see below)
6510 * retval (ignored)
6511 *
6512 * Indirect: uap->path Path of file to get status from
6513 * uap->ub User buffer (holds file status info)
6514 * uap->xsecurity ACL to get (extended security)
6515 * uap->xsecurity_size Size of ACL
6516 *
6517 * Returns: 0 Success
6518 * !0 errno value
6519 *
6520 */
6521 int
6522 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6523 __unused int32_t *retval)
6524 {
6525 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6526 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6527 0);
6528 }
6529
6530 /*
6531 * Returns: 0 Success
6532 * fstatat_internal:??? [see fstatat_internal() in this file]
6533 */
6534 int
6535 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6536 {
6537 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6538 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6539 }
6540
6541 int
6542 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6543 {
6544 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6545 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6546 }
6547
6548 /*
6549 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6550 *
6551 * Parameters: p (ignored)
6552 * uap User argument descriptor (see below)
6553 * retval (ignored)
6554 *
6555 * Indirect: uap->path Path of file to get status from
6556 * uap->ub User buffer (holds file status info)
6557 * uap->xsecurity ACL to get (extended security)
6558 * uap->xsecurity_size Size of ACL
6559 *
6560 * Returns: 0 Success
6561 * !0 errno value
6562 *
6563 */
6564 int
6565 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6566 {
6567 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6568 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6569 0);
6570 }
6571
6572 /*
6573 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6574 *
6575 * Parameters: p (ignored)
6576 * uap User argument descriptor (see below)
6577 * retval (ignored)
6578 *
6579 * Indirect: uap->path Path of file to get status from
6580 * uap->ub User buffer (holds file status info)
6581 * uap->xsecurity ACL to get (extended security)
6582 * uap->xsecurity_size Size of ACL
6583 *
6584 * Returns: 0 Success
6585 * !0 errno value
6586 *
6587 */
6588 int
6589 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6590 {
6591 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6592 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6593 AT_SYMLINK_NOFOLLOW);
6594 }
6595
6596 /*
6597 * Get file status; this version does not follow links.
6598 */
6599 int
6600 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6601 {
6602 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6603 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6604 }
6605
6606 int
6607 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6608 {
6609 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6610 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6611 }
6612
6613 /*
6614 * lstat64_extended: Get file status; can handle large inode numbers; does not
6615 * follow links; with extended security (ACL).
6616 *
6617 * Parameters: p (ignored)
6618 * uap User argument descriptor (see below)
6619 * retval (ignored)
6620 *
6621 * Indirect: uap->path Path of file to get status from
6622 * uap->ub User buffer (holds file status info)
6623 * uap->xsecurity ACL to get (extended security)
6624 * uap->xsecurity_size Size of ACL
6625 *
6626 * Returns: 0 Success
6627 * !0 errno value
6628 *
6629 */
6630 int
6631 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6632 {
6633 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6634 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6635 AT_SYMLINK_NOFOLLOW);
6636 }
6637
6638 int
6639 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6640 {
6641 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6642 return EINVAL;
6643 }
6644
6645 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6646 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6647 }
6648
6649 int
6650 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6651 __unused int32_t *retval)
6652 {
6653 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6654 return EINVAL;
6655 }
6656
6657 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6658 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6659 }
6660
6661 /*
6662 * Get configurable pathname variables.
6663 *
6664 * Returns: 0 Success
6665 * namei:???
6666 * vn_pathconf:???
6667 *
6668 * Notes: Global implementation constants are intended to be
6669 * implemented in this function directly; all other constants
6670 * are per-FS implementation, and therefore must be handled in
6671 * each respective FS, instead.
6672 *
6673 * XXX We implement some things globally right now that should actually be
6674 * XXX per-FS; we will need to deal with this at some point.
6675 */
6676 /* ARGSUSED */
6677 int
6678 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6679 {
6680 int error;
6681 struct nameidata nd;
6682 vfs_context_t ctx = vfs_context_current();
6683
6684 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6685 UIO_USERSPACE, uap->path, ctx);
6686 error = namei(&nd);
6687 if (error) {
6688 return error;
6689 }
6690
6691 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6692
6693 vnode_put(nd.ni_vp);
6694 nameidone(&nd);
6695 return error;
6696 }
6697
6698 /*
6699 * Return target name of a symbolic link.
6700 */
6701 /* ARGSUSED */
/*
 * Common implementation for readlink(2)/readlinkat(2): resolve `path'
 * (without following the final symlink), authorize a read, and copy the
 * link target into `buf'.  On return *retval holds the byte count
 * actually produced, even when an error is also returned.
 */
static int
readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
    enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
    int *retval)
{
	vnode_t vp;
	uio_t auio;
	int error;
	struct nameidata nd;
	char uio_buf[UIO_SIZEOF(1)];

	/* *retval is an int; reject buffers whose size cannot fit. */
	if (bufsize > INT32_MAX) {
		return EINVAL;
	}

	/* NOFOLLOW: we want the symlink itself, not its target. */
	NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
	    seg, path, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* Single-iovec uio backed by the on-stack buffer. */
	auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, buf, bufsize);
	if (vp->v_type != VLNK) {
		error = EINVAL;
	} else {
#if CONFIG_MACF
		error = mac_vnode_check_readlink(ctx, vp);
#endif
		/* MAC check (if built), then kauth, then the FS read. */
		if (error == 0) {
			error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
			    ctx);
		}
		if (error == 0) {
			error = VNOP_READLINK(vp, auio, ctx);
		}
	}
	vnode_put(vp);

	/* Bytes consumed from the caller's buffer = link length copied. */
	*retval = (int)(bufsize - uio_resid(auio));
	return error;
}
6750
6751 int
6752 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6753 {
6754 enum uio_seg procseg;
6755
6756 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6757 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6758 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6759 uap->count, procseg, retval);
6760 }
6761
6762 int
6763 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6764 {
6765 enum uio_seg procseg;
6766
6767 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6768 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6769 procseg, uap->buf, uap->bufsize, procseg, retval);
6770 }
6771
/*
 * Change file flags, the deep inner layer.
 *
 * Runs the MAC check (if built), authorizes the attribute change while
 * disregarding immutability (so immutable flags can be cleared), then
 * applies the change via the caller-supplied setattr callback and fires
 * the MAC notification on success.  Does not take or drop any vnode
 * references; the caller owns the iocount on `vp'.
 */
static int
chflags0(vnode_t vp, struct vnode_attr *va,
    int (*setattr)(vnode_t, void *, vfs_context_t),
    void *arg, vfs_context_t ctx)
{
	kauth_action_t action = 0;
	int error;

#if CONFIG_MACF
	error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
	if (error) {
		goto out;
	}
#endif

	/* request authorisation, disregard immutability */
	if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
		goto out;
	}
	/*
	 * Request that the auth layer disregard those file flags it's allowed to when
	 * authorizing this operation; we need to do this in order to be able to
	 * clear immutable flags.
	 */
	if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
		goto out;
	}
	error = (*setattr)(vp, arg, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setflags(ctx, vp, va->va_flags);
	}
#endif

out:
	return error;
}
6813
6814 /*
6815 * Change file flags.
6816 *
6817 * NOTE: this will vnode_put() `vp'
6818 */
6819 static int
6820 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6821 {
6822 struct vnode_attr va;
6823 int error;
6824
6825 VATTR_INIT(&va);
6826 VATTR_SET(&va, va_flags, flags);
6827
6828 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6829 vnode_put(vp);
6830
6831 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6832 error = ENOTSUP;
6833 }
6834
6835 return error;
6836 }
6837
6838 /*
6839 * Change flags of a file given a path name.
6840 */
6841 /* ARGSUSED */
6842 int
6843 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6844 {
6845 vnode_t vp;
6846 vfs_context_t ctx = vfs_context_current();
6847 int error;
6848 struct nameidata nd;
6849
6850 AUDIT_ARG(fflags, uap->flags);
6851 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6852 UIO_USERSPACE, uap->path, ctx);
6853 error = namei(&nd);
6854 if (error) {
6855 return error;
6856 }
6857 vp = nd.ni_vp;
6858 nameidone(&nd);
6859
6860 /* we don't vnode_put() here because chflags1 does internally */
6861 error = chflags1(vp, uap->flags, ctx);
6862
6863 return error;
6864 }
6865
6866 /*
6867 * Change flags of a file given a file descriptor.
6868 */
6869 /* ARGSUSED */
6870 int
6871 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6872 {
6873 vnode_t vp;
6874 int error;
6875
6876 AUDIT_ARG(fd, uap->fd);
6877 AUDIT_ARG(fflags, uap->flags);
6878 if ((error = file_vnode(uap->fd, &vp))) {
6879 return error;
6880 }
6881
6882 if ((error = vnode_getwithref(vp))) {
6883 file_drop(uap->fd);
6884 return error;
6885 }
6886
6887 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6888
6889 /* we don't vnode_put() here because chflags1 does internally */
6890 error = chflags1(vp, uap->flags, vfs_context_current());
6891
6892 file_drop(uap->fd);
6893 return error;
6894 }
6895
/*
 * Change security information on a filesystem object.
 *
 * Returns:	0			Success
 *		EPERM			Operation not permitted
 *		vnode_authattr:???	[anything vnode_authattr can return]
 *		vnode_authorize:???	[anything vnode_authorize can return]
 *		vnode_setattr:???	[anything vnode_setattr can return]
 *
 * Notes:	If vnode_authattr or vnode_authorize return EACCES, it will be
 *		translated to EPERM before being returned.
 *
 *		The MAC checks run first; only after the attribute change
 *		succeeds do the matching MAC notifications fire.
 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* One MAC check per attribute class being changed: mode/owner/ACL. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* POSIX: chmod/chown failures report EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* Successful change: notify MAC policies of what actually happened. */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	return error;
}
6975
6976
6977 /*
6978 * Change mode of a file given a path name.
6979 *
6980 * Returns: 0 Success
6981 * namei:??? [anything namei can return]
6982 * chmod_vnode:??? [anything chmod_vnode can return]
6983 */
6984 static int
6985 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6986 int fd, int flag, enum uio_seg segflg)
6987 {
6988 struct nameidata nd;
6989 int follow, error;
6990
/* AT_SYMLINK_NOFOLLOW selects lchmod-like behavior on the final component. */
6991 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6992 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6993 segflg, path, ctx);
/* nameiat() resolves relative to fd (or cwd for AT_FDCWD) and returns an iocounted ni_vp. */
6994 if ((error = nameiat(&nd, fd))) {
6995 return error;
6996 }
6997 error = chmod_vnode(ctx, nd.ni_vp, vap);
6998 vnode_put(nd.ni_vp);
6999 nameidone(&nd);
7000 return error;
7001 }
7002
7003 /*
7004 * chmod_extended: Change the mode of a file given a path name; with extended
7005 * argument list (including extended security (ACL)).
7006 *
7007 * Parameters: p Process requesting the open
7008 * uap User argument descriptor (see below)
7009 * retval (ignored)
7010 *
7011 * Indirect: uap->path Path to object (same as 'chmod')
7012 * uap->uid UID to set
7013 * uap->gid GID to set
7014 * uap->mode File mode to set (same as 'chmod')
7015 * uap->xsecurity ACL to set (or delete)
7016 *
7017 * Returns: 0 Success
7018 * !0 errno value
7019 *
7020 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
7021 *
7022 * XXX: We should enumerate the possible errno values here, and where
7023 * in the code they originated.
7024 */
7025 int
7026 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
7027 {
7028 int error;
7029 struct vnode_attr va;
7030 kauth_filesec_t xsecdst;
7031
7032 AUDIT_ARG(owner, uap->uid, uap->gid);
7033
/* Only attributes the caller actually supplied are marked active in va. */
7034 VATTR_INIT(&va);
7035 if (uap->mode != -1) {
7036 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7037 }
7038 if (uap->uid != KAUTH_UID_NONE) {
7039 VATTR_SET(&va, va_uid, uap->uid);
7040 }
7041 if (uap->gid != KAUTH_GID_NONE) {
7042 VATTR_SET(&va, va_gid, uap->gid);
7043 }
7044
/*
 * xsecurity sentinel handling: (void *)1 means "remove the ACL"
 * (va_acl set active with a NULL value); NULL means "leave ACL alone";
 * anything else is a user pointer to a filesec to copy in.
 */
7045 xsecdst = NULL;
7046 switch (uap->xsecurity) {
7047 /* explicit remove request */
7048 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7049 VATTR_SET(&va, va_acl, NULL);
7050 break;
7051 /* not being set */
7052 case USER_ADDR_NULL:
7053 break;
7054 default:
7055 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
7056 return error;
7057 }
/* va_acl points into xsecdst; xsecdst must stay alive until after chmodat(). */
7058 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7059 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
7060 }
7061
7062 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
7063 UIO_USERSPACE);
7064
7065 if (xsecdst != NULL) {
7066 kauth_filesec_free(xsecdst);
7067 }
7068 return error;
7069 }
7070
7071 /*
7072 * Returns: 0 Success
7073 * chmodat:??? [anything chmodat can return]
7074 */
7075 static int
7076 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
7077 int flag, enum uio_seg segflg)
7078 {
7079 struct vnode_attr va;
7080
/* Wrap a plain mode into a vnode_attr; non-permission bits are masked off. */
7081 VATTR_INIT(&va);
7082 VATTR_SET(&va, va_mode, mode & ALLPERMS);
7083
7084 return chmodat(ctx, path, &va, fd, flag, segflg);
7085 }
7086
7087 int
7088 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
7089 {
/* chmod(2): fchmodat with cwd-relative lookup, symlinks followed. */
7090 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7091 AT_FDCWD, 0, UIO_USERSPACE);
7092 }
7093
7094 int
7095 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
7096 {
/* AT_SYMLINK_NOFOLLOW is the only flag accepted; anything else is EINVAL. */
7097 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7098 return EINVAL;
7099 }
7100
7101 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
7102 uap->fd, uap->flag, UIO_USERSPACE);
7103 }
7104
7105 /*
7106 * Change mode of a file given a file descriptor.
7107 */
7108 static int
7109 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
7110 {
7111 vnode_t vp;
7112 int error;
7113
7114 AUDIT_ARG(fd, fd);
7115
/* Map fd -> vnode; file_vnode() takes a file reference released via file_drop(). */
7116 if ((error = file_vnode(fd, &vp)) != 0) {
7117 return error;
7118 }
/* Take an iocount on the vnode for the duration of the attribute change. */
7119 if ((error = vnode_getwithref(vp)) != 0) {
7120 file_drop(fd);
7121 return error;
7122 }
7123 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7124
7125 error = chmod_vnode(vfs_context_current(), vp, vap);
7126 (void)vnode_put(vp);
7127 file_drop(fd);
7128
7129 return error;
7130 }
7131
7132 /*
7133 * fchmod_extended: Change mode of a file given a file descriptor; with
7134 * extended argument list (including extended security (ACL)).
7135 *
7136 * Parameters: p Process requesting to change file mode
7137 * uap User argument descriptor (see below)
7138 * retval (ignored)
7139 *
7140 * Indirect: uap->mode File mode to set (same as 'chmod')
7141 * uap->uid UID to set
7142 * uap->gid GID to set
7143 * uap->xsecurity ACL to set (or delete)
7144 * uap->fd File descriptor of file to change mode
7145 *
7146 * Returns: 0 Success
7147 * !0 errno value
7148 *
7149 */
7150 int
7151 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
7152 {
7153 int error;
7154 struct vnode_attr va;
7155 kauth_filesec_t xsecdst;
7156
7157 AUDIT_ARG(owner, uap->uid, uap->gid);
7158
7159 VATTR_INIT(&va);
7160 if (uap->mode != -1) {
7161 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7162 }
7163 if (uap->uid != KAUTH_UID_NONE) {
7164 VATTR_SET(&va, va_uid, uap->uid);
7165 }
7166 if (uap->gid != KAUTH_GID_NONE) {
7167 VATTR_SET(&va, va_gid, uap->gid);
7168 }
7169
/*
 * xsecurity sentinels here differ from chmod_extended(): both NULL and
 * (void *)1 request ACL removal, while -1 means "not being set".
 */
7170 xsecdst = NULL;
7171 switch (uap->xsecurity) {
7172 case USER_ADDR_NULL:
7173 VATTR_SET(&va, va_acl, NULL);
7174 break;
7175 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
7176 VATTR_SET(&va, va_acl, NULL);
7177 break;
7178 /* not being set */
7179 case CAST_USER_ADDR_T(-1):
7180 break;
7181 default:
7182 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
7183 return error;
7184 }
/* va_acl points into xsecdst; freed below only in the default case. */
7185 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7186 }
7187
7188 error = fchmod1(p, uap->fd, &va);
7189
7190
7191 switch (uap->xsecurity) {
7192 case USER_ADDR_NULL:
7193 case CAST_USER_ADDR_T(-1):
7194 break;
7195 default:
7196 if (xsecdst != NULL) {
7197 kauth_filesec_free(xsecdst);
7198 }
7199 }
7200 return error;
7201 }
7202
7203 int
7204 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
7205 {
7206 struct vnode_attr va;
7207
/* fchmod(2): only the permission bits are settable through this path. */
7208 VATTR_INIT(&va);
7209 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
7210
7211 return fchmod1(p, uap->fd, &va);
7212 }
7213
7214
7215 /*
7216 * Set ownership given a path name.
7217 */
7218 /* ARGSUSED */
7219 static int
7220 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
7221 gid_t gid, int flag, enum uio_seg segflg)
7222 {
7223 vnode_t vp;
7224 struct vnode_attr va;
7225 int error;
7226 struct nameidata nd;
7227 int follow;
7228 kauth_action_t action;
7229
7230 AUDIT_ARG(owner, uid, gid);
7231
7232 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7233 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
7234 path, ctx);
7235 error = nameiat(&nd, fd);
7236 if (error) {
7237 return error;
7238 }
/* ni_vp comes back iocounted; released at the end via vnode_put(). */
7239 vp = nd.ni_vp;
7240
7241 nameidone(&nd);
7242
/* VNOVAL means "don't change" for each of uid/gid. */
7243 VATTR_INIT(&va);
7244 if (uid != (uid_t)VNOVAL) {
7245 VATTR_SET(&va, va_uid, uid);
7246 }
7247 if (gid != (gid_t)VNOVAL) {
7248 VATTR_SET(&va, va_gid, gid);
7249 }
7250
7251 #if CONFIG_MACF
7252 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
7253 if (error) {
7254 goto out;
7255 }
7256 #endif
7257
7258 /* preflight and authorize attribute changes */
7259 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7260 goto out;
7261 }
7262 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7263 goto out;
7264 }
7265 error = vnode_setattr(vp, &va, ctx);
7266
7267 #if CONFIG_MACF
7268 if (error == 0) {
7269 mac_vnode_notify_setowner(ctx, vp, uid, gid);
7270 }
7271 #endif
7272
7273 out:
7274 /*
7275 * EACCES is only allowed from namei(); permissions failure should
7276 * return EPERM, so we need to translate the error code.
7277 */
7278 if (error == EACCES) {
7279 error = EPERM;
7280 }
7281
7282 vnode_put(vp);
7283 return error;
7284 }
7285
7286 int
7287 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7288 {
/* chown(2): cwd-relative, symlinks followed. */
7289 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7290 uap->uid, uap->gid, 0, UIO_USERSPACE);
7291 }
7292
7293 int
7294 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7295 {
/* lchown(2): same as chown but operates on the symlink itself. */
7296 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7297 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7298 }
7299
7300 int
7301 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7302 {
/* AT_SYMLINK_NOFOLLOW is the only flag accepted; anything else is EINVAL. */
7303 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7304 return EINVAL;
7305 }
7306
7307 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7308 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7309 }
7310
7311 /*
7312 * Set ownership given a file descriptor.
7313 */
7314 /* ARGSUSED */
7315 int
7316 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
7317 {
7318 struct vnode_attr va;
7319 vfs_context_t ctx = vfs_context_current();
7320 vnode_t vp;
7321 int error;
7322 kauth_action_t action;
7323
7324 AUDIT_ARG(owner, uap->uid, uap->gid);
7325 AUDIT_ARG(fd, uap->fd);
7326
/* fd -> vnode plus iocount; both references dropped on every exit path. */
7327 if ((error = file_vnode(uap->fd, &vp))) {
7328 return error;
7329 }
7330
7331 if ((error = vnode_getwithref(vp))) {
7332 file_drop(uap->fd);
7333 return error;
7334 }
7335 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7336
/* VNOVAL means "don't change" for each of uid/gid. */
7337 VATTR_INIT(&va);
7338 if (uap->uid != VNOVAL) {
7339 VATTR_SET(&va, va_uid, uap->uid);
7340 }
7341 if (uap->gid != VNOVAL) {
7342 VATTR_SET(&va, va_gid, uap->gid);
7343 }
7344
7345 #if NAMEDSTREAMS
7346 /* chown calls are not allowed for resource forks. */
7347 if (vp->v_flag & VISNAMEDSTREAM) {
7348 error = EPERM;
7349 goto out;
7350 }
7351 #endif
7352
7353 #if CONFIG_MACF
7354 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
7355 if (error) {
7356 goto out;
7357 }
7358 #endif
7359
7360 /* preflight and authorize attribute changes */
7361 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7362 goto out;
7363 }
7364 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7365 if (error == EACCES) {
/* POSIX: permission failure on ownership change reports EPERM. */
7366 error = EPERM;
7367 }
7368 goto out;
7369 }
7370 error = vnode_setattr(vp, &va, ctx);
7371
7372 #if CONFIG_MACF
7373 if (error == 0) {
7374 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
7375 }
7376 #endif
7377
7378 out:
7379 (void)vnode_put(vp);
7380 file_drop(uap->fd);
7381 return error;
7382 }
7383
7384 static int
7385 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7386 {
7387 int error;
7388
/*
 * Fill tsp[0] (access) and tsp[1] (modify) from the user-supplied
 * timeval pair at usrtvp, or from the current time if usrtvp is NULL.
 * The copyin layout depends on the calling process's word size.
 */
7389 if (usrtvp == USER_ADDR_NULL) {
7390 struct timeval old_tv;
7391 /* XXX Y2038 bug because of microtime argument */
7392 microtime(&old_tv);
7393 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7394 tsp[1] = tsp[0];
7395 } else {
7396 if (IS_64BIT_PROCESS(current_proc())) {
7397 struct user64_timeval tv[2];
7398 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7399 if (error) {
7400 return error;
7401 }
7402 TIMEVAL64_TO_TIMESPEC(&tv[0], &tsp[0]);
7403 TIMEVAL64_TO_TIMESPEC(&tv[1], &tsp[1]);
7404 } else {
7405 struct user32_timeval tv[2];
7406 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7407 if (error) {
7408 return error;
7409 }
7410 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7411 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7412 }
7413 }
7414 return 0;
7415 }
7416
7417 static int
7418 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
7419 int nullflag)
7420 {
7421 int error;
7422 struct vnode_attr va;
7423 kauth_action_t action;
7424
/*
 * Apply access/modify times (ts[0]/ts[1]) to an iocounted vnode.
 * nullflag is set when the caller passed no times (utimes(path, NULL)),
 * which relaxes the auth requirement via VA_UTIMES_NULL.
 */
7425 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7426
7427 VATTR_INIT(&va);
7428 VATTR_SET(&va, va_access_time, ts[0]);
7429 VATTR_SET(&va, va_modify_time, ts[1]);
7430 if (nullflag) {
7431 va.va_vaflags |= VA_UTIMES_NULL;
7432 }
7433
7434 #if NAMEDSTREAMS
7435 /* utimes calls are not allowed for resource forks. */
7436 if (vp->v_flag & VISNAMEDSTREAM) {
7437 error = EPERM;
7438 goto out;
7439 }
7440 #endif
7441
7442 #if CONFIG_MACF
7443 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
7444 if (error) {
7445 goto out;
7446 }
7447 #endif
7448 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
/* Explicit times: permission failures become EPERM, per POSIX. */
7449 if (!nullflag && error == EACCES) {
7450 error = EPERM;
7451 }
7452 goto out;
7453 }
7454
7455 /* since we may not need to auth anything, check here */
7456 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7457 if (!nullflag && error == EACCES) {
7458 error = EPERM;
7459 }
7460 goto out;
7461 }
7462 error = vnode_setattr(vp, &va, ctx);
7463
7464 #if CONFIG_MACF
7465 if (error == 0) {
7466 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
7467 }
7468 #endif
7469
7470 out:
7471 return error;
7472 }
7473
7474 /*
7475 * Set the access and modification times of a file.
7476 */
7477 /* ARGSUSED */
7478 int
7479 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
7480 {
7481 struct timespec ts[2];
7482 user_addr_t usrtvp;
7483 int error;
7484 struct nameidata nd;
7485 vfs_context_t ctx = vfs_context_current();
7486
7487 /*
7488 * AUDIT: Needed to change the order of operations to do the
7489 * name lookup first because auditing wants the path.
7490 */
7491 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
7492 UIO_USERSPACE, uap->path, ctx);
7493 error = namei(&nd);
7494 if (error) {
7495 return error;
7496 }
/* nameidone() here; the iocount on ni_vp is still held until the final vnode_put(). */
7497 nameidone(&nd);
7498
7499 /*
7500 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
7501 * the current time instead.
7502 */
7503 usrtvp = uap->tptr;
7504 if ((error = getutimes(usrtvp, ts)) != 0) {
7505 goto out;
7506 }
7507
/* Last argument flags the "touch to now" (NULL tptr) case for setutimes(). */
7508 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
7509
7510 out:
7511 vnode_put(nd.ni_vp);
7512 return error;
7513 }
7514
7515 /*
7516 * Set the access and modification times of a file.
7517 */
7518 /* ARGSUSED */
7519 int
7520 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7521 {
7522 struct timespec ts[2];
7523 vnode_t vp;
7524 user_addr_t usrtvp;
7525 int error;
7526
7527 AUDIT_ARG(fd, uap->fd);
/* Copy in (or synthesize) the times before touching the fd at all. */
7528 usrtvp = uap->tptr;
7529 if ((error = getutimes(usrtvp, ts)) != 0) {
7530 return error;
7531 }
7532 if ((error = file_vnode(uap->fd, &vp)) != 0) {
7533 return error;
7534 }
7535 if ((error = vnode_getwithref(vp))) {
7536 file_drop(uap->fd);
7537 return error;
7538 }
7539
7540 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7541 vnode_put(vp);
7542 file_drop(uap->fd);
7543 return error;
7544 }
7545
7546 /*
7547 * Truncate a file given its path name.
7548 */
7549 /* ARGSUSED */
7550 int
7551 truncate(proc_t p, struct truncate_args *uap, __unused int32_t *retval)
7552 {
7553 vnode_t vp;
7554 struct vnode_attr va;
7555 vfs_context_t ctx = vfs_context_current();
7556 int error;
7557 struct nameidata nd;
7558 kauth_action_t action;
7559 rlim_t fsize_limit;
7560
7561 if (uap->length < 0) {
7562 return EINVAL;
7563 }
7564
/* Enforce RLIMIT_FSIZE: signal SIGXFSZ and fail with EFBIG, per POSIX. */
7565 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE, TRUE);
7566 if ((rlim_t)uap->length > fsize_limit) {
7567 psignal(p, SIGXFSZ);
7568 return EFBIG;
7569 }
7570
7571 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
7572 UIO_USERSPACE, uap->path, ctx);
7573 if ((error = namei(&nd))) {
7574 return error;
7575 }
7576 vp = nd.ni_vp;
7577
7578 nameidone(&nd);
7579
/* Truncation is expressed as a data-size attribute change. */
7580 VATTR_INIT(&va);
7581 VATTR_SET(&va, va_data_size, uap->length);
7582
7583 #if CONFIG_MACF
/* NOCRED: path-based truncate has no file credential, unlike ftruncate. */
7584 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
7585 if (error) {
7586 goto out;
7587 }
7588 #endif
7589
7590 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
7591 goto out;
7592 }
7593 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
7594 goto out;
7595 }
7596 error = vnode_setattr(vp, &va, ctx);
7597
7598 #if CONFIG_MACF
7599 if (error == 0) {
7600 mac_vnode_notify_truncate(ctx, NOCRED, vp);
7601 }
7602 #endif
7603
7604 out:
7605 vnode_put(vp);
7606 return error;
7607 }
7608
7609 /*
7610 * Truncate a file given a file descriptor.
7611 */
7612 /* ARGSUSED */
7613 int
7614 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
7615 {
7616 vfs_context_t ctx = vfs_context_current();
7617 struct vnode_attr va;
7618 vnode_t vp;
7619 struct fileproc *fp;
7620 int error;
7621 int fd = uap->fd;
7622 rlim_t fsize_limit;
7623
7624 AUDIT_ARG(fd, uap->fd);
7625 if (uap->length < 0) {
7626 return EINVAL;
7627 }
7628
/* Enforce RLIMIT_FSIZE: signal SIGXFSZ and fail with EFBIG, per POSIX. */
7629 fsize_limit = proc_limitgetcur(p, RLIMIT_FSIZE, TRUE);
7630 if ((rlim_t)uap->length > fsize_limit) {
7631 psignal(p, SIGXFSZ);
7632 return EFBIG;
7633 }
7634
7635 if ((error = fp_lookup(p, fd, &fp, 0))) {
7636 return error;
7637 }
7638
/* ftruncate also works on POSIX shared memory objects; vnodes continue below. */
7639 switch (FILEGLOB_DTYPE(fp->fp_glob)) {
7640 case DTYPE_PSXSHM:
7641 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
7642 goto out;
7643 case DTYPE_VNODE:
7644 break;
7645 default:
7646 error = EINVAL;
7647 goto out;
7648 }
7649
7650 vp = (vnode_t)fp->fp_glob->fg_data;
7651
/* The descriptor itself must be open for writing; no path-permission re-check. */
7652 if ((fp->fp_glob->fg_flag & FWRITE) == 0) {
7653 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
7654 error = EINVAL;
7655 goto out;
7656 }
7657
7658 if ((error = vnode_getwithref(vp)) != 0) {
7659 goto out;
7660 }
7661
7662 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7663
7664 #if CONFIG_MACF
/* Unlike truncate(), the check is made against the open file's credential. */
7665 error = mac_vnode_check_truncate(ctx,
7666 fp->fp_glob->fg_cred, vp);
7667 if (error) {
7668 (void)vnode_put(vp);
7669 goto out;
7670 }
7671 #endif
7672 VATTR_INIT(&va);
7673 VATTR_SET(&va, va_data_size, uap->length);
7674 error = vnode_setattr(vp, &va, ctx);
7675
7676 #if CONFIG_MACF
7677 if (error == 0) {
7678 mac_vnode_notify_truncate(ctx, fp->fp_glob->fg_cred, vp);
7679 }
7680 #endif
7681
7682 (void)vnode_put(vp);
7683 out:
7684 file_drop(fd);
7685 return error;
7686 }
7687
7688
7689 /*
7690 * Sync an open file with synchronized I/O _file_ integrity completion
7691 */
7692 /* ARGSUSED */
7693 int
7694 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7695 {
/* Cancellation point (MNT_WAIT = full file-integrity sync). */
7696 __pthread_testcancel(1);
7697 return fsync_common(p, uap, MNT_WAIT);
7698 }
7699
7700
7701 /*
7702 * Sync an open file with synchronized I/O _file_ integrity completion
7703 *
7704 * Notes: This is a legacy support function that does not test for
7705 * thread cancellation points.
7706 */
7707 /* ARGSUSED */
7708 int
7709 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7710 {
/* Same as fsync() but without the thread-cancellation test. */
7711 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7712 }
7713
7714
7715 /*
7716 * Sync an open file with synchronized I/O _data_ integrity completion
7717 */
7718 /* ARGSUSED */
7719 int
7720 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7721 {
/* Cancellation point (MNT_DWAIT = data-integrity-only sync). */
7722 __pthread_testcancel(1);
7723 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7724 }
7725
7726
7727 /*
7728 * fsync_common
7729 *
7730 * Common fsync code to support both synchronized I/O file integrity completion
7731 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7732 *
7733 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7734 * will only guarantee that the file data contents are retrievable. If
7735 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
7736 * includes additional metadata unnecessary for retrieving the file data
7737 * contents, such as atime, mtime, ctime, etc., also be committed to stable
7738 * storage.
7739 *
7740 * Parameters: p The process
7741 * uap->fd The descriptor to synchronize
7742 * flags The data integrity flags
7743 *
7744 * Returns: int Success
7745 * fp_getfvp:EBADF Bad file descriptor
7746 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7747 * VNOP_FSYNC:??? unspecified
7748 *
7749 * Notes: We use struct fsync_args because it is a short name, and all
7750 * caller argument structures are otherwise identical.
7751 */
7752 static int
7753 fsync_common(proc_t p, struct fsync_args *uap, int flags)
7754 {
7755 vnode_t vp;
7756 struct fileproc *fp;
7757 vfs_context_t ctx = vfs_context_current();
7758 int error;
7759
7760 AUDIT_ARG(fd, uap->fd);
7761
/* fd -> (fileproc, vnode); the file reference is held across the sync. */
7762 if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
7763 return error;
7764 }
7765 if ((error = vnode_getwithref(vp))) {
7766 file_drop(uap->fd);
7767 return error;
7768 }
7769
7770 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
7771
7772 error = VNOP_FSYNC(vp, flags, ctx);
7773
7774 #if NAMEDRSRCFORK
7775 /* Sync resource fork shadow file if necessary. */
7776 if ((error == 0) &&
7777 (vp->v_flag & VISNAMEDSTREAM) &&
7778 (vp->v_parent != NULLVP) &&
7779 vnode_isshadow(vp) &&
7780 (fp->fp_glob->fg_flag & FWASWRITTEN)) {
7781 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
7782 }
7783 #endif
7784
7785 (void)vnode_put(vp);
7786 file_drop(uap->fd);
7787 return error;
7788 }
7789
7790 /*
7791 * Duplicate files. Source must be a file, target must be a file or
7792 * must not exist.
7793 *
7794 * XXX Copyfile authorisation checking is woefully inadequate, and will not
7795 * perform inheritance correctly.
7796 */
7797 /* ARGSUSED */
7798 int
7799 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
7800 {
7801 vnode_t tvp, fvp, tdvp, sdvp;
7802 struct nameidata fromnd, tond;
7803 int error;
7804 vfs_context_t ctx = vfs_context_current();
7805 #if CONFIG_MACF
7806 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
7807 struct vnode_attr va;
7808 #endif
7809
7810 /* Check that the flags are valid. */
7811
7812 if (uap->flags & ~CPF_MASK) {
7813 return EINVAL;
7814 }
7815
/* Resolve the source; fvp is iocounted until out1. */
7816 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
7817 UIO_USERSPACE, uap->from, ctx);
7818 if ((error = namei(&fromnd))) {
7819 return error;
7820 }
7821 fvp = fromnd.ni_vp;
7822
/*
 * CREATE lookup for the destination: tdvp (parent) always comes back
 * iocounted; tvp is non-NULL only if the target already exists.
 * SAVESTART keeps ni_startdir for the put at "out".
 */
7823 NDINIT(&tond, CREATE, OP_LINK,
7824 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7825 UIO_USERSPACE, uap->to, ctx);
7826 if ((error = namei(&tond))) {
7827 goto out1;
7828 }
7829 tdvp = tond.ni_dvp;
7830 tvp = tond.ni_vp;
7831
/* An existing target is only acceptable with CPF_OVERWRITE. */
7832 if (tvp != NULL) {
7833 if (!(uap->flags & CPF_OVERWRITE)) {
7834 error = EEXIST;
7835 goto out;
7836 }
7837 }
7838
7839 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
7840 error = EISDIR;
7841 goto out;
7842 }
7843
7844 /* This calls existing MAC hooks for open */
7845 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
7846 NULL))) {
7847 goto out;
7848 }
7849
7850 if (tvp) {
7851 /*
7852 * See unlinkat_internal for an explanation of the potential
7853 * ENOENT from the MAC hook but the gist is that the MAC hook
7854 * can fail because vn_getpath isn't able to return the full
7855 * path. We choose to ignore this failure.
7856 */
7857 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
7858 if (error && error != ENOENT) {
7859 goto out;
7860 }
7861 error = 0;
7862 }
7863
7864 #if CONFIG_MACF
7865 VATTR_INIT(&va);
7866 VATTR_SET(&va, va_type, fvp->v_type);
7867 /* Mask off all but regular access permissions */
7868 VATTR_SET(&va, va_mode,
7869 ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
7870 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
7871 if (error) {
7872 goto out;
7873 }
7874 #endif /* CONFIG_MACF */
7875
7876 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
7877 goto out;
7878 }
7879
/* Copying a directory onto itself (source == target's parent) is invalid. */
7880 if (fvp == tdvp) {
7881 error = EINVAL;
7882 }
7883 /*
7884 * If source is the same as the destination (that is the
7885 * same inode number) then there is nothing to do.
7886 * (fixed to have POSIX semantics - CSM 3/2/98)
7887 */
7888 if (fvp == tvp) {
/* -1 is an internal "silent success" sentinel, translated to 0 below. */
7889 error = -1;
7890 }
7891 if (!error) {
7892 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
7893 }
7894 out:
7895 sdvp = tond.ni_startdir;
7896 /*
7897 * nameidone has to happen before we vnode_put(tdvp)
7898 * since it may need to release the fs_nodelock on the tdvp
7899 */
7900 nameidone(&tond);
7901
7902 if (tvp) {
7903 vnode_put(tvp);
7904 }
7905 vnode_put(tdvp);
7906 vnode_put(sdvp);
7907 out1:
7908 vnode_put(fvp);
7909
7910 nameidone(&fromnd);
7911
7912 if (error == -1) {
7913 return 0;
7914 }
7915 return error;
7916 }
7917
7918 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7919
7920 /*
7921 * Helper function for doing clones. The caller is expected to provide an
7922 * iocounted source vnode and release it.
7923 */
7924 static int
7925 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
7926 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
7927 {
7928 vnode_t tvp, tdvp;
7929 struct nameidata tond;
7930 int error;
7931 int follow;
7932 boolean_t free_src_acl;
7933 boolean_t attr_cleanup;
7934 enum vtype v_type;
7935 kauth_action_t action;
7936 struct componentname *cnp;
7937 uint32_t defaulted;
7938 struct vnode_attr va;
7939 struct vnode_attr nva;
7940 uint32_t vnop_flags;
7941
/* Only regular files, symlinks, and (non-root, non-mountpoint) directories clone. */
7942 v_type = vnode_vtype(fvp);
7943 switch (v_type) {
7944 case VLNK:
7945 /* FALLTHRU */
7946 case VREG:
7947 action = KAUTH_VNODE_ADD_FILE;
7948 break;
7949 case VDIR:
7950 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
7951 fvp->v_mountedhere) {
7952 return EINVAL;
7953 }
7954 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
7955 break;
7956 default:
7957 return EINVAL;
7958 }
7959
7960 AUDIT_ARG(fd2, dst_dirfd);
7961 AUDIT_ARG(value32, flags);
7962
/* CREATE lookup for the destination relative to dst_dirfd. */
7963 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7964 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
7965 UIO_USERSPACE, dst, ctx);
7966 if ((error = nameiat(&tond, dst_dirfd))) {
7967 return error;
7968 }
7969 cnp = &tond.ni_cnd;
7970 tdvp = tond.ni_dvp;
7971 tvp = tond.ni_vp;
7972
7973 free_src_acl = FALSE;
7974 attr_cleanup = FALSE;
7975
/* Unlike copyfile(), the clone target must not already exist. */
7976 if (tvp != NULL) {
7977 error = EEXIST;
7978 goto out;
7979 }
7980
/* Clones cannot span filesystems. */
7981 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
7982 error = EXDEV;
7983 goto out;
7984 }
7985
7986 #if CONFIG_MACF
7987 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
7988 goto out;
7989 }
7990 #endif
7991 if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
7992 goto out;
7993 }
7994
/* fclonefileat() already authorised data-read via the open fd; skip re-checking it. */
7995 action = KAUTH_VNODE_GENERIC_READ_BITS;
7996 if (data_read_authorised) {
7997 action &= ~KAUTH_VNODE_READ_DATA;
7998 }
7999 if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
8000 goto out;
8001 }
8002
8003 /*
8004 * certain attributes may need to be changed from the source, we ask for
8005 * those here with the exception of source file's ACL. The clone file
8006 * will inherit the target directory's ACL.
8007 */
8008 VATTR_INIT(&va);
8009 VATTR_WANTED(&va, va_uid);
8010 VATTR_WANTED(&va, va_gid);
8011 VATTR_WANTED(&va, va_mode);
8012 VATTR_WANTED(&va, va_flags);
8013
8014 if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
8015 goto out;
8016 }
8017
8018 VATTR_INIT(&nva);
8019 VATTR_SET(&nva, va_type, v_type);
/* If the getattr returned an ACL anyway, we own it and must free it at "out". */
8020 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
8021 VATTR_SET(&nva, va_acl, va.va_acl);
8022 free_src_acl = TRUE;
8023 }
8024
8025 /* Handle ACL inheritance, initialize vap. */
8026 if (v_type == VLNK) {
8027 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
8028 } else {
8029 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
8030 if (error) {
8031 goto out;
8032 }
/* vn_attribute_prepare succeeded: must be paired with vn_attribute_cleanup. */
8033 attr_cleanup = TRUE;
8034 }
8035
8036 vnop_flags = VNODE_CLONEFILE_DEFAULT;
8037 /*
8038 * We've got initial values for all security parameters,
8039 * If we are superuser, then we can change owners to be the
8040 * same as the source. Both superuser and the owner have default
8041 * WRITE_SECURITY privileges so all other fields can be taken
8042 * from source as well.
8043 */
8044 if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
8045 if (VATTR_IS_SUPPORTED(&va, va_uid)) {
8046 VATTR_SET(&nva, va_uid, va.va_uid);
8047 }
8048 if (VATTR_IS_SUPPORTED(&va, va_gid)) {
8049 VATTR_SET(&nva, va_gid, va.va_gid);
8050 }
8051 } else {
8052 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
8053 }
8054
8055 if (VATTR_IS_SUPPORTED(&va, va_mode)) {
8056 VATTR_SET(&nva, va_mode, va.va_mode);
8057 }
8058 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
/* DATAVAULT/RESTRICTED flags come from the destination, not the source. */
8059 VATTR_SET(&nva, va_flags,
8060 ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
8061 (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
8062 }
8063
8064 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
8065
/* Post-create fixups: unsupported attrs, MAC label, identity, fsevents. */
8066 if (!error && tvp) {
8067 int update_flags = 0;
8068 #if CONFIG_FSE
8069 int fsevent;
8070 #endif /* CONFIG_FSE */
8071
8072 /*
8073 * If some of the requested attributes weren't handled by the
8074 * VNOP, use our fallback code.
8075 */
8076 if (!VATTR_ALL_SUPPORTED(&nva)) {
8077 (void)vnode_setattr_fallback(tvp, &nva, ctx);
8078 }
8079
8080 #if CONFIG_MACF
8081 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
8082 VNODE_LABEL_CREATE, ctx);
8083 #endif
8084
8085 // Make sure the name & parent pointers are hooked up
8086 if (tvp->v_name == NULL) {
8087 update_flags |= VNODE_UPDATE_NAME;
8088 }
8089 if (tvp->v_parent == NULLVP) {
8090 update_flags |= VNODE_UPDATE_PARENT;
8091 }
8092
8093 if (update_flags) {
8094 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
8095 cnp->cn_namelen, cnp->cn_hash, update_flags);
8096 }
8097
8098 #if CONFIG_FSE
8099 switch (vnode_vtype(tvp)) {
8100 case VLNK:
8101 /* FALLTHRU */
8102 case VREG:
8103 fsevent = FSE_CREATE_FILE;
8104 break;
8105 case VDIR:
8106 fsevent = FSE_CREATE_DIR;
8107 break;
8108 default:
8109 goto out;
8110 }
8111
8112 if (need_fsevent(fsevent, tvp)) {
8113 /*
8114 * The following is a sequence of three explicit events.
8115 * A pair of FSE_CLONE events representing the source and destination
8116 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
8117 * fseventsd may coalesce the destination clone and create events
8118 * into a single event resulting in the following sequence for a client
8119 * FSE_CLONE (src)
8120 * FSE_CLONE | FSE_CREATE (dst)
8121 */
8122 add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
8123 FSE_ARG_DONE);
8124 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
8125 FSE_ARG_DONE);
8126 }
8127 #endif /* CONFIG_FSE */
8128 }
8129
8130 out:
8131 if (attr_cleanup) {
8132 vn_attribute_cleanup(&nva, defaulted);
8133 }
8134 if (free_src_acl && va.va_acl) {
8135 kauth_acl_free(va.va_acl);
8136 }
8137 nameidone(&tond);
/* tvp may have been created by VNOP_CLONEFILE; drop it if present. */
8138 if (tvp) {
8139 vnode_put(tvp);
8140 }
8141 vnode_put(tdvp);
8142 return error;
8143 }
8144
8145 /*
8146 * clone files or directories, target must not exist.
8147 */
8148 /* ARGSUSED */
8149 int
8150 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
8151 __unused int32_t *retval)
8152 {
8153 vnode_t fvp;
8154 struct nameidata fromnd;
8155 int follow;
8156 int error;
8157 vfs_context_t ctx = vfs_context_current();
8158
8159 /* Check that the flags are valid. */
8160 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8161 return EINVAL;
8162 }
8163
8164 AUDIT_ARG(fd, uap->src_dirfd);
8165
/* Resolve the source path relative to src_dirfd; fvp comes back iocounted. */
8166 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
8167 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
8168 UIO_USERSPACE, uap->src, ctx);
8169 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
8170 return error;
8171 }
8172
8173 fvp = fromnd.ni_vp;
8174 nameidone(&fromnd);
8175
/* FALSE: data-read on the source has not been authorised yet (path-based entry). */
8176 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
8177 uap->flags, ctx);
8178
8179 vnode_put(fvp);
8180 return error;
8181 }
8182
8183 int
8184 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
8185 __unused int32_t *retval)
8186 {
8187 vnode_t fvp;
8188 struct fileproc *fp;
8189 int error;
8190 vfs_context_t ctx = vfs_context_current();
8191
8192 /* Check that the flags are valid. */
8193 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
8194 return EINVAL;
8195 }
8196
8197 AUDIT_ARG(fd, uap->src_fd);
8198 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
8199 if (error) {
8200 return error;
8201 }
8202
/* The source descriptor must be open for reading. */
8203 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
8204 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
8205 error = EBADF;
8206 goto out;
8207 }
8208
8209 if ((error = vnode_getwithref(fvp))) {
8210 goto out;
8211 }
8212
8213 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
8214
/* TRUE: read-data already authorised at open time, so skip that kauth bit. */
8215 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
8216 uap->flags, ctx);
8217
8218 vnode_put(fvp);
8219 out:
8220 file_drop(uap->src_fd);
8221 return error;
8222 }
8223
8224 static int
8225 rename_submounts_callback(mount_t mp, void *arg)
8226 {
8227 int error = 0;
8228 mount_t pmp = (mount_t)arg;
8229 int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname);
8230
8231 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
8232 return 0;
8233 }
8234
8235 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
8236 return 0;
8237 }
8238
8239 if ((error = vfs_busy(mp, LK_NOWAIT))) {
8240 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
8241 return -1;
8242 }
8243
8244 int pathlen = MAXPATHLEN;
8245 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
8246 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
8247 }
8248
8249 vfs_unbusy(mp);
8250
8251 return error;
8252 }
8253
8254 /*
8255 * Rename files. Source and destination must either both be directories,
8256 * or both not be directories. If target is a directory, it must be empty.
8257 */
8258 /* ARGSUSED */
8259 static int
8260 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
8261 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
8262 {
8263 if (flags & ~VFS_RENAME_FLAGS_MASK) {
8264 return EINVAL;
8265 }
8266
8267 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
8268 return EINVAL;
8269 }
8270
8271 vnode_t tvp, tdvp;
8272 vnode_t fvp, fdvp;
8273 vnode_t mnt_fvp;
8274 struct nameidata *fromnd, *tond;
8275 int error;
8276 int do_retry;
8277 int retry_count;
8278 int mntrename;
8279 int need_event;
8280 int need_kpath2;
8281 int has_listeners;
8282 const char *oname = NULL;
8283 char *from_name = NULL, *to_name = NULL;
8284 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8285 int from_len = 0, to_len = 0;
8286 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8287 int holding_mntlock;
8288 int vn_authorize_skipped;
8289 mount_t locked_mp = NULL;
8290 vnode_t oparent = NULLVP;
8291 #if CONFIG_FSE
8292 fse_info from_finfo, to_finfo;
8293 #endif
8294 int from_truncated = 0, to_truncated = 0;
8295 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8296 int batched = 0;
8297 struct vnode_attr *fvap, *tvap;
8298 int continuing = 0;
8299 /* carving out a chunk for structs that are too big to be on stack. */
8300 struct {
8301 struct nameidata from_node, to_node;
8302 struct vnode_attr fv_attr, tv_attr;
8303 } * __rename_data;
8304 __rename_data = kheap_alloc(KHEAP_TEMP, sizeof(*__rename_data), Z_WAITOK);
8305 fromnd = &__rename_data->from_node;
8306 tond = &__rename_data->to_node;
8307
8308 holding_mntlock = 0;
8309 do_retry = 0;
8310 retry_count = 0;
8311 retry:
8312 fvp = tvp = NULL;
8313 fdvp = tdvp = NULL;
8314 fvap = tvap = NULL;
8315 mnt_fvp = NULLVP;
8316 mntrename = FALSE;
8317 vn_authorize_skipped = FALSE;
8318
8319 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8320 segflg, from, ctx);
8321 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8322
8323 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8324 segflg, to, ctx);
8325 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8326
8327 continue_lookup:
8328 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8329 if ((error = nameiat(fromnd, fromfd))) {
8330 goto out1;
8331 }
8332 fdvp = fromnd->ni_dvp;
8333 fvp = fromnd->ni_vp;
8334
8335 if (fvp && fvp->v_type == VDIR) {
8336 tond->ni_cnd.cn_flags |= WILLBEDIR;
8337 }
8338 }
8339
8340 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8341 if ((error = nameiat(tond, tofd))) {
8342 /*
8343 * Translate error code for rename("dir1", "dir2/.").
8344 */
8345 if (error == EISDIR && fvp->v_type == VDIR) {
8346 error = EINVAL;
8347 }
8348 goto out1;
8349 }
8350 tdvp = tond->ni_dvp;
8351 tvp = tond->ni_vp;
8352 }
8353
8354 #if DEVELOPMENT || DEBUG
8355 /*
8356 * XXX VSWAP: Check for entitlements or special flag here
8357 * so we can restrict access appropriately.
8358 */
8359 #else /* DEVELOPMENT || DEBUG */
8360
8361 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8362 error = EPERM;
8363 goto out1;
8364 }
8365
8366 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8367 error = EPERM;
8368 goto out1;
8369 }
8370 #endif /* DEVELOPMENT || DEBUG */
8371
8372 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8373 error = ENOENT;
8374 goto out1;
8375 }
8376
8377 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8378 int32_t pval = 0;
8379 int err = 0;
8380
8381 /*
8382 * We allow rename with VFS_RENAME_EXCL flag for an existing file which
8383 * has the same name as target iff the following conditions are met:
8384 * 1. the target file system is case insensitive
8385 * 2. source and target directories are the same
8386 * 3. source and target files are the same
8387 * 4. name only differs in case (determined by underlying filesystem)
8388 */
8389 if (fvp != tvp || fdvp != tdvp) {
8390 error = EEXIST;
8391 goto out1;
8392 }
8393
8394 /*
8395 * Assume that the target file system is case sensitive if
8396 * _PC_CASE_SENSITIVE selector isn't supported.
8397 */
8398 err = VNOP_PATHCONF(tvp, _PC_CASE_SENSITIVE, &pval, ctx);
8399 if (err != 0 || pval != 0) {
8400 error = EEXIST;
8401 goto out1;
8402 }
8403 }
8404
8405 batched = vnode_compound_rename_available(fdvp);
8406
8407 #if CONFIG_FSE
8408 need_event = need_fsevent(FSE_RENAME, fdvp);
8409 if (need_event) {
8410 if (fvp) {
8411 get_fse_info(fvp, &from_finfo, ctx);
8412 } else {
8413 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8414 if (error) {
8415 goto out1;
8416 }
8417
8418 fvap = &__rename_data->fv_attr;
8419 }
8420
8421 if (tvp) {
8422 get_fse_info(tvp, &to_finfo, ctx);
8423 } else if (batched) {
8424 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8425 if (error) {
8426 goto out1;
8427 }
8428
8429 tvap = &__rename_data->tv_attr;
8430 }
8431 }
8432 #else
8433 need_event = 0;
8434 #endif /* CONFIG_FSE */
8435
8436 has_listeners = kauth_authorize_fileop_has_listeners();
8437
8438 need_kpath2 = 0;
8439 #if CONFIG_AUDIT
8440 if (AUDIT_RECORD_EXISTS()) {
8441 need_kpath2 = 1;
8442 }
8443 #endif
8444
8445 if (need_event || has_listeners) {
8446 if (from_name == NULL) {
8447 GET_PATH(from_name);
8448 }
8449
8450 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8451
8452 if (from_name_no_firmlink == NULL) {
8453 GET_PATH(from_name_no_firmlink);
8454 }
8455
8456 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8457 }
8458
8459 if (need_event || need_kpath2 || has_listeners) {
8460 if (to_name == NULL) {
8461 GET_PATH(to_name);
8462 }
8463
8464 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8465
8466 if (to_name_no_firmlink == NULL) {
8467 GET_PATH(to_name_no_firmlink);
8468 }
8469
8470 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8471 if (to_name && need_kpath2) {
8472 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8473 }
8474 }
8475 if (!fvp) {
8476 /*
8477 * Claim: this check will never reject a valid rename.
8478 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8479 * Suppose fdvp and tdvp are not on the same mount.
8480 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8481 * then you can't move it to within another dir on the same mountpoint.
8482 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8483 *
8484 * If this check passes, then we are safe to pass these vnodes to the same FS.
8485 */
8486 if (fdvp->v_mount != tdvp->v_mount) {
8487 error = EXDEV;
8488 goto out1;
8489 }
8490 goto skipped_lookup;
8491 }
8492
8493 /*
8494 * If the source and destination are the same (i.e. they're
8495 * links to the same vnode) and the target file system is
8496 * case sensitive, then there is nothing to do.
8497 *
8498 * XXX Come back to this.
8499 */
8500 if (fvp == tvp) {
8501 int pathconf_val;
8502
8503 /*
8504 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8505 * then assume that this file system is case sensitive.
8506 */
8507 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8508 pathconf_val != 0) {
8509 vn_authorize_skipped = TRUE;
8510 goto out1;
8511 }
8512 }
8513
8514 /*
8515 * Allow the renaming of mount points.
8516 * - target must not exist
8517 * - target must reside in the same directory as source
8518 * - union mounts cannot be renamed
8519 * - the root fs, and tightly-linked system volumes, cannot be renamed
8520 *
8521 * XXX Handle this in VFS after a continued lookup (if we missed
8522 * in the cache to start off)
8523 *
8524 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8525 * we'll skip past here. The file system is responsible for
8526 * checking that @tvp is not a descendent of @fvp and vice versa
8527 * so it should always return EINVAL if either @tvp or @fvp is the
8528 * root of a volume.
8529 */
8530 if ((fvp->v_flag & VROOT) &&
8531 (fvp->v_type == VDIR) &&
8532 (tvp == NULL) &&
8533 (fvp->v_mountedhere == NULL) &&
8534 (fdvp == tdvp) &&
8535 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8536 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8537 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8538 vnode_t coveredvp;
8539
8540 /* switch fvp to the covered vnode */
8541 coveredvp = fvp->v_mount->mnt_vnodecovered;
8542 if ((vnode_getwithref(coveredvp))) {
8543 error = ENOENT;
8544 goto out1;
8545 }
8546 /*
8547 * Save the 'fvp' as it is needed for vn_authorize_renamex_with_paths()
8548 * later.
8549 */
8550 mnt_fvp = fvp;
8551
8552 fvp = coveredvp;
8553 mntrename = TRUE;
8554 }
8555 /*
8556 * Check for cross-device rename.
8557 */
8558 if ((fvp->v_mount != tdvp->v_mount) ||
8559 (tvp && (fvp->v_mount != tvp->v_mount))) {
8560 error = EXDEV;
8561 goto out1;
8562 }
8563
8564 /*
8565 * If source is the same as the destination (that is the
8566 * same inode number) then there is nothing to do...
8567 * EXCEPT if the underlying file system supports case
8568 * insensitivity and is case preserving. In this case
8569 * the file system needs to handle the special case of
8570 * getting the same vnode as target (fvp) and source (tvp).
8571 *
8572 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8573 * and _PC_CASE_PRESERVING can have this exception, and they need to
8574 * handle the special case of getting the same vnode as target and
8575 * source. NOTE: Then the target is unlocked going into vnop_rename,
8576 * so not to cause locking problems. There is a single reference on tvp.
8577 *
8578 * NOTE - that fvp == tvp also occurs if they are hard linked and
8579 * that correct behaviour then is just to return success without doing
8580 * anything.
8581 *
8582 * XXX filesystem should take care of this itself, perhaps...
8583 */
8584 if (fvp == tvp && fdvp == tdvp) {
8585 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8586 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8587 fromnd->ni_cnd.cn_namelen)) {
8588 vn_authorize_skipped = TRUE;
8589 goto out1;
8590 }
8591 }
8592
8593 if (holding_mntlock && fvp->v_mount != locked_mp) {
8594 /*
8595 * we're holding a reference and lock
8596 * on locked_mp, but it no longer matches
8597 * what we want to do... so drop our hold
8598 */
8599 mount_unlock_renames(locked_mp);
8600 mount_drop(locked_mp, 0);
8601 holding_mntlock = 0;
8602 }
8603 if (tdvp != fdvp && fvp->v_type == VDIR) {
8604 /*
8605 * serialize renames that re-shape
8606 * the tree... if holding_mntlock is
8607 * set, then we're ready to go...
8608 * otherwise we
8609 * first need to drop the iocounts
8610 * we picked up, second take the
8611 * lock to serialize the access,
8612 * then finally start the lookup
8613 * process over with the lock held
8614 */
8615 if (!holding_mntlock) {
8616 /*
8617 * need to grab a reference on
8618 * the mount point before we
8619 * drop all the iocounts... once
8620 * the iocounts are gone, the mount
8621 * could follow
8622 */
8623 locked_mp = fvp->v_mount;
8624 mount_ref(locked_mp, 0);
8625
8626 /*
8627 * nameidone has to happen before we vnode_put(tvp)
8628 * since it may need to release the fs_nodelock on the tvp
8629 */
8630 nameidone(tond);
8631
8632 if (tvp) {
8633 vnode_put(tvp);
8634 }
8635 vnode_put(tdvp);
8636
8637 /*
8638 * nameidone has to happen before we vnode_put(fdvp)
8639 * since it may need to release the fs_nodelock on the fvp
8640 */
8641 nameidone(fromnd);
8642
8643 vnode_put(fvp);
8644 vnode_put(fdvp);
8645
8646 if (mnt_fvp != NULLVP) {
8647 vnode_put(mnt_fvp);
8648 }
8649
8650 mount_lock_renames(locked_mp);
8651 holding_mntlock = 1;
8652
8653 goto retry;
8654 }
8655 } else {
8656 /*
8657 * when we dropped the iocounts to take
8658 * the lock, we allowed the identity of
8659 * the various vnodes to change... if they did,
8660 * we may no longer be dealing with a rename
8661 * that reshapes the tree... once we're holding
8662 * the iocounts, the vnodes can't change type
8663 * so we're free to drop the lock at this point
8664 * and continue on
8665 */
8666 if (holding_mntlock) {
8667 mount_unlock_renames(locked_mp);
8668 mount_drop(locked_mp, 0);
8669 holding_mntlock = 0;
8670 }
8671 }
8672
8673 if (!batched) {
8674 error = vn_authorize_renamex_with_paths(fdvp, mntrename ? mnt_fvp : fvp,
8675 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8676 flags, NULL);
8677 if (error) {
8678 if (error == ENOENT) {
8679 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8680 /*
8681 * We encountered a race where after doing the namei,
8682 * tvp stops being valid. If so, simply re-drive the rename
8683 * call from the top.
8684 */
8685 do_retry = 1;
8686 retry_count += 1;
8687 }
8688 }
8689 goto out1;
8690 }
8691 }
8692
8693 /* Release the 'mnt_fvp' now that it is no longer needed. */
8694 if (mnt_fvp != NULLVP) {
8695 vnode_put(mnt_fvp);
8696 mnt_fvp = NULLVP;
8697 }
8698
8699 // save these off so we can later verify that fvp is the same
8700 oname = fvp->v_name;
8701 oparent = fvp->v_parent;
8702
8703 skipped_lookup:
8704 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8705 tdvp, &tvp, &tond->ni_cnd, tvap,
8706 flags, ctx);
8707
8708 if (holding_mntlock) {
8709 /*
8710 * we can drop our serialization
8711 * lock now
8712 */
8713 mount_unlock_renames(locked_mp);
8714 mount_drop(locked_mp, 0);
8715 holding_mntlock = 0;
8716 }
8717 if (error) {
8718 if (error == EDATALESS) {
8719 /*
8720 * If we've been here before, something has gone
8721 * horribly wrong and we should just get out lest
8722 * we spiral around the drain forever.
8723 */
8724 if (flags & VFS_RENAME_DATALESS) {
8725 error = EIO;
8726 goto out1;
8727 }
8728
8729 /*
8730 * The object we're renaming is dataless (or has a
8731 * dataless descendent) and requires materialization
8732 * before the rename occurs. But we're holding the
8733 * mount point's rename lock, so it's not safe to
8734 * make the upcall.
8735 *
8736 * In this case, we release the lock, perform the
8737 * materialization, and start the whole thing over.
8738 */
8739 error = vnode_materialize_dataless_file(fvp,
8740 NAMESPACE_HANDLER_RENAME_OP);
8741
8742 if (error == 0) {
8743 /*
8744 * The next time around we need to tell the
8745 * file system that the materializtaion has
8746 * been performed.
8747 */
8748 flags |= VFS_RENAME_DATALESS;
8749 do_retry = 1;
8750 }
8751 goto out1;
8752 }
8753 if (error == EKEEPLOOKING) {
8754 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8755 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8756 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8757 }
8758 }
8759
8760 fromnd->ni_vp = fvp;
8761 tond->ni_vp = tvp;
8762
8763 goto continue_lookup;
8764 }
8765
8766 /*
8767 * We may encounter a race in the VNOP where the destination didn't
8768 * exist when we did the namei, but it does by the time we go and
8769 * try to create the entry. In this case, we should re-drive this rename
8770 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8771 * but other filesystems susceptible to this race could return it, too.
8772 */
8773 if (error == ERECYCLE) {
8774 if (retry_count < MAX_RENAME_ERECYCLE_RETRIES) {
8775 do_retry = 1;
8776 retry_count += 1;
8777 } else {
8778 printf("rename retry limit due to ERECYCLE reached\n");
8779 error = ENOENT;
8780 }
8781 }
8782
8783 /*
8784 * For compound VNOPs, the authorization callback may return
8785 * ENOENT in case of racing hardlink lookups hitting the name
8786 * cache, redrive the lookup.
8787 */
8788 if (batched && error == ENOENT) {
8789 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8790 do_retry = 1;
8791 retry_count += 1;
8792 }
8793 }
8794
8795 goto out1;
8796 }
8797
8798 /* call out to allow 3rd party notification of rename.
8799 * Ignore result of kauth_authorize_fileop call.
8800 */
8801 kauth_authorize_fileop(vfs_context_ucred(ctx),
8802 KAUTH_FILEOP_RENAME,
8803 (uintptr_t)from_name, (uintptr_t)to_name);
8804 if (flags & VFS_RENAME_SWAP) {
8805 kauth_authorize_fileop(vfs_context_ucred(ctx),
8806 KAUTH_FILEOP_RENAME,
8807 (uintptr_t)to_name, (uintptr_t)from_name);
8808 }
8809
8810 #if CONFIG_FSE
8811 if (from_name != NULL && to_name != NULL) {
8812 if (from_truncated || to_truncated) {
8813 // set it here since only the from_finfo gets reported up to user space
8814 from_finfo.mode |= FSE_TRUNCATED_PATH;
8815 }
8816
8817 if (tvap && tvp) {
8818 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8819 }
8820 if (fvap) {
8821 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8822 }
8823
8824 if (tvp) {
8825 add_fsevent(FSE_RENAME, ctx,
8826 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8827 FSE_ARG_FINFO, &from_finfo,
8828 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8829 FSE_ARG_FINFO, &to_finfo,
8830 FSE_ARG_DONE);
8831 if (flags & VFS_RENAME_SWAP) {
8832 /*
8833 * Strictly speaking, swap is the equivalent of
8834 * *three* renames. FSEvents clients should only take
8835 * the events as a hint, so we only bother reporting
8836 * two.
8837 */
8838 add_fsevent(FSE_RENAME, ctx,
8839 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8840 FSE_ARG_FINFO, &to_finfo,
8841 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8842 FSE_ARG_FINFO, &from_finfo,
8843 FSE_ARG_DONE);
8844 }
8845 } else {
8846 add_fsevent(FSE_RENAME, ctx,
8847 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8848 FSE_ARG_FINFO, &from_finfo,
8849 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8850 FSE_ARG_DONE);
8851 }
8852 }
8853 #endif /* CONFIG_FSE */
8854
8855 /*
8856 * update filesystem's mount point data
8857 */
8858 if (mntrename) {
8859 char *cp, *pathend, *mpname;
8860 char * tobuf;
8861 struct mount *mp;
8862 int maxlen;
8863 size_t len = 0;
8864
8865 mp = fvp->v_mountedhere;
8866
8867 if (vfs_busy(mp, LK_NOWAIT)) {
8868 error = EBUSY;
8869 goto out1;
8870 }
8871 tobuf = zalloc(ZV_NAMEI);
8872
8873 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8874 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8875 } else {
8876 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8877 }
8878 if (!error) {
8879 /* find current mount point prefix */
8880 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8881 for (cp = pathend; *cp != '\0'; ++cp) {
8882 if (*cp == '/') {
8883 pathend = cp + 1;
8884 }
8885 }
8886 /* find last component of target name */
8887 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8888 if (*cp == '/') {
8889 mpname = cp + 1;
8890 }
8891 }
8892
8893 /* Update f_mntonname of sub mounts */
8894 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8895
8896 /* append name to prefix */
8897 maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname);
8898 bzero(pathend, maxlen);
8899
8900 strlcpy(pathend, mpname, maxlen);
8901 }
8902 zfree(ZV_NAMEI, tobuf);
8903
8904 vfs_unbusy(mp);
8905
8906 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8907 }
8908 /*
8909 * fix up name & parent pointers. note that we first
8910 * check that fvp has the same name/parent pointers it
8911 * had before the rename call... this is a 'weak' check
8912 * at best...
8913 *
8914 * XXX oparent and oname may not be set in the compound vnop case
8915 */
8916 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8917 int update_flags;
8918
8919 update_flags = VNODE_UPDATE_NAME;
8920
8921 if (fdvp != tdvp) {
8922 update_flags |= VNODE_UPDATE_PARENT;
8923 }
8924
8925 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8926 }
8927 out1:
8928 /*
8929 * There are some cases (for e.g. 'fvp == tvp') when vn_authorize was
8930 * skipped earlier as no actual rename was performed.
8931 */
8932 if (vn_authorize_skipped && error == 0) {
8933 error = vn_authorize_renamex_with_paths(fdvp, fvp,
8934 &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx,
8935 flags, NULL);
8936 if (error && error == ENOENT) {
8937 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8938 do_retry = 1;
8939 retry_count += 1;
8940 }
8941 }
8942 }
8943 if (to_name != NULL) {
8944 RELEASE_PATH(to_name);
8945 to_name = NULL;
8946 }
8947 if (to_name_no_firmlink != NULL) {
8948 RELEASE_PATH(to_name_no_firmlink);
8949 to_name_no_firmlink = NULL;
8950 }
8951 if (from_name != NULL) {
8952 RELEASE_PATH(from_name);
8953 from_name = NULL;
8954 }
8955 if (from_name_no_firmlink != NULL) {
8956 RELEASE_PATH(from_name_no_firmlink);
8957 from_name_no_firmlink = NULL;
8958 }
8959 if (holding_mntlock) {
8960 mount_unlock_renames(locked_mp);
8961 mount_drop(locked_mp, 0);
8962 holding_mntlock = 0;
8963 }
8964 if (tdvp) {
8965 /*
8966 * nameidone has to happen before we vnode_put(tdvp)
8967 * since it may need to release the fs_nodelock on the tdvp
8968 */
8969 nameidone(tond);
8970
8971 if (tvp) {
8972 vnode_put(tvp);
8973 }
8974 vnode_put(tdvp);
8975 }
8976 if (fdvp) {
8977 /*
8978 * nameidone has to happen before we vnode_put(fdvp)
8979 * since it may need to release the fs_nodelock on the fdvp
8980 */
8981 nameidone(fromnd);
8982
8983 if (fvp) {
8984 vnode_put(fvp);
8985 }
8986 vnode_put(fdvp);
8987 }
8988 if (mnt_fvp != NULLVP) {
8989 vnode_put(mnt_fvp);
8990 }
8991 /*
8992 * If things changed after we did the namei, then we will re-drive
8993 * this rename call from the top.
8994 */
8995 if (do_retry) {
8996 do_retry = 0;
8997 goto retry;
8998 }
8999
9000 kheap_free(KHEAP_TEMP, __rename_data, sizeof(*__rename_data));
9001 return error;
9002 }
9003
9004 int
9005 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
9006 {
9007 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
9008 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
9009 }
9010
9011 int
9012 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
9013 {
9014 return renameat_internal(
9015 vfs_context_current(),
9016 uap->fromfd, uap->from,
9017 uap->tofd, uap->to,
9018 UIO_USERSPACE, uap->flags);
9019 }
9020
9021 int
9022 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
9023 {
9024 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
9025 uap->tofd, uap->to, UIO_USERSPACE, 0);
9026 }
9027
9028 /*
9029 * Make a directory file.
9030 *
9031 * Returns: 0 Success
9032 * EEXIST
9033 * namei:???
9034 * vnode_authorize:???
9035 * vn_create:???
9036 */
9037 /* ARGSUSED */
9038 static int
9039 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
9040 enum uio_seg segflg)
9041 {
9042 vnode_t vp, dvp;
9043 int error;
9044 int update_flags = 0;
9045 int batched;
9046 struct nameidata nd;
9047
9048 AUDIT_ARG(mode, vap->va_mode);
9049 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
9050 path, ctx);
9051 nd.ni_cnd.cn_flags |= WILLBEDIR;
9052 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
9053
9054 continue_lookup:
9055 error = nameiat(&nd, fd);
9056 if (error) {
9057 return error;
9058 }
9059 dvp = nd.ni_dvp;
9060 vp = nd.ni_vp;
9061
9062 if (vp != NULL) {
9063 error = EEXIST;
9064 goto out;
9065 }
9066
9067 batched = vnode_compound_mkdir_available(dvp);
9068
9069 VATTR_SET(vap, va_type, VDIR);
9070
9071 /*
9072 * XXX
9073 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
9074 * only get EXISTS or EISDIR for existing path components, and not that it could see
9075 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
9076 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
9077 */
9078 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
9079 if (error == EACCES || error == EPERM) {
9080 int error2;
9081
9082 nameidone(&nd);
9083 vnode_put(dvp);
9084 dvp = NULLVP;
9085
9086 /*
9087 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
9088 * rather than EACCESS if the target exists.
9089 */
9090 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
9091 path, ctx);
9092 error2 = nameiat(&nd, fd);
9093 if (error2) {
9094 goto out;
9095 } else {
9096 vp = nd.ni_vp;
9097 error = EEXIST;
9098 goto out;
9099 }
9100 }
9101
9102 goto out;
9103 }
9104
9105 /*
9106 * make the directory
9107 */
9108 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
9109 if (error == EKEEPLOOKING) {
9110 nd.ni_vp = vp;
9111 goto continue_lookup;
9112 }
9113
9114 goto out;
9115 }
9116
9117 // Make sure the name & parent pointers are hooked up
9118 if (vp->v_name == NULL) {
9119 update_flags |= VNODE_UPDATE_NAME;
9120 }
9121 if (vp->v_parent == NULLVP) {
9122 update_flags |= VNODE_UPDATE_PARENT;
9123 }
9124
9125 if (update_flags) {
9126 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
9127 }
9128
9129 #if CONFIG_FSE
9130 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
9131 #endif
9132
9133 out:
9134 /*
9135 * nameidone has to happen before we vnode_put(dvp)
9136 * since it may need to release the fs_nodelock on the dvp
9137 */
9138 nameidone(&nd);
9139
9140 if (vp) {
9141 vnode_put(vp);
9142 }
9143 if (dvp) {
9144 vnode_put(dvp);
9145 }
9146
9147 return error;
9148 }
9149
9150 /*
9151 * mkdir_extended: Create a directory; with extended security (ACL).
9152 *
9153 * Parameters: p Process requesting to create the directory
9154 * uap User argument descriptor (see below)
9155 * retval (ignored)
9156 *
9157 * Indirect: uap->path Path of directory to create
9158 * uap->mode Access permissions to set
9159 * uap->xsecurity ACL to set
9160 *
9161 * Returns: 0 Success
9162 * !0 Not success
9163 *
9164 */
9165 int
9166 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
9167 {
9168 int ciferror;
9169 kauth_filesec_t xsecdst;
9170 struct vnode_attr va;
9171
9172 AUDIT_ARG(owner, uap->uid, uap->gid);
9173
9174 xsecdst = NULL;
9175 if ((uap->xsecurity != USER_ADDR_NULL) &&
9176 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
9177 return ciferror;
9178 }
9179
9180 VATTR_INIT(&va);
9181 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
9182 if (xsecdst != NULL) {
9183 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
9184 }
9185
9186 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9187 UIO_USERSPACE);
9188 if (xsecdst != NULL) {
9189 kauth_filesec_free(xsecdst);
9190 }
9191 return ciferror;
9192 }
9193
9194 int
9195 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
9196 {
9197 struct vnode_attr va;
9198
9199 VATTR_INIT(&va);
9200 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
9201
9202 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
9203 UIO_USERSPACE);
9204 }
9205
9206 int
9207 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
9208 {
9209 struct vnode_attr va;
9210
9211 VATTR_INIT(&va);
9212 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
9213
9214 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
9215 UIO_USERSPACE);
9216 }
9217
9218 static int
9219 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
9220 enum uio_seg segflg, int unlink_flags)
9221 {
9222 vnode_t vp, dvp;
9223 int error;
9224 struct nameidata nd;
9225 char *path = NULL;
9226 char *no_firmlink_path = NULL;
9227 int len_path = 0;
9228 int len_no_firmlink_path = 0;
9229 int has_listeners = 0;
9230 int need_event = 0;
9231 int truncated_path = 0;
9232 int truncated_no_firmlink_path = 0;
9233 #if CONFIG_FSE
9234 struct vnode_attr va;
9235 #endif /* CONFIG_FSE */
9236 struct vnode_attr *vap = NULL;
9237 int restart_count = 0;
9238 int batched;
9239
9240 int restart_flag;
9241
9242 /*
9243 * This loop exists to restart rmdir in the unlikely case that two
9244 * processes are simultaneously trying to remove the same directory
9245 * containing orphaned appleDouble files.
9246 */
9247 do {
9248 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
9249 segflg, dirpath, ctx);
9250 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
9251 continue_lookup:
9252 restart_flag = 0;
9253 vap = NULL;
9254
9255 error = nameiat(&nd, fd);
9256 if (error) {
9257 return error;
9258 }
9259
9260 dvp = nd.ni_dvp;
9261 vp = nd.ni_vp;
9262
9263 if (vp) {
9264 batched = vnode_compound_rmdir_available(vp);
9265
9266 if (vp->v_flag & VROOT) {
9267 /*
9268 * The root of a mounted filesystem cannot be deleted.
9269 */
9270 error = EBUSY;
9271 goto out;
9272 }
9273
9274 #if DEVELOPMENT || DEBUG
9275 /*
9276 * XXX VSWAP: Check for entitlements or special flag here
9277 * so we can restrict access appropriately.
9278 */
9279 #else /* DEVELOPMENT || DEBUG */
9280
9281 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
9282 error = EPERM;
9283 goto out;
9284 }
9285 #endif /* DEVELOPMENT || DEBUG */
9286
9287 /*
9288 * Removed a check here; we used to abort if vp's vid
9289 * was not the same as what we'd seen the last time around.
9290 * I do not think that check was valid, because if we retry
9291 * and all dirents are gone, the directory could legitimately
9292 * be recycled but still be present in a situation where we would
9293 * have had permission to delete. Therefore, we won't make
9294 * an effort to preserve that check now that we may not have a
9295 * vp here.
9296 */
9297
9298 if (!batched) {
9299 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
9300 if (error) {
9301 if (error == ENOENT) {
9302 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9303 restart_flag = 1;
9304 restart_count += 1;
9305 }
9306 }
9307 goto out;
9308 }
9309 }
9310 } else {
9311 batched = 1;
9312
9313 if (!vnode_compound_rmdir_available(dvp)) {
9314 panic("No error, but no compound rmdir?");
9315 }
9316 }
9317
9318 #if CONFIG_FSE
9319 fse_info finfo = {0};
9320
9321 need_event = need_fsevent(FSE_DELETE, dvp);
9322 if (need_event) {
9323 if (!batched) {
9324 get_fse_info(vp, &finfo, ctx);
9325 } else {
9326 error = vfs_get_notify_attributes(&va);
9327 if (error) {
9328 goto out;
9329 }
9330
9331 vap = &va;
9332 }
9333 }
9334 #endif
9335 has_listeners = kauth_authorize_fileop_has_listeners();
9336 if (need_event || has_listeners) {
9337 if (path == NULL) {
9338 GET_PATH(path);
9339 }
9340
9341 len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
9342
9343 if (no_firmlink_path == NULL) {
9344 GET_PATH(no_firmlink_path);
9345 }
9346
9347 len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
9348 #if CONFIG_FSE
9349 if (truncated_no_firmlink_path) {
9350 finfo.mode |= FSE_TRUNCATED_PATH;
9351 }
9352 #endif
9353 }
9354
9355 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
9356 nd.ni_vp = vp;
9357 if (vp == NULLVP) {
9358 /* Couldn't find a vnode */
9359 goto out;
9360 }
9361
9362 if (error == EKEEPLOOKING) {
9363 goto continue_lookup;
9364 } else if (batched && error == ENOENT) {
9365 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
9366 /*
9367 * For compound VNOPs, the authorization callback
9368 * may return ENOENT in case of racing hard link lookups
9369 * redrive the lookup.
9370 */
9371 restart_flag = 1;
9372 restart_count += 1;
9373 goto out;
9374 }
9375 }
9376
9377 /*
9378 * XXX There's no provision for passing flags
9379 * to VNOP_RMDIR(). So, if vn_rmdir() fails
9380 * because it's not empty, then we try again
9381 * with VNOP_REMOVE(), passing in a special
9382 * flag that clever file systems will know
9383 * how to handle.
9384 */
9385 if (error == ENOTEMPTY &&
9386 (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
9387 /*
9388 * If this fails, we want to keep the original
9389 * error.
9390 */
9391 if (vn_remove(dvp, &vp, &nd,
9392 VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
9393 error = 0;
9394 }
9395 }
9396
9397 #if CONFIG_APPLEDOUBLE
9398 /*
9399 * Special case to remove orphaned AppleDouble
9400 * files. I don't like putting this in the kernel,
9401 * but carbon does not like putting this in carbon either,
9402 * so here we are.
9403 */
9404 if (error == ENOTEMPTY) {
9405 int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
9406 if (ad_error == EBUSY) {
9407 error = ad_error;
9408 goto out;
9409 }
9410
9411
9412 /*
9413 * Assuming everything went well, we will try the RMDIR again
9414 */
9415 if (!ad_error) {
9416 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
9417 }
9418 }
9419 #endif /* CONFIG_APPLEDOUBLE */
9420 /*
9421 * Call out to allow 3rd party notification of delete.
9422 * Ignore result of kauth_authorize_fileop call.
9423 */
9424 if (!error) {
9425 if (has_listeners) {
9426 kauth_authorize_fileop(vfs_context_ucred(ctx),
9427 KAUTH_FILEOP_DELETE,
9428 (uintptr_t)vp,
9429 (uintptr_t)path);
9430 }
9431
9432 if (vp->v_flag & VISHARDLINK) {
9433 // see the comment in unlink1() about why we update
9434 // the parent of a hard link when it is removed
9435 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
9436 }
9437
9438 #if CONFIG_FSE
9439 if (need_event) {
9440 if (vap) {
9441 vnode_get_fse_info_from_vap(vp, &finfo, vap);
9442 }
9443 add_fsevent(FSE_DELETE, ctx,
9444 FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
9445 FSE_ARG_FINFO, &finfo,
9446 FSE_ARG_DONE);
9447 }
9448 #endif
9449 }
9450
9451 out:
9452 if (path != NULL) {
9453 RELEASE_PATH(path);
9454 path = NULL;
9455 }
9456
9457 if (no_firmlink_path != NULL) {
9458 RELEASE_PATH(no_firmlink_path);
9459 no_firmlink_path = NULL;
9460 }
9461
9462 /*
9463 * nameidone has to happen before we vnode_put(dvp)
9464 * since it may need to release the fs_nodelock on the dvp
9465 */
9466 nameidone(&nd);
9467 vnode_put(dvp);
9468
9469 if (vp) {
9470 vnode_put(vp);
9471 }
9472
9473 if (restart_flag == 0) {
9474 wakeup_one((caddr_t)vp);
9475 return error;
9476 }
9477 tsleep(vp, PVFS, "rm AD", 1);
9478 } while (restart_flag != 0);
9479
9480 return error;
9481 }
9482
9483 /*
9484 * Remove a directory file.
9485 */
9486 /* ARGSUSED */
9487 int
9488 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9489 {
9490 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9491 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9492 }
9493
9494 /* Get direntry length padded to 8 byte alignment */
9495 #define DIRENT64_LEN(namlen) \
9496 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
9497
9498 /* Get dirent length padded to 4 byte alignment */
9499 #define DIRENT_LEN(namelen) \
9500 ((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)
9501
9502 /* Get the end of this dirent */
9503 #define DIRENT_END(dep) \
9504 (((char *)(dep)) + (dep)->d_reclen - 1)
9505
9506 errno_t
9507 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9508 int *numdirent, vfs_context_t ctxp)
9509 {
9510 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9511 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9512 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9513 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9514 } else {
9515 size_t bufsize;
9516 void * bufptr;
9517 uio_t auio;
9518 struct direntry *entry64;
9519 struct dirent *dep;
9520 size_t bytesread;
9521 int error;
9522
9523 /*
9524 * We're here because the underlying file system does not
9525 * support direnties or we mounted denying support so we must
9526 * fall back to dirents and convert them to direntries.
9527 *
9528 * Our kernel buffer needs to be smaller since re-packing will
9529 * expand each dirent. The worse case (when the name length
9530 * is 3 or less) corresponds to a struct direntry size of 32
9531 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9532 * (4-byte aligned). So having a buffer that is 3/8 the size
9533 * will prevent us from reading more than we can pack.
9534 *
9535 * Since this buffer is wired memory, we will limit the
9536 * buffer size to a maximum of 32K. We would really like to
9537 * use 32K in the MIN(), but we use magic number 87371 to
9538 * prevent uio_resid() * 3 / 8 from overflowing.
9539 */
9540 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9541 bufptr = kheap_alloc(KHEAP_DATA_BUFFERS, bufsize, Z_WAITOK);
9542 if (bufptr == NULL) {
9543 return ENOMEM;
9544 }
9545
9546 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9547 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9548 auio->uio_offset = uio->uio_offset;
9549
9550 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9551
9552 dep = (struct dirent *)bufptr;
9553 bytesread = bufsize - uio_resid(auio);
9554
9555 entry64 = kheap_alloc(KHEAP_TEMP, sizeof(struct direntry), Z_WAITOK);
9556 /*
9557 * Convert all the entries and copy them out to user's buffer.
9558 */
9559 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9560 /* First check that the dirent struct up to d_name is within the buffer */
9561 if ((char*)dep + offsetof(struct dirent, d_name) > ((char *)bufptr + bytesread) ||
9562 /* Check that the length of the entire dirent is within the buffer */
9563 DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9564 /* Check that the actual length including the name doesn't exceed d_reclen */
9565 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9566 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9567 vp->v_mount->mnt_vfsstat.f_mntonname,
9568 vp->v_name ? vp->v_name : "<unknown>");
9569 error = EIO;
9570 break;
9571 }
9572
9573 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9574
9575 bzero(entry64, enbufsize);
9576 /* Convert a dirent to a dirent64. */
9577 entry64->d_ino = dep->d_ino;
9578 entry64->d_seekoff = 0;
9579 entry64->d_reclen = (uint16_t)enbufsize;
9580 entry64->d_namlen = dep->d_namlen;
9581 entry64->d_type = dep->d_type;
9582 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9583
9584 /* Move to next entry. */
9585 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9586
9587 /* Copy entry64 to user's buffer. */
9588 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9589 }
9590
9591 /* Update the real offset using the offset we got from VNOP_READDIR. */
9592 if (error == 0) {
9593 uio->uio_offset = auio->uio_offset;
9594 }
9595 uio_free(auio);
9596 kheap_free(KHEAP_DATA_BUFFERS, bufptr, bufsize);
9597 kheap_free(KHEAP_TEMP, entry64, sizeof(struct direntry));
9598 return error;
9599 }
9600 }
9601
9602 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9603
9604 /*
9605 * Read a block of directory entries in a file system independent format.
9606 */
9607 static int
9608 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
9609 off_t *offset, int *eofflag, int flags)
9610 {
9611 vnode_t vp;
9612 struct vfs_context context = *vfs_context_current(); /* local copy */
9613 struct fileproc *fp;
9614 uio_t auio;
9615 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9616 off_t loff;
9617 int error, numdirent;
9618 char uio_buf[UIO_SIZEOF(1)];
9619
9620 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
9621 if (error) {
9622 return error;
9623 }
9624 if ((fp->fp_glob->fg_flag & FREAD) == 0) {
9625 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
9626 error = EBADF;
9627 goto out;
9628 }
9629
9630 if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
9631 bufsize = GETDIRENTRIES_MAXBUFSIZE;
9632 }
9633
9634 #if CONFIG_MACF
9635 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->fp_glob);
9636 if (error) {
9637 goto out;
9638 }
9639 #endif
9640 if ((error = vnode_getwithref(vp))) {
9641 goto out;
9642 }
9643 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
9644
9645 unionread:
9646 if (vp->v_type != VDIR) {
9647 (void)vnode_put(vp);
9648 error = EINVAL;
9649 goto out;
9650 }
9651
9652 #if CONFIG_MACF
9653 error = mac_vnode_check_readdir(&context, vp);
9654 if (error != 0) {
9655 (void)vnode_put(vp);
9656 goto out;
9657 }
9658 #endif /* MAC */
9659
9660 loff = fp->fp_glob->fg_offset;
9661 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9662 uio_addiov(auio, bufp, bufsize);
9663
9664 if (flags & VNODE_READDIR_EXTENDED) {
9665 error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
9666 fp->fp_glob->fg_offset = uio_offset(auio);
9667 } else {
9668 error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
9669 fp->fp_glob->fg_offset = uio_offset(auio);
9670 }
9671 if (error) {
9672 (void)vnode_put(vp);
9673 goto out;
9674 }
9675
9676 if ((user_ssize_t)bufsize == uio_resid(auio)) {
9677 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
9678 struct vnode *tvp = vp;
9679 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
9680 vnode_ref(vp);
9681 fp->fp_glob->fg_data = (caddr_t) vp;
9682 fp->fp_glob->fg_offset = 0;
9683 vnode_rele(tvp);
9684 vnode_put(tvp);
9685 goto unionread;
9686 }
9687 vp = tvp;
9688 }
9689 }
9690
9691 vnode_put(vp);
9692 if (offset) {
9693 *offset = loff;
9694 }
9695
9696 *bytesread = bufsize - uio_resid(auio);
9697 out:
9698 file_drop(fd);
9699 return error;
9700 }
9701
9702
9703 int
9704 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9705 {
9706 off_t offset;
9707 ssize_t bytesread;
9708 int error, eofflag;
9709
9710 AUDIT_ARG(fd, uap->fd);
9711 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9712 &bytesread, &offset, &eofflag, 0);
9713
9714 if (error == 0) {
9715 if (proc_is64bit(p)) {
9716 user64_long_t base = (user64_long_t)offset;
9717 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9718 } else {
9719 user32_long_t base = (user32_long_t)offset;
9720 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9721 }
9722 *retval = (int)bytesread;
9723 }
9724 return error;
9725 }
9726
9727 int
9728 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
9729 {
9730 off_t offset;
9731 ssize_t bytesread;
9732 int error, eofflag;
9733 user_size_t bufsize;
9734
9735 AUDIT_ARG(fd, uap->fd);
9736
9737 /*
9738 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
9739 * then the kernel carves out the last 4 bytes to return extended
9740 * information to userspace (namely whether we reached EOF with this call).
9741 */
9742 if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9743 bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
9744 } else {
9745 bufsize = uap->bufsize;
9746 }
9747
9748 error = getdirentries_common(uap->fd, uap->buf, bufsize,
9749 &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);
9750
9751 if (error == 0) {
9752 *retval = bytesread;
9753 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
9754
9755 if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
9756 getdirentries64_flags_t flags = 0;
9757 if (eofflag) {
9758 flags |= GETDIRENTRIES64_EOF;
9759 }
9760 error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
9761 sizeof(flags));
9762 }
9763 }
9764 return error;
9765 }
9766
9767
9768 /*
9769 * Set the mode mask for creation of filesystem nodes.
9770 * XXX implement xsecurity
9771 */
9772 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9773 static int
9774 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9775 {
9776 struct filedesc *fdp;
9777
9778 AUDIT_ARG(mask, newmask);
9779 proc_fdlock(p);
9780 fdp = p->p_fd;
9781 *retval = fdp->fd_cmask;
9782 fdp->fd_cmask = newmask & ALLPERMS;
9783 proc_fdunlock(p);
9784 return 0;
9785 }
9786
9787 /*
9788 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9789 *
9790 * Parameters: p Process requesting to set the umask
9791 * uap User argument descriptor (see below)
9792 * retval umask of the process (parameter p)
9793 *
9794 * Indirect: uap->newmask umask to set
9795 * uap->xsecurity ACL to set
9796 *
9797 * Returns: 0 Success
9798 * !0 Not success
9799 *
9800 */
9801 int
9802 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9803 {
9804 int ciferror;
9805 kauth_filesec_t xsecdst;
9806
9807 xsecdst = KAUTH_FILESEC_NONE;
9808 if (uap->xsecurity != USER_ADDR_NULL) {
9809 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
9810 return ciferror;
9811 }
9812 } else {
9813 xsecdst = KAUTH_FILESEC_NONE;
9814 }
9815
9816 ciferror = umask1(p, uap->newmask, xsecdst, retval);
9817
9818 if (xsecdst != KAUTH_FILESEC_NONE) {
9819 kauth_filesec_free(xsecdst);
9820 }
9821 return ciferror;
9822 }
9823
9824 int
9825 umask(proc_t p, struct umask_args *uap, int32_t *retval)
9826 {
9827 return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
9828 }
9829
9830 /*
9831 * Void all references to file by ripping underlying filesystem
9832 * away from vnode.
9833 */
9834 /* ARGSUSED */
9835 int
9836 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
9837 {
9838 vnode_t vp;
9839 struct vnode_attr va;
9840 vfs_context_t ctx = vfs_context_current();
9841 int error;
9842 struct nameidata nd;
9843
9844 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
9845 uap->path, ctx);
9846 error = namei(&nd);
9847 if (error) {
9848 return error;
9849 }
9850 vp = nd.ni_vp;
9851
9852 nameidone(&nd);
9853
9854 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
9855 error = ENOTSUP;
9856 goto out;
9857 }
9858
9859 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
9860 error = EBUSY;
9861 goto out;
9862 }
9863
9864 #if CONFIG_MACF
9865 error = mac_vnode_check_revoke(ctx, vp);
9866 if (error) {
9867 goto out;
9868 }
9869 #endif
9870
9871 VATTR_INIT(&va);
9872 VATTR_WANTED(&va, va_uid);
9873 if ((error = vnode_getattr(vp, &va, ctx))) {
9874 goto out;
9875 }
9876 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
9877 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
9878 goto out;
9879 }
9880 if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
9881 VNOP_REVOKE(vp, REVOKEALL, ctx);
9882 }
9883 out:
9884 vnode_put(vp);
9885 return error;
9886 }
9887
9888
9889 /*
 * HFS/HFS Plus SPECIFIC SYSTEM CALLS
9891 * The following system calls are designed to support features
9892 * which are specific to the HFS & HFS Plus volume formats
9893 */
9894
9895
9896 /*
9897 * Obtain attribute information on objects in a directory while enumerating
9898 * the directory.
9899 */
9900 /* ARGSUSED */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	/* savecount preserves the caller's requested count across union retries */
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	off_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	char uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	savecount = count;
	/* Takes a reference on fp; dropped via file_drop() at 'out'. */
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}
	if ((fp->fp_glob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->fp_glob);
	if (error) {
		goto out;
	}
#endif

	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	/* Re-checked each union-layer hop: vp may have been replaced. */
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->fp_glob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				/* Repoint the fileglob at the lower directory. */
				vnode_ref_ext(vp, fp->fp_glob->fg_flag & O_EVTONLY, 0);
				fp->fp_glob->fg_data = (caddr_t) vp;
				fp->fp_glob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->fp_glob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	(void)vnode_put(vp);

	/* NOTE(review): error is 0 here after the check above; kept as-is. */
	if (error) {
		goto out;
	}
	fp->fp_glob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag;  /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return error; /* error returned earlier; a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
10045
10046 /*
10047 * Exchange data between two files
10048 */
10049
10050 /* ARGSUSED */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* Look up the first path; holds an iocount on fvp until 'out'. */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* Look up the second path; holds an iocount on svp until 'out'. */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* Caller needs read and write access to both files. */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Capture paths and fsevent info up front, only if someone will
	 * consume them (fsevent watchers or fileop listeners).
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}

		/*
		 * The data swapped, so swap the cached identities (name and
		 * parent) too, under the name cache lock.
		 */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
10201
10202 /*
10203 * Return (in MB) the amount of freespace on the given vnode's volume.
10204 */
10205 uint32_t freespace_mb(vnode_t vp);
10206
10207 uint32_t
10208 freespace_mb(vnode_t vp)
10209 {
10210 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
10211 return (uint32_t)(((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
10212 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
10213 }
10214
10215 #if CONFIG_SEARCHFS
10216
10217 /* ARGSUSED */
10218
10219 int
10220 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
10221 {
10222 vnode_t vp, tvp;
10223 int i, error = 0;
10224 int fserror = 0;
10225 struct nameidata nd;
10226 struct user64_fssearchblock searchblock;
10227 struct searchstate *state;
10228 struct attrlist *returnattrs;
10229 struct timeval timelimit;
10230 void *searchparams1, *searchparams2;
10231 uio_t auio = NULL;
10232 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10233 uint32_t nummatches;
10234 size_t mallocsize;
10235 uint32_t nameiflags;
10236 vfs_context_t ctx = vfs_context_current();
10237 char uio_buf[UIO_SIZEOF(1)];
10238
10239 /* Start by copying in fsearchblock parameter list */
10240 if (IS_64BIT_PROCESS(p)) {
10241 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
10242 timelimit.tv_sec = searchblock.timelimit.tv_sec;
10243 timelimit.tv_usec = searchblock.timelimit.tv_usec;
10244 } else {
10245 struct user32_fssearchblock tmp_searchblock;
10246
10247 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
10248 // munge into 64-bit version
10249 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
10250 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
10251 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
10252 searchblock.maxmatches = tmp_searchblock.maxmatches;
10253 /*
10254 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
10255 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
10256 */
10257 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
10258 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
10259 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
10260 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
10261 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
10262 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
10263 searchblock.searchattrs = tmp_searchblock.searchattrs;
10264 }
10265 if (error) {
10266 return error;
10267 }
10268
10269 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
10270 */
10271 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
10272 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
10273 return EINVAL;
10274 }
10275
10276 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
10277 /* It all has to do into local memory and it's not that big so we might as well put it all together. */
10278 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
10279 /* block. */
10280 /* */
10281 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
10282 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
10283 /* assumes the size is still 556 bytes it will continue to work */
10284
10285 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
10286 sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));
10287
10288 searchparams1 = kheap_alloc(KHEAP_DATA_BUFFERS, mallocsize, Z_WAITOK);
10289
10290 /* Now set up the various pointers to the correct place in our newly allocated memory */
10291
10292 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
10293 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
10294 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));
10295
10296 /* Now copy in the stuff given our local variables. */
10297
10298 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
10299 goto freeandexit;
10300 }
10301
10302 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
10303 goto freeandexit;
10304 }
10305
10306 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
10307 goto freeandexit;
10308 }
10309
10310 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
10311 goto freeandexit;
10312 }
10313
10314 /*
10315 * When searching a union mount, need to set the
10316 * start flag at the first call on each layer to
10317 * reset state for the new volume.
10318 */
10319 if (uap->options & SRCHFS_START) {
10320 state->ss_union_layer = 0;
10321 } else {
10322 uap->options |= state->ss_union_flags;
10323 }
10324 state->ss_union_flags = 0;
10325
10326 /*
10327 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
10328 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
10329 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
10330 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
10331 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
10332 */
10333
10334 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
10335 attrreference_t* string_ref;
10336 u_int32_t* start_length;
10337 user64_size_t param_length;
10338
10339 /* validate searchparams1 */
10340 param_length = searchblock.sizeofsearchparams1;
10341 /* skip the word that specifies length of the buffer */
10342 start_length = (u_int32_t*) searchparams1;
10343 start_length = start_length + 1;
10344 string_ref = (attrreference_t*) start_length;
10345
10346 /* ensure no negative offsets or too big offsets */
10347 if (string_ref->attr_dataoffset < 0) {
10348 error = EINVAL;
10349 goto freeandexit;
10350 }
10351 if (string_ref->attr_length > MAXPATHLEN) {
10352 error = EINVAL;
10353 goto freeandexit;
10354 }
10355
10356 /* Check for pointer overflow in the string ref */
10357 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
10358 error = EINVAL;
10359 goto freeandexit;
10360 }
10361
10362 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
10363 error = EINVAL;
10364 goto freeandexit;
10365 }
10366 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
10367 error = EINVAL;
10368 goto freeandexit;
10369 }
10370 }
10371
10372 /* set up the uio structure which will contain the users return buffer */
10373 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
10374 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
10375
10376 nameiflags = 0;
10377 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
10378 nameiflags |= FOLLOW;
10379 }
10380 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
10381 UIO_USERSPACE, uap->path, ctx);
10382
10383 error = namei(&nd);
10384 if (error) {
10385 goto freeandexit;
10386 }
10387 vp = nd.ni_vp;
10388 nameidone(&nd);
10389
10390 /*
10391 * Switch to the root vnode for the volume
10392 */
10393 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
10394 vnode_put(vp);
10395 if (error) {
10396 goto freeandexit;
10397 }
10398 vp = tvp;
10399
10400 /*
10401 * If it's a union mount, the path lookup takes
10402 * us to the top layer. But we may need to descend
10403 * to a lower layer. For non-union mounts the layer
10404 * is always zero.
10405 */
10406 for (i = 0; i < (int) state->ss_union_layer; i++) {
10407 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
10408 break;
10409 }
10410 tvp = vp;
10411 vp = vp->v_mount->mnt_vnodecovered;
10412 if (vp == NULL) {
10413 vnode_put(tvp);
10414 error = ENOENT;
10415 goto freeandexit;
10416 }
10417 error = vnode_getwithref(vp);
10418 vnode_put(tvp);
10419 if (error) {
10420 goto freeandexit;
10421 }
10422 }
10423
10424 #if CONFIG_MACF
10425 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
10426 if (error) {
10427 vnode_put(vp);
10428 goto freeandexit;
10429 }
10430 #endif
10431
10432
10433 /*
10434 * If searchblock.maxmatches == 0, then skip the search. This has happened
10435 * before and sometimes the underlying code doesnt deal with it well.
10436 */
10437 if (searchblock.maxmatches == 0) {
10438 nummatches = 0;
10439 goto saveandexit;
10440 }
10441
10442 /*
10443 * Allright, we have everything we need, so lets make that call.
10444 *
10445 * We keep special track of the return value from the file system:
10446 * EAGAIN is an acceptable error condition that shouldn't keep us
10447 * from copying out any results...
10448 */
10449
10450 fserror = VNOP_SEARCHFS(vp,
10451 searchparams1,
10452 searchparams2,
10453 &searchblock.searchattrs,
10454 (uint32_t)searchblock.maxmatches,
10455 &timelimit,
10456 returnattrs,
10457 &nummatches,
10458 (uint32_t)uap->scriptcode,
10459 (uint32_t)uap->options,
10460 auio,
10461 (struct searchstate *) &state->ss_fsstate,
10462 ctx);
10463
10464 /*
10465 * If it's a union mount we need to be called again
10466 * to search the mounted-on filesystem.
10467 */
10468 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
10469 state->ss_union_flags = SRCHFS_START;
10470 state->ss_union_layer++; // search next layer down
10471 fserror = EAGAIN;
10472 }
10473
10474 saveandexit:
10475
10476 vnode_put(vp);
10477
10478 /* Now copy out the stuff that needs copying out. That means the number of matches, the
10479 * search state. Everything was already put into he return buffer by the vop call. */
10480
10481 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
10482 goto freeandexit;
10483 }
10484
10485 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
10486 goto freeandexit;
10487 }
10488
10489 error = fserror;
10490
10491 freeandexit:
10492
10493 kheap_free(KHEAP_DATA_BUFFERS, searchparams1, mallocsize);
10494
10495 return error;
10496 } /* end of searchfs system call */
10497
10498 #else /* CONFIG_SEARCHFS */
10499
/*
 * searchfs(2) stub used when the kernel is built without CONFIG_SEARCHFS:
 * the system call always fails with ENOTSUP.
 */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10505
10506 #endif /* CONFIG_SEARCHFS */
10507
10508
10509 #if CONFIG_DATALESS_FILES
10510
10511 /*
10512 * === Namespace Resolver Up-call Mechanism ===
10513 *
10514 * When I/O is performed to a dataless file or directory (read, write,
10515 * lookup-in, etc.), the file system performs an upcall to the namespace
10516 * resolver (filecoordinationd) to materialize the object.
10517 *
10518 * We need multiple up-calls to be in flight at once, and we need these
10519 * up-calls to be interruptible, thus the following implementation:
10520 *
10521 * => The nspace_resolver_request represents the in-kernel request state.
10522 * It contains a request ID, storage space for the errno code returned
10523 * by filecoordinationd, and flags.
10524 *
10525 * => The request ID is simply a global monotonically incrementing 32-bit
10526 * number. Outstanding requests are stored in a hash table, and the
10527 * hash function is extremely simple.
10528 *
10529 * => When an upcall is to be made to filecoordinationd, a request structure
10530 * is allocated on the stack (it is small, and needs to live only during
10531 * the duration of the call to resolve_nspace_item_ext()). It is
10532 * initialized and inserted into the table. Some backpressure from
 * filecoordinationd is applied by limiting the number of entries that
10534 * can be inserted into the table (and thus limiting the number of
10535 * outstanding requests issued to filecoordinationd); waiting for an
10536 * available slot is interruptible.
10537 *
10538 * => Once the request has been inserted into the table, the up-call is made
10539 * to filecoordinationd via a MiG-generated stub. The up-call returns
10540 * immediately and filecoordinationd processes the request asynchronously.
10541 *
 * => The caller now waits for the request to complete. This is achieved by
10543 * sleeping on the address of the request structure and waiting for
10544 * filecoordinationd to mark the request structure as complete. This
10545 * is an interruptible sleep call; if interrupted, the request structure
10546 * is removed from the table and EINTR is returned to the caller. If
10547 * this occurs, an advisory up-call is made to filecoordinationd with
10548 * the request ID to indicate that the request can be aborted or
10549 * de-prioritized at the discretion of filecoordinationd.
10550 *
10551 * => When filecoordinationd has completed the request, it signals completion
10552 * by writing to the vfs.nspace.complete sysctl node. Only a process
10553 * decorated as a namespace resolver can write to this sysctl node. The
10554 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10555 * The request ID is looked up in the table, and if the request is found,
10556 * the error code is stored in the request structure and a wakeup()
10557 * issued on the address of the request structure. If the request is not
10558 * found, we simply drop the completion notification, assuming that the
10559 * caller was interrupted.
10560 *
10561 * => When the waiting thread wakes up, it extracts the error code from the
10562 * request structure, removes the request from the table, and returns the
10563 * error code to the calling function. Fini!
10564 */
10565
/*
 * In-kernel state for one outstanding up-call to filecoordinationd.
 * Lives on the requesting thread's stack for the duration of
 * resolve_nspace_item_ext() and is linked into the request hash table.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink;  /* hash-bucket linkage */
	vnode_t         r_vp;              /* vnode being materialized (requester holds a ref) */
	uint32_t        r_req_id;          /* ID used to match the resolver's completion */
	int             r_resolver_error;  /* errno reported back by filecoordinationd */
	int             r_flags;           /* RRF_* flags below */
};

/* r_flags: set once filecoordinationd has completed the request */
#define RRF_COMPLETE    0x0001
10575
/*
 * Hand out the next resolver request ID from a global, monotonically
 * incrementing 32-bit counter.  IDs are only ever compared for equality,
 * so wrap-around is harmless.  NOTE(review): OSAddAtomic returns the
 * pre-increment value, so the first ID issued is 0 — confirm that is
 * intended (it is benign for equality-only use).
 */
static uint32_t
next_nspace_req_id(void)
{
	static uint32_t next_req_id;

	return OSAddAtomic(1, &next_req_id);
}
10583
#define NSPACE_RESOLVER_REQ_HASHSIZE    32      /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256     /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (capped at MAX_OUTSTANDING). */
static u_int nspace_resolver_request_count;
/* True when some thread is sleeping for a free slot in the table. */
static bool nspace_resolver_request_wait_slot;
static LCK_GRP_DECLARE(nspace_resolver_request_lck_grp, "file namespace resolver");
static LCK_MTX_DECLARE(nspace_resolver_request_hash_mutex,
    &nspace_resolver_request_lck_grp);

/* Mutex protecting the request table and the counters above. */
#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Extremely simple hash: low bits of the request ID select the bucket. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
10604
10605 static struct nspace_resolver_request *
10606 nspace_resolver_req_lookup(uint32_t req_id)
10607 {
10608 struct nspace_resolver_requesthead *bucket;
10609 struct nspace_resolver_request *req;
10610
10611 bucket = NSPACE_RESOLVER_HASH(req_id);
10612 LIST_FOREACH(req, bucket, r_hashlink) {
10613 if (req->r_req_id == req_id) {
10614 return req;
10615 }
10616 }
10617
10618 return NULL;
10619 }
10620
/*
 * Insert a request into the table, applying backpressure on
 * filecoordinationd: if NSPACE_RESOLVER_MAX_OUTSTANDING requests are
 * already outstanding, sleep (interruptibly) until a slot frees up.
 * Returns 0 on success, or the msleep() error (e.g. EINTR) if the wait
 * was interrupted.
 *
 * NOTE(review): callers invoke this with NSPACE_REQ_LOCK() held; msleep
 * drops and re-acquires that mutex while sleeping.
 */
static int
nspace_resolver_req_add(struct nspace_resolver_request *req)
{
	struct nspace_resolver_requesthead *bucket;
	int error;

	while (nspace_resolver_request_count >=
	    NSPACE_RESOLVER_MAX_OUTSTANDING) {
		/* Ask nspace_resolver_req_remove() for a wakeup when a slot frees. */
		nspace_resolver_request_wait_slot = true;
		error = msleep(&nspace_resolver_request_count,
		    &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspacerq", NULL);
		if (error) {
			return error;
		}
	}

	bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
#if DIAGNOSTIC
	assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
#endif /* DIAGNOSTIC */
	LIST_INSERT_HEAD(bucket, req, r_hashlink);
	nspace_resolver_request_count++;

	return 0;
}
10647
10648 static void
10649 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10650 {
10651 struct nspace_resolver_requesthead *bucket;
10652
10653 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10654 #if DIAGNOSTIC
10655 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10656 #endif /* DIAGNOSTIC */
10657 LIST_REMOVE(req, r_hashlink);
10658 nspace_resolver_request_count--;
10659
10660 if (nspace_resolver_request_wait_slot) {
10661 nspace_resolver_request_wait_slot = false;
10662 wakeup(&nspace_resolver_request_count);
10663 }
10664 }
10665
10666 static void
10667 nspace_resolver_req_cancel(uint32_t req_id)
10668 {
10669 kern_return_t kr;
10670 mach_port_t mp;
10671
10672 // Failures here aren't fatal -- the cancellation message
10673 // sent to the resolver is merely advisory.
10674
10675 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10676 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10677 return;
10678 }
10679
10680 kr = send_nspace_resolve_cancel(mp, req_id);
10681 if (kr != KERN_SUCCESS) {
10682 os_log_error(OS_LOG_DEFAULT,
10683 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10684 }
10685
10686 ipc_port_release_send(mp);
10687 }
10688
/*
 * Wait (interruptibly) for filecoordinationd to mark the request
 * complete.  If the sleep is interrupted, the request is failed with
 * EINTR (or ETIMEDOUT for other errors) and an advisory cancellation is
 * sent to the resolver.  The request is always removed from the table
 * before returning.  Returns the request's resolver errno.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: record our own error for the caller. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
		/* On ERESTART, just re-check the completion flag and keep waiting. */
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	if (send_cancel_message) {
		/* Advisory only; see nspace_resolver_req_cancel(). */
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10718
/*
 * Record the resolver's errno in the request, mark it complete, and wake
 * the thread sleeping on it in nspace_resolver_req_wait().
 *
 * NOTE(review): callers invoke this with NSPACE_REQ_LOCK() held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10728
/*
 * Handle a completion notification from filecoordinationd (delivered by
 * a write to the vfs.nspace.complete sysctl).  The request is looked up
 * by ID; if it is no longer in the table the notification is silently
 * dropped — the requester was most likely interrupted.  When the
 * resolver reported success and supplied an orig_gencount, the vnode's
 * current recursive gencount is re-checked and the request is failed
 * with EBUSY if the directory changed during materialization.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error, uint64_t orig_gencount)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		mount_t locked_mp = NULL;

		// NOTE(review): the mount rename lock appears to be taken so
		// the vnode cannot be renamed while the gencount is validated
		// below — confirm intent.
		locked_mp = req->r_vp->v_mount;
		mount_ref(locked_mp, 0);
		mount_lock_renames(locked_mp);

		//
		// if the resolver isn't already returning an error and we have an
		// orig_gencount, then get an iocount on the request vnode and check
		// that the gencount on req->r_vp has not changed.
		//
		// note: a ref was taken on req->r_vp when the request was created
		// and that ref will be dropped by that thread when it wakes up.
		//
		if (resolver_error == 0 &&
		    orig_gencount != 0 &&
		    vnode_getwithref(req->r_vp) == 0) {
			struct vnode_attr va;
			uint64_t cur_gencount;

			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_recursive_gencount);

			if (vnode_getattr(req->r_vp, &va, vfs_context_kernel()) == 0) {
				cur_gencount = va.va_recursive_gencount;
			} else {
				cur_gencount = 0;
			}

			if (resolver_error == 0 && cur_gencount && orig_gencount && cur_gencount != orig_gencount) {
				printf("nspace.complete: gencount changed! (orig %llu cur %llu)\n", orig_gencount, cur_gencount);

				// this error will be returned to the thread that initiated the
				// materialization of req->r_vp.
				resolver_error = EBUSY;

				// note: we explicitly do not return an error to the caller (i.e.
				// the thread that did the materialization) because they said they
				// don't want one.
			}

			vnode_put(req->r_vp);
		}

		mount_unlock_renames(locked_mp);
		mount_drop(locked_mp, 0);

		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();

	return;
}
10796
/* The process currently registered as the namespace resolver (or NULL). */
static struct proc *nspace_resolver_proc;

/*
 * Report (via *is_resolver) whether process p is the currently
 * registered namespace resolver.  Always returns 0.
 */
static int
nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
{
	*is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) ? 1 : 0;
	return 0;
}
10806
/*
 * Register (is_resolver != 0) or unregister the calling process as the
 * namespace resolver.  Requires super-user credentials plus the
 * PRIV_VFS_DATALESS_RESOLVER privilege.  Only one resolver may be
 * registered at a time; a second registration fails with EBUSY.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0. This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	error = priv_check_cred(vfs_context_ucred(ctx),
	    PRIV_VFS_DATALESS_RESOLVER, 0);
	if (error) {
		return error;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
10851
10852 static int
10853 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10854 {
10855 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10856 (p->p_vfs_iopolicy &
10857 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10858 *is_prevented = 1;
10859 } else {
10860 *is_prevented = 0;
10861 }
10862 return 0;
10863 }
10864
10865 static int
10866 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10867 {
10868 if (p->p_lflag & P_LNSPACE_RESOLVER) {
10869 return is_prevented ? 0 : EBUSY;
10870 }
10871
10872 if (is_prevented) {
10873 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10874 } else {
10875 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10876 }
10877 return 0;
10878 }
10879
10880 static int
10881 nspace_materialization_get_thread_state(int *is_prevented)
10882 {
10883 uthread_t ut = get_bsdthread_info(current_thread());
10884
10885 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10886 return 0;
10887 }
10888
10889 static int
10890 nspace_materialization_set_thread_state(int is_prevented)
10891 {
10892 uthread_t ut = get_bsdthread_info(current_thread());
10893
10894 if (is_prevented) {
10895 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10896 } else {
10897 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10898 }
10899 return 0;
10900 }
10901
10902 /* the vfs.nspace branch */
10903 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10904
10905 static int
10906 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10907 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10908 {
10909 struct proc *p = req->p;
10910 int new_value, old_value, changed = 0;
10911 int error;
10912
10913 error = nspace_resolver_get_proc_state(p, &old_value);
10914 if (error) {
10915 return error;
10916 }
10917
10918 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10919 &changed);
10920 if (error == 0 && changed) {
10921 error = nspace_resolver_set_proc_state(p, new_value);
10922 }
10923 return error;
10924 }
10925
10926 /* decorate this process as the dataless file resolver */
10927 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10928 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10929 0, 0, sysctl_nspace_resolver, "I", "");
10930
10931 static int
10932 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10933 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10934 {
10935 struct proc *p = req->p;
10936 int new_value, old_value, changed = 0;
10937 int error;
10938
10939 error = nspace_materialization_get_proc_state(p, &old_value);
10940 if (error) {
10941 return error;
10942 }
10943
10944 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10945 &changed);
10946 if (error == 0 && changed) {
10947 error = nspace_materialization_set_proc_state(p, new_value);
10948 }
10949 return error;
10950 }
10951
10952 /* decorate this process as not wanting to materialize dataless files */
10953 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10954 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10955 0, 0, sysctl_nspace_prevent_materialization, "I", "");
10956
10957 static int
10958 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10959 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10960 {
10961 int new_value, old_value, changed = 0;
10962 int error;
10963
10964 error = nspace_materialization_get_thread_state(&old_value);
10965 if (error) {
10966 return error;
10967 }
10968
10969 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10970 &changed);
10971 if (error == 0 && changed) {
10972 error = nspace_materialization_set_thread_state(new_value);
10973 }
10974 return error;
10975 }
10976
10977 /* decorate this thread as not wanting to materialize dataless files */
10978 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10979 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10980 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
10981
/*
 * Handler for vfs.nspace.complete: filecoordinationd reports request
 * completion here.  Only the registered resolver may write.  The payload
 * is a { req_id, errno } pair of uint32_t's, optionally followed by a
 * uint64_t gencount (read by a second sysctl_io_opaque() call that
 * consumes the next bytes of the same request; its failure is ignored
 * because the gencount is optional).
 */
static int
sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
    __unused int arg2, struct sysctl_req *req)
{
	struct proc *p = req->p;
	uint32_t req_status[2] = { 0, 0 };
	uint64_t gencount = 0;
	int error, is_resolver, changed = 0, gencount_changed;

	error = nspace_resolver_get_proc_state(p, &is_resolver);
	if (error) {
		return error;
	}

	/* Only the decorated resolver process may signal completions. */
	if (!is_resolver) {
		return EPERM;
	}

	error = sysctl_io_opaque(req, req_status, sizeof(req_status),
	    &changed);
	if (error) {
		return error;
	}

	// get the gencount if it was passed
	error = sysctl_io_opaque(req, &gencount, sizeof(gencount),
	    &gencount_changed);
	if (error) {
		gencount = 0;
		// we ignore the error because the gencount was optional
		error = 0;
	}

	/*
	 * req_status[0] is the req_id
	 *
	 * req_status[1] is the errno
	 */
	if (error == 0 && changed) {
		nspace_resolver_req_completed(req_status[0],
		    (int)req_status[1], gencount);
	}
	return error;
}

/* Resolver reports completed reqs here. */
SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
    0, 0, sysctl_nspace_complete, "-", "");
11031
11032 #endif /* CONFIG_DATALESS_FILES */
11033
/*
 * Marker for parameters that are referenced only when
 * CONFIG_DATALESS_FILES is enabled; expands to __unused on kernels
 * built without dataless-file support.
 */
#if CONFIG_DATALESS_FILES
#define __no_dataless_unused    /* nothing */
#else
#define __no_dataless_unused    __unused
#endif
11039
/*
 * Decide whether the given vfs context may materialize dataless files.
 *
 * Returns:
 *   0           -- materialization may proceed
 *   EDEADLK     -- materialization is prevented (kernel context, thread
 *                  or process decorated no-materialize, or the default)
 *   EJUSTRETURN -- the caller is a dataless manipulator; proceed as if
 *                  the object were not dataless
 */
int
vfs_context_dataless_materialization_is_prevented(
	vfs_context_t const ctx __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	proc_t const p = vfs_context_proc(ctx);
	thread_t const t = vfs_context_thread(ctx);
	uthread_t const ut = t ? get_bsdthread_info(t) : NULL;

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}
#endif /* CONFIG_DATALESS_FILES */

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
11096
/*
 * Allocate the hash table of outstanding namespace-resolver requests.
 * No-op on kernels built without CONFIG_DATALESS_FILES.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
11106
/*
 * Tear down resolver registration for process p.  If p is the currently
 * registered resolver, every outstanding request is failed with
 * ETIMEDOUT (waking its requester) and the registration is cleared.
 * Called on resolver exit and from nspace_resolver_set_proc_state()
 * when the resolver unregisters itself.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Fail every request in every hash bucket. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
11132
/*
 * Materialize the dataless object at vp for namespace operation `op'.
 * Convenience wrapper around resolve_nspace_item_ext() with no extra
 * argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
11138
#define DATALESS_RESOLVER_ENTITLEMENT     \
	"com.apple.private.vfs.dataless-resolver"
#define DATALESS_MANIPULATION_ENTITLEMENT \
	"com.apple.private.vfs.dataless-manipulation"

/*
 * Return TRUE if the vfs context is associated with a process entitled
 * for dataless manipulation.
 *
 * XXX Arguably belongs in vfs_subr.c, but is here because of the
 * complication around CONFIG_DATALESS_FILES.
 */
boolean_t
vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
{
#if CONFIG_DATALESS_FILES
	/* Entitlements are checked against the current task, so the
	 * context must belong to the calling thread. */
	assert(ctx->vc_thread == current_thread());
	task_t const task = current_task();
	return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
	       IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
#else
	return false;
#endif /* CONFIG_DATALESS_FILES */
}
11163
/*
 * Materialize the dataless object at vp by issuing an up-call to
 * filecoordinationd and (interruptibly) waiting for completion.  `op'
 * identifies the namespace operation; `arg' is currently unused.
 *
 * Returns 0 on success; EFTYPE for unsupported vnode types; ENOTSUP for
 * snapshot events; EDEADLK/EJUSTRETURN when materialization is
 * prevented for this context; ETIMEDOUT when the resolver cannot be
 * reached; or the errno reported by the resolver.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = vfs_context_dataless_materialization_is_prevented(
		vfs_context_current());
	if (error) {
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process/thread is decorated as no-materialization");
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	path = zalloc(ZV_NAMEI);
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		if ((error = vnode_ref(vp)) == 0) { // take a ref so that the vnode doesn't go away
			req.r_vp = vp;
		} else {
			goto out_release_port;
		}

		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			vnode_rele(req.r_vp);
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		zfree(ZV_NAMEI, path);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);

		vnode_rele(req.r_vp);
	}

out_release_port:
	if (path != NULL) {
		zfree(ZV_NAMEI, path);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
11282
/*
 * Snapshot-event hook; currently a no-op that reports success.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
11289
#if 0
/*
 * Currently compiled out: build a "/.vol/<fsid>/<fileid>" style path for
 * vp into `path' (capacity *len).  On success returns 0; if the vnode's
 * attributes cannot be fetched, writes a placeholder path and returns -1.
 * In both cases *len is set to the formatted length plus one.
 */
static int
build_volfs_path(struct vnode *vp, char *path, int *len)
{
	struct vnode_attr va;
	int ret;

	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_fsid);
	VATTR_WANTED(&va, va_fileid);

	if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
		*len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
		ret = -1;
	} else {
		*len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
		ret = 0;
	}

	return ret;
}
#endif
11312
11313 static unsigned long
11314 fsctl_bogus_command_compat(unsigned long cmd)
11315 {
11316 switch (cmd) {
11317 case IOCBASECMD(FSIOC_SYNC_VOLUME):
11318 return FSIOC_SYNC_VOLUME;
11319 case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
11320 return FSIOC_ROUTEFS_SETROUTEID;
11321 case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
11322 return FSIOC_SET_PACKAGE_EXTS;
11323 case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
11324 return FSIOC_SET_FSTYPENAME_OVERRIDE;
11325 case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
11326 return DISK_CONDITIONER_IOC_GET;
11327 case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
11328 return DISK_CONDITIONER_IOC_SET;
11329 case IOCBASECMD(FSIOC_FIOSEEKHOLE):
11330 return FSIOC_FIOSEEKHOLE;
11331 case IOCBASECMD(FSIOC_FIOSEEKDATA):
11332 return FSIOC_FIOSEEKDATA;
11333 case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
11334 return SPOTLIGHT_IOC_GET_MOUNT_TIME;
11335 case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
11336 return SPOTLIGHT_IOC_GET_LAST_MTIME;
11337 }
11338
11339 return cmd;
11340 }
11341
/*
 * chflags0() callback: ask the filesystem to compare-and-swap the BSD
 * flags via the FSIOC_CAS_BSDFLAGS ioctl.  `arg' is the caller's
 * struct fsioc_cas_bsdflags.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
11347
/*
 * FSIOC_SYNC_VOLUME handler: sync the volume containing vp.  `data'
 * points at the user-supplied FSCTL_SYNC_* request word.  The caller's
 * iocount on vp is always consumed here; *arg_vp is cleared so the
 * caller knows not to release it again.
 */
static int __attribute__((noinline))
handle_sync_volume(vnode_t vp, vnode_t *arg_vp, caddr_t data, vfs_context_t ctx)
{
	struct vfs_attr vfa;
	mount_t mp = vp->v_mount;
	unsigned arg;
	int error;

	/* record vid of vp so we can drop it below. */
	uint32_t vvid = vp->v_id;

	/*
	 * Then grab mount_iterref so that we can release the vnode.
	 * Without this, a thread may call vnode_iterate_prepare then
	 * get into a deadlock because we've never released the root vp
	 */
	error = mount_iterref(mp, 0);
	if (error) {
		return error;
	}
	vnode_put(vp);

	arg = MNT_NOWAIT;
	if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
		arg = MNT_WAIT;
	}

	/*
	 * If the filesystem supports multiple filesystems in a
	 * partition (e.g. APFS volumes in a container), it knows
	 * that the waitfor argument to VFS_SYNC is a set of flags.
	 */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
	    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
	    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
	    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
		arg |= MNT_VOLUME;
	}

	/* issue the sync for this volume */
	(void)sync_callback(mp, &arg);

	/*
	 * Then release the mount_iterref once we're done syncing; it's not
	 * needed for the VNOP_IOCTL below
	 */
	mount_iterdrop(mp);

	/*
	 * NOTE(review): this tests `arg' (now holding MNT_* wait flags)
	 * against FSCTL_SYNC_FULLSYNC rather than testing the original
	 * user-supplied word at *(uint32_t *)data — verify this is the
	 * intended condition.
	 */
	if (arg & FSCTL_SYNC_FULLSYNC) {
		/* re-obtain vnode iocount on the root vp, if possible */
		error = vnode_getwithvid(vp, vvid);
		if (error == 0) {
			error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
			vnode_put(vp);
		}
	}
	/* mark the argument VP as having been released */
	*arg_vp = NULL;
	return error;
}
11410
11411 #if ROUTEFS
11412 static int __attribute__((noinline))
11413 handle_routes(user_addr_t udata)
11414 {
11415 char routepath[MAXPATHLEN];
11416 size_t len = 0;
11417 int error;
11418
11419 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
11420 return error;
11421 }
11422 bzero(routepath, MAXPATHLEN);
11423 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
11424 if (error) {
11425 return error;
11426 }
11427 error = routefs_kernel_mount(routepath);
11428 return error;
11429 }
11430 #endif
11431
11432 static int __attribute__((noinline))
11433 handle_flags(vnode_t vp, caddr_t data, vfs_context_t ctx)
11434 {
11435 struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
11436 struct vnode_attr va;
11437 int error;
11438
11439 VATTR_INIT(&va);
11440 VATTR_SET(&va, va_flags, cas->new_flags);
11441
11442 error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
11443 return error;
11444 }
11445
11446 static int __attribute__((noinline))
11447 handle_auth(vnode_t vp, u_long cmd, caddr_t data, u_long options, vfs_context_t ctx)
11448 {
11449 struct mount *mp = NULL;
11450 errno_t rootauth = 0;
11451
11452 mp = vp->v_mount;
11453
11454 /*
11455 * query the underlying FS and see if it reports something
11456 * sane for this vnode. If volume is authenticated via
11457 * chunklist, leave that for the caller to determine.
11458 */
11459 rootauth = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
11460
11461 return rootauth;
11462 }
11463
/*
 * Make a filesystem-specific control call:
 *
 * Common worker for fsctl(2) and ffsctl(2).  Marshals the ioctl argument
 * (copyin for IOC_IN, zeroed buffer for IOC_OUT, raw udata for IOC_VOID),
 * dispatches generic FSIOC_* commands in-kernel, and passes everything
 * else to the filesystem via VNOP_IOCTL.  On success of an IOC_OUT
 * command, the result buffer is copied back out to `udata`.
 *
 * *arg_vp may be set to NULL on return (see handle_sync_volume) to tell
 * the caller its iocount has already been dropped.
 */
/* ARGSUSED */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* Device special files take the ioctl(2) path, not fsctl. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	/* Remap legacy/mis-encoded command values to their modern form. */
	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Large arguments go to the heap; small ones use the stack buffer. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kheap_alloc(KHEAP_TEMP, size, Z_WAITOK)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kheap_free(KHEAP_TEMP, memp, size);
				}
				return error;
			}
		} else {
			/* Zero-length IOC_IN: pass the user pointer itself by value. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME:
		/* May drop the iocount and NULL out *arg_vp. */
		error = handle_sync_volume(vp, arg_vp, data, ctx);
		break;

	case FSIOC_ROUTEFS_SETROUTEID:
#if ROUTEFS
		error = handle_routes(udata);
#endif
		/* NOTE(review): with ROUTEFS disabled this silently returns 0. */
		break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			if (sizeof(user64_addr_t) > sizeof(user_addr_t)) {
				assert(((user64_package_ext_info *)data)->strings <= UINT32_MAX);
			}
			ext_strings = (user_addr_t)((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Root only: overrides the fstype name reported by statfs. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				int i;
				/* Require a NUL within the first MFSTYPENAMELEN bytes. */
				for (i = 0; i < MFSTYPENAMELEN; i++) {
					if (!data[i]) {
						goto continue_copy;
					}
				}
				/*
				 * Getting here means we have a user data string which has no
				 * NULL termination in its first MFSTYPENAMELEN bytes.
				 * This is bogus, let's avoid strlcpy-ing the read data and
				 * return an error.
				 */
				error = EINVAL;
				goto unlock;
continue_copy:
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				/* Special-case: read-only "mtmfs" gains extended security. */
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty string clears the override (and mtmfs side effects). */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
unlock:
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS:
		error = handle_flags(vp, data, ctx);
		break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* EBUSY unless the caller's fd is the only open of this file. */
		error = 0;
		if (vnode_usecount(vp) > 1) {
			vnode_lock_spin(vp);
			if (vp->v_lflag & VL_HASSTREAMS) {
				/* Ignore usecounts held by associated stream vnodes. */
				if (vnode_isinuse_locked(vp, 1, 1)) {
					error = EBUSY;
				}
			} else if (vnode_usecount(vp) > 1) {
				error = EBUSY;
			}
			vnode_unlock(vp);
		}
	}
	break;

	case FSIOC_EVAL_ROOTAUTH:
		error = handle_auth(vp, cmd, data, options, ctx);
		break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
		case FSIOC_KERNEL_ROOTAUTH:
			/* fcntl(2)-only commands: reject and skip the copyout. */
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, (int)options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kheap_free(KHEAP_TEMP, memp, size);
	}

	return error;
}
11705
11706 /* ARGSUSED */
11707 int
11708 fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
11709 {
11710 int error;
11711 struct nameidata nd;
11712 uint32_t nameiflags;
11713 vnode_t vp = NULL;
11714 vfs_context_t ctx = vfs_context_current();
11715
11716 AUDIT_ARG(cmd, (int)uap->cmd);
11717 AUDIT_ARG(value32, uap->options);
11718 /* Get the vnode for the file we are getting info on: */
11719 nameiflags = 0;
11720 //
11721 // if we come through fsctl() then the file is by definition not open.
11722 // therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
11723 // lest the caller mistakenly thinks the only open is their own (but in
11724 // reality it's someone elses).
11725 //
11726 if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
11727 return EINVAL;
11728 }
11729 if ((uap->options & FSOPT_NOFOLLOW) == 0) {
11730 nameiflags |= FOLLOW;
11731 }
11732 if (uap->cmd == FSIOC_FIRMLINK_CTL) {
11733 nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
11734 }
11735 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
11736 UIO_USERSPACE, uap->path, ctx);
11737 if ((error = namei(&nd))) {
11738 goto done;
11739 }
11740 vp = nd.ni_vp;
11741 nameidone(&nd);
11742
11743 #if CONFIG_MACF
11744 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
11745 if (error) {
11746 goto done;
11747 }
11748 #endif
11749
11750 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11751
11752 done:
11753 if (vp) {
11754 vnode_put(vp);
11755 }
11756 return error;
11757 }
11758 /* ARGSUSED */
11759 int
11760 ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
11761 {
11762 int error;
11763 vnode_t vp = NULL;
11764 vfs_context_t ctx = vfs_context_current();
11765 int fd = -1;
11766
11767 AUDIT_ARG(fd, uap->fd);
11768 AUDIT_ARG(cmd, (int)uap->cmd);
11769 AUDIT_ARG(value32, uap->options);
11770
11771 /* Get the vnode for the file we are getting info on: */
11772 if ((error = file_vnode(uap->fd, &vp))) {
11773 return error;
11774 }
11775 fd = uap->fd;
11776 if ((error = vnode_getwithref(vp))) {
11777 file_drop(fd);
11778 return error;
11779 }
11780
11781 #if CONFIG_MACF
11782 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
11783 file_drop(fd);
11784 vnode_put(vp);
11785 return error;
11786 }
11787 #endif
11788
11789 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
11790
11791 file_drop(fd);
11792
11793 /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
11794 if (vp) {
11795 vnode_put(vp);
11796 }
11797
11798 return error;
11799 }
11800 /* end of fsctl system call */
11801
/* Entitlement that grants access to the KAUTH_FILESEC_XATTR attribute. */
#define FILESEC_ACCESS_ENTITLEMENT \
	"com.apple.private.vfs.filesec-access"
11804
11805 static int
11806 xattr_entitlement_check(const char *attrname, vfs_context_t ctx, bool setting)
11807 {
11808 if (strcmp(attrname, KAUTH_FILESEC_XATTR) == 0) {
11809 /*
11810 * get: root and tasks with FILESEC_ACCESS_ENTITLEMENT.
11811 * set: only tasks with FILESEC_ACCESS_ENTITLEMENT.
11812 */
11813 if ((!setting && vfs_context_issuser(ctx)) ||
11814 IOTaskHasEntitlement(current_task(),
11815 FILESEC_ACCESS_ENTITLEMENT)) {
11816 return 0;
11817 }
11818 }
11819
11820 return EPERM;
11821 }
11822
/*
 * Retrieve the data of an extended attribute.
 *
 * getxattr(2), path-based.  With a destination buffer, *retval is the
 * number of bytes copied; without one (or with the legacy size == -1
 * probe, see below), *retval is the attribute's total size.
 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These options are for in-kernel callers only. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Protected xattrs require root or the filesec entitlement to read. */
	if (xattr_protected(attrname) &&
	    (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
		goto out;
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binaray compatibilty in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implemtation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying fileystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the kernel-wired allocation the FS will make (see above). */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	/* With auio == NULL this is a pure size query. */
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
11908
11909 /*
11910 * Retrieve the data of an extended attribute.
11911 */
11912 int
11913 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
11914 {
11915 vnode_t vp;
11916 char attrname[XATTR_MAXNAMELEN + 1];
11917 vfs_context_t ctx = vfs_context_current();
11918 uio_t auio = NULL;
11919 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11920 size_t attrsize = 0;
11921 size_t namelen;
11922 int error;
11923 char uio_buf[UIO_SIZEOF(1)];
11924
11925 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11926 return EINVAL;
11927 }
11928
11929 if ((error = file_vnode(uap->fd, &vp))) {
11930 return error;
11931 }
11932 if ((error = vnode_getwithref(vp))) {
11933 file_drop(uap->fd);
11934 return error;
11935 }
11936 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11937 if (error != 0) {
11938 goto out;
11939 }
11940 if (xattr_protected(attrname) &&
11941 (error = xattr_entitlement_check(attrname, ctx, false)) != 0) {
11942 goto out;
11943 }
11944 if (uap->value && uap->size > 0) {
11945 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
11946 &uio_buf[0], sizeof(uio_buf));
11947 uio_addiov(auio, uap->value, uap->size);
11948 }
11949
11950 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
11951 out:
11952 (void)vnode_put(vp);
11953 file_drop(uap->fd);
11954
11955 if (auio) {
11956 *retval = uap->size - uio_resid(auio);
11957 } else {
11958 *retval = (user_ssize_t)attrsize;
11959 }
11960 return error;
11961 }
11962
11963 /*
11964 * Set the data of an extended attribute.
11965 */
11966 int
11967 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
11968 {
11969 vnode_t vp;
11970 struct nameidata nd;
11971 char attrname[XATTR_MAXNAMELEN + 1];
11972 vfs_context_t ctx = vfs_context_current();
11973 uio_t auio = NULL;
11974 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11975 size_t namelen;
11976 u_int32_t nameiflags;
11977 int error;
11978 char uio_buf[UIO_SIZEOF(1)];
11979
11980 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11981 return EINVAL;
11982 }
11983
11984 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11985 if (error != 0) {
11986 if (error == EPERM) {
11987 /* if the string won't fit in attrname, copyinstr emits EPERM */
11988 return ENAMETOOLONG;
11989 }
11990 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11991 return error;
11992 }
11993 if (xattr_protected(attrname) &&
11994 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
11995 return error;
11996 }
11997 if (uap->size != 0 && uap->value == 0) {
11998 return EINVAL;
11999 }
12000 if (uap->size > INT_MAX) {
12001 return E2BIG;
12002 }
12003
12004 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12005 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
12006 if ((error = namei(&nd))) {
12007 return error;
12008 }
12009 vp = nd.ni_vp;
12010 nameidone(&nd);
12011
12012 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12013 &uio_buf[0], sizeof(uio_buf));
12014 uio_addiov(auio, uap->value, uap->size);
12015
12016 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
12017 #if CONFIG_FSE
12018 if (error == 0) {
12019 add_fsevent(FSE_XATTR_MODIFIED, ctx,
12020 FSE_ARG_VNODE, vp,
12021 FSE_ARG_DONE);
12022 }
12023 #endif
12024 vnode_put(vp);
12025 *retval = 0;
12026 return error;
12027 }
12028
12029 /*
12030 * Set the data of an extended attribute.
12031 */
12032 int
12033 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
12034 {
12035 vnode_t vp;
12036 char attrname[XATTR_MAXNAMELEN + 1];
12037 vfs_context_t ctx = vfs_context_current();
12038 uio_t auio = NULL;
12039 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12040 size_t namelen;
12041 int error;
12042 char uio_buf[UIO_SIZEOF(1)];
12043
12044 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12045 return EINVAL;
12046 }
12047
12048 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12049 if (error != 0) {
12050 if (error == EPERM) {
12051 /* if the string won't fit in attrname, copyinstr emits EPERM */
12052 return ENAMETOOLONG;
12053 }
12054 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
12055 return error;
12056 }
12057 if (xattr_protected(attrname) &&
12058 (error = xattr_entitlement_check(attrname, ctx, true)) != 0) {
12059 return error;
12060 }
12061 if (uap->size != 0 && uap->value == 0) {
12062 return EINVAL;
12063 }
12064 if (uap->size > INT_MAX) {
12065 return E2BIG;
12066 }
12067 if ((error = file_vnode(uap->fd, &vp))) {
12068 return error;
12069 }
12070 if ((error = vnode_getwithref(vp))) {
12071 file_drop(uap->fd);
12072 return error;
12073 }
12074 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
12075 &uio_buf[0], sizeof(uio_buf));
12076 uio_addiov(auio, uap->value, uap->size);
12077
12078 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
12079 #if CONFIG_FSE
12080 if (error == 0) {
12081 add_fsevent(FSE_XATTR_MODIFIED, ctx,
12082 FSE_ARG_VNODE, vp,
12083 FSE_ARG_DONE);
12084 }
12085 #endif
12086 vnode_put(vp);
12087 file_drop(uap->fd);
12088 *retval = 0;
12089 return error;
12090 }
12091
12092 /*
12093 * Remove an extended attribute.
12094 * XXX Code duplication here.
12095 */
12096 int
12097 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
12098 {
12099 vnode_t vp;
12100 struct nameidata nd;
12101 char attrname[XATTR_MAXNAMELEN + 1];
12102 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12103 vfs_context_t ctx = vfs_context_current();
12104 size_t namelen;
12105 u_int32_t nameiflags;
12106 int error;
12107
12108 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12109 return EINVAL;
12110 }
12111
12112 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12113 if (error != 0) {
12114 return error;
12115 }
12116 if (xattr_protected(attrname)) {
12117 return EPERM;
12118 }
12119 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12120 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
12121 if ((error = namei(&nd))) {
12122 return error;
12123 }
12124 vp = nd.ni_vp;
12125 nameidone(&nd);
12126
12127 error = vn_removexattr(vp, attrname, uap->options, ctx);
12128 #if CONFIG_FSE
12129 if (error == 0) {
12130 add_fsevent(FSE_XATTR_REMOVED, ctx,
12131 FSE_ARG_VNODE, vp,
12132 FSE_ARG_DONE);
12133 }
12134 #endif
12135 vnode_put(vp);
12136 *retval = 0;
12137 return error;
12138 }
12139
12140 /*
12141 * Remove an extended attribute.
12142 * XXX Code duplication here.
12143 */
12144 int
12145 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
12146 {
12147 vnode_t vp;
12148 char attrname[XATTR_MAXNAMELEN + 1];
12149 size_t namelen;
12150 int error;
12151 #if CONFIG_FSE
12152 vfs_context_t ctx = vfs_context_current();
12153 #endif
12154
12155 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12156 return EINVAL;
12157 }
12158
12159 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
12160 if (error != 0) {
12161 return error;
12162 }
12163 if (xattr_protected(attrname)) {
12164 return EPERM;
12165 }
12166 if ((error = file_vnode(uap->fd, &vp))) {
12167 return error;
12168 }
12169 if ((error = vnode_getwithref(vp))) {
12170 file_drop(uap->fd);
12171 return error;
12172 }
12173
12174 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
12175 #if CONFIG_FSE
12176 if (error == 0) {
12177 add_fsevent(FSE_XATTR_REMOVED, ctx,
12178 FSE_ARG_VNODE, vp,
12179 FSE_ARG_DONE);
12180 }
12181 #endif
12182 vnode_put(vp);
12183 file_drop(uap->fd);
12184 *retval = 0;
12185 return error;
12186 }
12187
12188 /*
12189 * Retrieve the list of extended attribute names.
12190 * XXX Code duplication here.
12191 */
12192 int
12193 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
12194 {
12195 vnode_t vp;
12196 struct nameidata nd;
12197 vfs_context_t ctx = vfs_context_current();
12198 uio_t auio = NULL;
12199 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12200 size_t attrsize = 0;
12201 u_int32_t nameiflags;
12202 int error;
12203 char uio_buf[UIO_SIZEOF(1)];
12204
12205 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12206 return EINVAL;
12207 }
12208
12209 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
12210 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
12211 if ((error = namei(&nd))) {
12212 return error;
12213 }
12214 vp = nd.ni_vp;
12215 nameidone(&nd);
12216 if (uap->namebuf != 0 && uap->bufsize > 0) {
12217 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
12218 &uio_buf[0], sizeof(uio_buf));
12219 uio_addiov(auio, uap->namebuf, uap->bufsize);
12220 }
12221
12222 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
12223
12224 vnode_put(vp);
12225 if (auio) {
12226 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12227 } else {
12228 *retval = (user_ssize_t)attrsize;
12229 }
12230 return error;
12231 }
12232
12233 /*
12234 * Retrieve the list of extended attribute names.
12235 * XXX Code duplication here.
12236 */
12237 int
12238 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
12239 {
12240 vnode_t vp;
12241 uio_t auio = NULL;
12242 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
12243 size_t attrsize = 0;
12244 int error;
12245 char uio_buf[UIO_SIZEOF(1)];
12246
12247 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
12248 return EINVAL;
12249 }
12250
12251 if ((error = file_vnode(uap->fd, &vp))) {
12252 return error;
12253 }
12254 if ((error = vnode_getwithref(vp))) {
12255 file_drop(uap->fd);
12256 return error;
12257 }
12258 if (uap->namebuf != 0 && uap->bufsize > 0) {
12259 auio = uio_createwithbuffer(1, 0, spacetype,
12260 UIO_READ, &uio_buf[0], sizeof(uio_buf));
12261 uio_addiov(auio, uap->namebuf, uap->bufsize);
12262 }
12263
12264 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
12265
12266 vnode_put(vp);
12267 file_drop(uap->fd);
12268 if (auio) {
12269 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
12270 } else {
12271 *retval = (user_ssize_t)attrsize;
12272 }
12273 return error;
12274 }
12275
/*
 * Resolve a (volfs_id, objid) pair to an absolute path.
 *
 * Busies the mount identified by `volfs_id`, vgets the vnode for `objid`
 * (falling back through union-mount covered volumes on ENOENT), and
 * builds the path into `buf` (at most `bufsize` <= PAGE_SIZE bytes).
 * On success, *pathlen holds the path length including the NUL.
 * build_path() races are retried a bounded number of times.
 */
static int
fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
    vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
{
	int error;
	struct mount *mp = NULL;
	vnode_t vp;
	int length;
	int bpflags;
	/* maximum number of times to retry build_path */
	unsigned int retries = 0x10;

	if (bufsize > PAGE_SIZE) {
		return EINVAL;
	}

	if (buf == NULL) {
		return ENOMEM;
	}

retry:
	/* Take a busy reference on the mount; dropped via vfs_unbusy below. */
	if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
		/* NOTE(review): the assignment to `error` here is a dead store. */
		error = ENOTSUP; /* unexpected failure */
		return ENOTSUP;
	}

unionget:
	if (objid == 2) {
		/* objid 2 conventionally means "the volume root" here. */
		struct vfs_attr vfsattr;
		int use_vfs_root = TRUE;

		VFSATTR_INIT(&vfsattr);
		VFSATTR_WANTED(&vfsattr, f_capabilities);
		/*
		 * Volume-group filesystems resolve id 2 through VFS_VGET
		 * (unless the caller asked for the real fsid semantics).
		 */
		if (!(options & FSOPT_ISREALFSID) &&
		    vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
		    VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
			if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
			    (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
				use_vfs_root = FALSE;
			}
		}

		if (use_vfs_root) {
			error = VFS_ROOT(mp, &vp, ctx);
		} else {
			error = VFS_VGET(mp, objid, &vp, ctx);
		}
	} else {
		error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
	}

	if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
		/*
		 * If the fileid isn't found and we're in a union
		 * mount volume, then see if the fileid is in the
		 * mounted-on volume.
		 */
		struct mount *tmp = mp;
		mp = vnode_mount(tmp->mnt_vnodecovered);
		vfs_unbusy(tmp);
		/* Busy the covered mount before retrying; give up if contended. */
		if (vfs_busy(mp, LK_NOWAIT) == 0) {
			goto unionget;
		}
	} else {
		vfs_unbusy(mp);
	}

	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_fsgetpath(ctx, vp);
	if (error) {
		vnode_put(vp);
		return error;
	}
#endif

	/* Obtain the absolute path to this vnode. */
	bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
	if (options & FSOPT_NOFIRMLINKPATH) {
		bpflags |= BUILDPATH_NO_FIRMLINK;
	}
	bpflags |= BUILDPATH_CHECK_MOVED;
	error = build_path(vp, buf, (int)bufsize, &length, bpflags, ctx);
	vnode_put(vp);

	if (error) {
		/* there was a race building the path, try a few more times */
		if (error == EAGAIN) {
			--retries;
			if (retries > 0) {
				goto retry;
			}

			error = ENOENT;
		}
		goto out;
	}

	AUDIT_ARG(text, buf);

	if (kdebug_debugid_enabled(VFS_LOOKUP) && length > 0) {
		/*
		 * Emit the (possibly truncated-from-the-left) path to ktrace.
		 * NOTE(review): `vp` has already been vnode_put() above; it is
		 * passed to kdebug_vfs_lookup purely as an identifier — confirm
		 * the tracing KPI never dereferences it.
		 */
		unsigned long path_words[NUMPARMS];
		size_t path_len = sizeof(path_words);

		if ((size_t)length < path_len) {
			memcpy((char *)path_words, buf, length);
			memset((char *)path_words + length, 0, path_len - length);

			path_len = length;
		} else {
			memcpy((char *)path_words, buf + (length - path_len), path_len);
		}

		kdebug_vfs_lookup(path_words, (int)path_len, vp,
		    KDBG_VFS_LOOKUP_FLAG_LOOKUP);
	}

	*pathlen = length; /* may be superseded by error */

out:
	return error;
}
12401
12402 /*
12403 * Obtain the full pathname of a file system object by id.
12404 */
12405 static int
12406 fsgetpath_extended(user_addr_t buf, user_size_t bufsize, user_addr_t user_fsid, uint64_t objid,
12407 uint32_t options, user_ssize_t *retval)
12408 {
12409 vfs_context_t ctx = vfs_context_current();
12410 fsid_t fsid;
12411 char *realpath;
12412 int length;
12413 int error;
12414
12415 if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
12416 return EINVAL;
12417 }
12418
12419 if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
12420 return error;
12421 }
12422 AUDIT_ARG(value32, fsid.val[0]);
12423 AUDIT_ARG(value64, objid);
12424 /* Restrict output buffer size for now. */
12425
12426 if (bufsize > PAGE_SIZE || bufsize <= 0) {
12427 return EINVAL;
12428 }
12429 realpath = kheap_alloc(KHEAP_TEMP, bufsize, Z_WAITOK | Z_ZERO);
12430 if (realpath == NULL) {
12431 return ENOMEM;
12432 }
12433
12434 error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
12435 options, &length);
12436
12437 if (error) {
12438 goto out;
12439 }
12440
12441 error = copyout((caddr_t)realpath, buf, length);
12442
12443 *retval = (user_ssize_t)length; /* may be superseded by error */
12444 out:
12445 kheap_free(KHEAP_TEMP, realpath, bufsize);
12446 return error;
12447 }
12448
/*
 * fsgetpath(2): resolve (fsid, objid) to an absolute path with no options.
 */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
12455
/*
 * fsgetpath_ext(2): like fsgetpath(2) but accepts FSOPT_* options
 * (FSOPT_NOFIRMLINKPATH, FSOPT_ISREALFSID).
 */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
12462
/*
 * Common routine to handle various flavors of statfs data heading out
 * to user space.
 *
 * Marshals *sfsp into either the 64-bit or 32-bit user-space statfs
 * layout and copies it out to bufp.  When partial_copy is set, the
 * trailing f_reserved3/f_reserved4 fields are omitted from the copyout
 * (legacy statfs() callers have a shorter structure).  On return,
 * *sizep (if non-NULL) is always set to the full native structure size,
 * regardless of how much was actually copied out.
 *
 * Returns: 0 Success
 * EFAULT
 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		/* Zero first so struct padding never leaks kernel memory. */
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		/* 64-bit user longs: all counts fit without scaling. */
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* Some mounts masquerade as a different fs type (e.g. for compatibility). */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Legacy callers don't have the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		/* Zero first so struct padding never leaks kernel memory. */
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = (short)mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
			/* Shift the count down, saturating at INT_MAX if it still won't fit. */
#define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			/* Inflate the blocksize by the same shift so total size stays honest. */
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = (int)lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* Some mounts masquerade as a different fs type (e.g. for compatibility). */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* Legacy callers don't have the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Always report the full native size, even on a partial copy. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
12591
/*
 * copy stat structure into user_stat structure.
 *
 * Field-by-field marshalling of the kernel's struct stat into the 64-bit
 * user-space layout.  No narrowing occurs; all fields copy at full width.
 */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	/* Zero the target first so padding never leaks kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamp member names depend on POSIX namespace visibility. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12631
/*
 * Copy the kernel's struct stat into the 32-bit user-space stat layout.
 * Time fields are explicitly narrowed to the 32-bit user types.
 */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	/* Zero the target first so padding never leaks kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamps narrow to 32-bit user types (member names depend on
	 * POSIX namespace visibility). */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12668
/*
 * copy stat64 structure into user_stat64 structure.
 *
 * Like munge_user64_stat(), plus the st_birthtime fields that only the
 * stat64 variants carry.
 */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	/* Zero the target first so padding never leaks kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamp member names depend on POSIX namespace visibility. */
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12712
/*
 * Copy the kernel's struct stat64 into the 32-bit user-space stat64
 * layout.  Time fields (including st_birthtime) are explicitly narrowed
 * to the 32-bit user types.
 */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	/* Zero the target first so padding never leaks kernel memory. */
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
#ifndef _POSIX_C_SOURCE
	/* Timestamps narrow to 32-bit user types (member names depend on
	 * POSIX namespace visibility). */
	usbp->st_atimespec.tv_sec = (user32_time_t)sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = (user32_long_t)sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = (user32_time_t)sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = (user32_long_t)sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = (user32_time_t)sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = (user32_long_t)sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = (user32_time_t)sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = (user32_long_t)sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12753
12754 /*
12755 * Purge buffer cache for simulating cold starts
12756 */
12757 static int
12758 vnode_purge_callback(struct vnode *vp, __unused void *cargs)
12759 {
12760 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
12761
12762 return VNODE_RETURNED;
12763 }
12764
12765 static int
12766 vfs_purge_callback(mount_t mp, __unused void * arg)
12767 {
12768 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
12769
12770 return VFS_RETURNED;
12771 }
12772
12773 int
12774 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
12775 {
12776 if (!kauth_cred_issuser(kauth_cred_get())) {
12777 return EPERM;
12778 }
12779
12780 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
12781
12782 return 0;
12783 }
12784
12785 /*
12786 * gets the vnode associated with the (unnamed) snapshot directory
12787 * for a Filesystem. The snapshot directory vnode is returned with
12788 * an iocount on it.
12789 */
12790 int
12791 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
12792 {
12793 return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
12794 }
12795
/*
 * Get the snapshot vnode.
 *
 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
 * needs nameidone() on ndp.
 *
 * If the snapshot vnode exists it is returned in ndp->ni_vp.
 *
 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
 * not needed.
 *
 * dirfd names the root of the target filesystem; name is the user-space
 * snapshot name; op/pathop parameterize the namei() lookup so the same
 * routine serves create, delete, and lookup callers.
 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	/* Translate dirfd into a vnode; returns it with an iocount held. */
	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* dirfd must refer to the filesystem root. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	/* Grab the (unnamed) snapshot directory, with an iocount. */
	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping before name_len means one was found. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* MAC hooks exist for create/delete only; other ops pass through. */
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	zfree(ZV_NAMEI, name_buf);
out:
	/* On any error, drop both iocounts so callers need no cleanup. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
12909
/*
 * create a filesystem snapshot (for supporting filesystems)
 *
 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
 * We get to the (unnamed) snapshot directory vnode and create the vnode
 * for the snapshot in it.
 *
 * Restrictions:
 *
 *    a) Passed in name for snapshot cannot have slashes.
 *    b) name can't be "." or ".."
 *
 * Since this requires superuser privileges, vnode_authorize calls are not
 * made.
 */
static int __attribute__((noinline))
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kheap_alloc(KHEAP_TEMP, sizeof(*ndp), Z_WAITOK);

	/* On success: iocounts held on rvp and snapdvp, ndp needs nameidone(). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE,
	    OP_LINK, ctx);
	if (error) {
		goto out;
	}

	if (ndp->ni_vp) {
		/* Lookup found a vnode: a snapshot by this name already exists. */
		vnode_put(ndp->ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr *vap;
		vnode_t vp = NULLVP;

		vap = kheap_alloc(KHEAP_TEMP, sizeof(*vap), Z_WAITOK);

		/* The snapshot appears as a regular file with mode 0. */
		VATTR_INIT(vap);
		VATTR_SET(vap, va_type, VREG);
		VATTR_SET(vap, va_mode, 0);

		/* Superuser-only path: skip authorization and ACL inheritance. */
		error = vn_create(snapdvp, &vp, ndp, vap,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}

		kheap_free(KHEAP_TEMP, vap, sizeof(*vap));
	}

	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp));

	return error;
}
12971
/*
 * Delete a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * delete the snapshot.
 */
static int __attribute__((noinline))
snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata *ndp;

	/* nameidata is too large for the kernel stack; heap-allocate it. */
	ndp = kheap_alloc(KHEAP_TEMP, sizeof(*ndp), Z_WAITOK);

	/* DELETE lookup: on success ndp->ni_vp is the snapshot vnode. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}

	/* Skip namespace events: snapshots aren't regular directory entries. */
	error = VNOP_REMOVE(snapdvp, ndp->ni_vp, &ndp->ni_cnd,
	    VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);

	vnode_put(ndp->ni_vp);
	nameidone(ndp);
	vnode_put(snapdvp);
	vnode_put(rvp);
out:
	kheap_free(KHEAP_TEMP, ndp, sizeof(*ndp));

	return error;
}
13006
/*
 * Revert a filesystem to a snapshot
 *
 * Marks the filesystem to revert to the given snapshot on next mount.
 *
 * Tries the mount-level VFSIOC_REVERT_SNAPSHOT ioctl first; if that
 * fails for any reason, falls back to a VNOP_IOCTL directly on the
 * snapshot vnode (APFSIOC_REVERT_TO_SNAPSHOT).
 */
static int __attribute__((noinline))
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Package the name as a componentname, as the FS ioctl expects. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-resolve the snapshot vnode for the direct-ioctl fallback. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
13095
/*
 * rename a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * rename the snapshot. This is a very specialised (and simple) case of
 * rename(2) (which has to deal with a lot more complications). It differs
 * slightly from rename(2) in that EEXIST is returned if the new name exists.
 */
static int __attribute__((noinline))
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	__rename_data = kheap_alloc(KHEAP_TEMP, sizeof(*__rename_data), Z_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/* Resolve the existing snapshot (DELETE lookup: we're unlinking "old"). */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	newname_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is suppossed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/'; stopping before name_len means one was found. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to "new" is effectively creating a snapshot by that name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/* Look up the destination name in the snapshot directory. */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

out2:
	nameidone(tond);
out1:
	zfree(ZV_NAMEI, newname_buf);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	kheap_free(KHEAP_TEMP, __rename_data, sizeof(*__rename_data));
	return error;
}
13198
/*
 * Mount a Filesystem snapshot
 *
 * get the vnode for the unnamed snapshot directory and the snapshot and
 * mount the snapshot.
 */
static int __attribute__((noinline))
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	__snapshot_mount_data = kheap_alloc(KHEAP_TEMP,
	    sizeof(*__snapshot_mount_data), Z_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/* Resolve the snapshot vnode to be mounted. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Bail if the source filesystem has gone away underneath us. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Refuse to cover the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/* Hand the source mount and snapshot name to mount_common(). */
	smnt_data.sm_mp  = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);

out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	kheap_free(KHEAP_TEMP, __snapshot_mount_data,
	    sizeof(*__snapshot_mount_data));
	return error;
}
13282
/*
 * Root from a snapshot of the filesystem
 *
 * Marks the filesystem to root from the given snapshot on next boot.
 */
static int __attribute__((noinline))
snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_root_args root_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	/* Copy the snapshot name in from user space. */
	name_buf = zalloc_flags(ZV_NAMEI, Z_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		vnode_put(rvp);
		return error;
	}

	// XXX MAC checks ?

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		zfree(ZV_NAMEI, name_buf);
		return error;
	}

	/* Package the name as a componentname, as the FS ioctl expects. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	root_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);

	mount_iterdrop(mp);
	zfree(ZV_NAMEI, name_buf);

	return error;
}
13343
/*
 * FS snapshot operations dispatcher
 *
 * fs_snapshot() syscall entry point.  Checks the PRIV_VFS_SNAPSHOT
 * privilege, enforces write authorization on the backing device for all
 * ops except mount, then dispatches on uap->op to the per-op helpers.
 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot ops require the VFS snapshot privilege. */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations,
	 * or if trying to root from snapshot.
	 */
	if (uap->op != SNAPSHOT_OP_MOUNT) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/* Non-superusers must be able to write the backing device. */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation helper. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		error = ENOSYS;
	}

	return error;
}