]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_syscalls.c
a0a04deb8b43910efb399a0595a1ed2051891b57
[apple/xnu.git] / bsd / vfs / vfs_syscalls.c
1 /*
2 * Copyright (c) 1995-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/sysctl.h>
98 #include <sys/xattr.h>
99 #include <sys/fcntl.h>
100 #include <sys/fsctl.h>
101 #include <sys/ubc_internal.h>
102 #include <sys/disk.h>
103 #include <sys/content_protection.h>
104 #include <sys/clonefile.h>
105 #include <sys/snapshot.h>
106 #include <sys/priv.h>
107 #include <sys/fsgetpath.h>
108 #include <machine/cons.h>
109 #include <machine/limits.h>
110 #include <miscfs/specfs/specdev.h>
111
112 #include <vfs/vfs_disk_conditioner.h>
113
114 #include <security/audit/audit.h>
115 #include <bsm/audit_kevents.h>
116
117 #include <mach/mach_types.h>
118 #include <kern/kern_types.h>
119 #include <kern/kalloc.h>
120 #include <kern/task.h>
121
122 #include <vm/vm_pageout.h>
123 #include <vm/vm_protos.h>
124
125 #include <libkern/OSAtomic.h>
126 #include <pexpert/pexpert.h>
127 #include <IOKit/IOBSD.h>
128
129 // deps for MIG call
130 #include <kern/host.h>
131 #include <kern/ipc_misc.h>
132 #include <mach/host_priv.h>
133 #include <mach/vfs_nspace.h>
134 #include <os/log.h>
135
136 #include <nfs/nfs_conf.h>
137
138 #if ROUTEFS
139 #include <miscfs/routefs/routefs.h>
140 #endif /* ROUTEFS */
141
142 #if CONFIG_MACF
143 #include <security/mac.h>
144 #include <security/mac_framework.h>
145 #endif
146
147 #if CONFIG_FSE
148 #define GET_PATH(x) \
149 (x) = get_pathbuff();
150 #define RELEASE_PATH(x) \
151 release_pathbuff(x);
152 #else
153 #define GET_PATH(x) \
154 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
155 #define RELEASE_PATH(x) \
156 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
157 #endif /* CONFIG_FSE */
158
159 #ifndef HFS_GET_BOOT_INFO
160 #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004)
161 #endif
162
163 #ifndef HFS_SET_BOOT_INFO
164 #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005)
165 #endif
166
167 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
168 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
169 #endif
170
171 extern void disk_conditioner_unmount(mount_t mp);
172
173 /* struct for checkdirs iteration */
174 struct cdirargs {
175 vnode_t olddp;
176 vnode_t newdp;
177 };
178 /* callback for checkdirs iteration */
179 static int checkdirs_callback(proc_t p, void * arg);
180
181 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
182 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
183 void enablequotas(struct mount *mp, vfs_context_t ctx);
184 static int getfsstat_callback(mount_t mp, void * arg);
185 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
186 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
187 static int sync_callback(mount_t, void *);
188 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
189 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
190 boolean_t partial_copy);
191 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
192 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
193 struct componentname *cnp, user_addr_t fsmountargs,
194 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
195 vfs_context_t ctx);
196 void vfs_notify_mount(vnode_t pdvp);
197
198 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
199
200 struct fd_vn_data * fg_vn_data_alloc(void);
201
202 /*
203 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
204 * Concurrent lookups (or lookups by ids) on hard links can cause the
205 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
206 * does) to return ENOENT as the path cannot be returned from the name cache
207 * alone. We have no option but to retry and hope to get one namei->reverse path
208 * generation done without an intervening lookup, lookup by id on the hard link
209 * item. This is only an issue for MAC hooks which cannot reenter the filesystem
210 * which currently are the MAC hooks for rename, unlink and rmdir.
211 */
212 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
213
214 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg,
215 int unlink_flags);
216
217 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, uint32_t options, int *);
218
219 #ifdef CONFIG_IMGSRC_ACCESS
220 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
221 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
222 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
223 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
224 static void mount_end_update(mount_t mp);
225 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
226 #endif /* CONFIG_IMGSRC_ACCESS */
227
228 #if CONFIG_LOCKERBOOT
229 int mount_locker_protoboot(const char *fsname, const char *mntpoint,
230 const char *pbdevpath);
231 #endif
232
233 //snapshot functions
234 #if CONFIG_MNT_ROOTSNAP
235 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
236 #else
237 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
238 #endif
239
240 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
241
242 __private_extern__
243 int sync_internal(void);
244
245 __private_extern__
246 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
247
248 extern lck_grp_t *fd_vn_lck_grp;
249 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
250 extern lck_attr_t *fd_vn_lck_attr;
251
252 /*
253 * incremented each time a mount or unmount operation occurs
254 * used to invalidate the cached value of the rootvp in the
255 * mount structure utilized by cache_lookup_path
256 */
257 uint32_t mount_generation = 0;
258
259 /* counts number of mount and unmount operations */
260 unsigned int vfs_nummntops = 0;
261
262 extern const struct fileops vnops;
263 #if CONFIG_APPLEDOUBLE
264 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
265 #endif /* CONFIG_APPLEDOUBLE */
266
267 /*
268 * Virtual File System System Calls
269 */
270
271 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
272 /*
273 * Private in-kernel mounting spi (NFS only, not exported)
274 */
275 __private_extern__
276 boolean_t
277 vfs_iskernelmount(mount_t mp)
278 {
279 return (mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE;
280 }
281
/*
 * kernel_mount:
 * In-kernel mount entry point (NFS/devfs/routefs and other kernel-initiated
 * mounts; not exported to user space).
 *
 * Parameters:
 *	fstype		file system type (vfs name)
 *	pvp		parent of the vnode to be covered; ignored and looked
 *			up here when vp is NULLVP
 *	vp		vnode to be covered, or NULLVP to resolve it from path
 *	path		mount-on path, in kernel address space
 *	data		file-system-specific mount arguments
 *	datalen		unused
 *	syscall_flags	generic MNT_* mount flags
 *	kern_flags	KERNEL_MOUNT_* flags controlling in-kernel behavior
 *	ctx		caller's VFS context
 *
 * Returns:	0 on success, an errno value on failure.
 *
 * When vp is supplied by the caller, the caller keeps responsibility for
 * its references; when resolved here, the iocounts taken by namei() are
 * dropped before returning.
 */
__private_extern__
int
kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
    void *data, __unused size_t datalen, int syscall_flags, uint32_t kern_flags, vfs_context_t ctx)
{
	struct nameidata nd;
	boolean_t did_namei;
	int error;

	/*
	 * Always initialize the nameidata: even when the caller supplies vp,
	 * mount_common() consumes nd.ni_cnd (the component name), which is
	 * filled in manually in the else-branch below.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);

	/*
	 * Get the vnode to be covered if it's not supplied
	 */
	if (vp == NULLVP) {
		error = namei(&nd);
		if (error) {
			/* Only log for the mount-by-role/snapshot cases callers care about */
			if (kern_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_VMVOL | KERNEL_MOUNT_DATAVOL)) {
				printf("failed to locate mount-on path: %s ", path);
			}
			return error;
		}
		vp = nd.ni_vp;
		pvp = nd.ni_dvp;
		did_namei = TRUE;
	} else {
		char *pnbuf = CAST_DOWN(char *, path);

		/* hand the caller-provided path to mount_common() via the componentname */
		nd.ni_cnd.cn_pnbuf = pnbuf;
		nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
		did_namei = FALSE;
	}

	error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
	    syscall_flags, kern_flags, NULL, TRUE, ctx);

	if (did_namei) {
		/* drop the iocounts taken by namei() and release its path buffer */
		vnode_put(vp);
		vnode_put(pvp);
		nameidone(&nd);
	}

	return error;
}
327 #endif /* CONFIG_NFS_CLIENT || DEVFS */
328
329 /*
330 * Mount a file system.
331 */
332 /* ARGSUSED */
333 int
334 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
335 {
336 struct __mac_mount_args muap;
337
338 muap.type = uap->type;
339 muap.path = uap->path;
340 muap.flags = uap->flags;
341 muap.data = uap->data;
342 muap.mac_p = USER_ADDR_NULL;
343 return __mac_mount(p, &muap, retval);
344 }
345
346 int
347 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
348 {
349 struct componentname cn;
350 vfs_context_t ctx = vfs_context_current();
351 size_t dummy = 0;
352 int error;
353 int flags = uap->flags;
354 char fstypename[MFSNAMELEN];
355 char *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
356 vnode_t pvp;
357 vnode_t vp;
358
359 AUDIT_ARG(fd, uap->fd);
360 AUDIT_ARG(fflags, flags);
361 /* fstypename will get audited by mount_common */
362
363 /* Sanity check the flags */
364 if (flags & (MNT_IMGSRC_BY_INDEX | MNT_ROOTFS)) {
365 return ENOTSUP;
366 }
367
368 if (flags & MNT_UNION) {
369 return EPERM;
370 }
371
372 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
373 if (error) {
374 return error;
375 }
376
377 if ((error = file_vnode(uap->fd, &vp)) != 0) {
378 return error;
379 }
380
381 if ((error = vnode_getwithref(vp)) != 0) {
382 file_drop(uap->fd);
383 return error;
384 }
385
386 pvp = vnode_getparent(vp);
387 if (pvp == NULL) {
388 vnode_put(vp);
389 file_drop(uap->fd);
390 return EINVAL;
391 }
392
393 memset(&cn, 0, sizeof(struct componentname));
394 MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
395 cn.cn_pnlen = MAXPATHLEN;
396
397 if ((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
398 FREE(cn.cn_pnbuf, M_TEMP);
399 vnode_put(pvp);
400 vnode_put(vp);
401 file_drop(uap->fd);
402 return error;
403 }
404
405 error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
406
407 FREE(cn.cn_pnbuf, M_TEMP);
408 vnode_put(pvp);
409 vnode_put(vp);
410 file_drop(uap->fd);
411
412 return error;
413 }
414
/*
 * Announce a new mount: broadcast a VQ_MOUNT vfs event (no specific mount,
 * hence NULL) and post a NOTE_WRITE knote on the parent directory of the
 * covered vnode so kqueue watchers of that directory see the change.
 */
void
vfs_notify_mount(vnode_t pdvp)
{
	vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
	lock_vnode_and_post(pdvp, NOTE_WRITE);
}
421
422 /*
423 * __mac_mount:
424 * Mount a file system taking into account MAC label behavior.
425 * See mount(2) man page for more information
426 *
427 * Parameters: p Process requesting the mount
428 * uap User argument descriptor (see below)
429 * retval (ignored)
430 *
431 * Indirect: uap->type Filesystem type
432 * uap->path Path to mount
433 * uap->data Mount arguments
434 * uap->mac_p MAC info
435 * uap->flags Mount flags
436 *
437 *
438 * Returns: 0 Success
439 * !0 Not success
440 */
/*
 * Latched TRUE the first time something attempts to (re)mount the root
 * file system read/write; consulted by the CHECK_CS_VALIDATION_BITMAP
 * code to disable the codesign bitmap optimization (see 7392553 below).
 */
boolean_t root_fs_upgrade_try = FALSE;

int
__mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
{
	vnode_t pvp = NULL;
	vnode_t vp = NULL;
	int need_nameidone = 0;
	vfs_context_t ctx = vfs_context_current();
	char fstypename[MFSNAMELEN];
	struct nameidata nd;
	size_t dummy = 0;
	char *labelstr = NULL;
	int flags = uap->flags;
	int error;
#if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
	boolean_t is_64bit = IS_64BIT_PROCESS(p);
#else
#pragma unused(p)
#endif
	/*
	 * Get the fs type name from user space
	 */
	error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
	if (error) {
		return error;
	}

	/*
	 * Get the vnode to be covered (and its parent, via WANTPARENT);
	 * both carry iocounts that are dropped at 'out'.
	 */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		goto out;
	}
	need_nameidone = 1;
	vp = nd.ni_vp;
	pvp = nd.ni_dvp;

#ifdef CONFIG_IMGSRC_ACCESS
	/* Mounting image source cannot be batched with other operations */
	if (flags == MNT_IMGSRC_BY_INDEX) {
		/*
		 * Note: exact equality is intentional — MNT_IMGSRC_BY_INDEX must
		 * be the only flag set. The final argument is therefore trivially
		 * TRUE here; it is spelled out to match the callee's contract.
		 */
		error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
		    ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

#if CONFIG_MACF
	/*
	 * Get the label string (if any) from user space.
	 * The user_mac struct layout differs for 32/64-bit callers, so copy
	 * in the appropriately-sized variant and normalize into 'mac'.
	 */
	if (uap->mac_p != USER_ADDR_NULL) {
		struct user_mac mac;
		size_t ulen = 0;

		if (is_64bit) {
			struct user64_mac mac64;
			error = copyin(uap->mac_p, &mac64, sizeof(mac64));
			mac.m_buflen = mac64.m_buflen;
			mac.m_string = mac64.m_string;
		} else {
			struct user32_mac mac32;
			error = copyin(uap->mac_p, &mac32, sizeof(mac32));
			mac.m_buflen = mac32.m_buflen;
			mac.m_string = mac32.m_string;
		}
		if (error) {
			goto out;
		}
		/* buflen must cover at least one char plus the NUL, and be bounded */
		if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
		    (mac.m_buflen < 2)) {
			error = EINVAL;
			goto out;
		}
		/* labelstr is freed at 'out' on both success and failure paths */
		MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
		error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
		if (error) {
			goto out;
		}
		AUDIT_ARG(mac_string, labelstr);
	}
#endif /* CONFIG_MACF */

	AUDIT_ARG(fflags, flags);

#if SECURE_KERNEL
	if (flags & MNT_UNION) {
		/* No union mounts on release kernels */
		error = EPERM;
		goto out;
	}
#endif

	/* Special-case mounts whose target is the root of the root file system */
	if ((vp->v_flag & VROOT) &&
	    (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		if (!(flags & MNT_UNION)) {
			/* a plain mount on '/' is implicitly an update mount */
			flags |= MNT_UPDATE;
		} else {
			/*
			 * For a union mount on '/', treat it as fresh
			 * mount instead of update.
			 * Otherwise, union mouting on '/' used to panic the
			 * system before, since mnt_vnodecovered was found to
			 * be NULL for '/' which is required for unionlookup
			 * after it gets ENOENT on union mount.
			 */
			flags = (flags & ~(MNT_UPDATE));
		}

#if SECURE_KERNEL
		if ((flags & MNT_RDONLY) == 0) {
			/* Release kernels are not allowed to mount "/" as rw */
			error = EPERM;
			goto out;
		}
#endif
		/*
		 * See 7392553 for more details on why this check exists.
		 * Suffice to say: If this check is ON and something tries
		 * to mount the rootFS RW, we'll turn off the codesign
		 * bitmap optimization.
		 */
#if CHECK_CS_VALIDATION_BITMAP
		if ((flags & MNT_RDONLY) == 0) {
			root_fs_upgrade_try = TRUE;
		}
#endif
	}

	error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
	    labelstr, FALSE, ctx);

out:
	/* Common exit: release label buffer, namei iocounts, and nameidata state */

#if CONFIG_MACF
	if (labelstr) {
		FREE(labelstr, M_MACTEMP);
	}
#endif /* CONFIG_MACF */

	if (vp) {
		vnode_put(vp);
	}
	if (pvp) {
		vnode_put(pvp);
	}
	if (need_nameidone) {
		nameidone(&nd);
	}

	return error;
}
596
597 /*
598 * common mount implementation (final stage of mounting)
599 *
600 * Arguments:
601 * fstypename file system type (ie it's vfs name)
602 * pvp parent of covered vnode
603 * vp covered vnode
604 * cnp component name (ie path) of covered vnode
605 * flags generic mount flags
606 * fsmountargs file system specific data
607 * labelstr optional MAC label
608 * kernelmount TRUE for mounts initiated from inside the kernel
609 * ctx caller's context
610 */
611 static int
612 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
613 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
614 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
615 {
616 #if !CONFIG_MACF
617 #pragma unused(labelstr)
618 #endif
619 struct vnode *devvp = NULLVP;
620 struct vnode *device_vnode = NULLVP;
621 #if CONFIG_MACF
622 struct vnode *rvp;
623 #endif
624 struct mount *mp;
625 struct vfstable *vfsp = (struct vfstable *)0;
626 struct proc *p = vfs_context_proc(ctx);
627 int error, flag = 0;
628 user_addr_t devpath = USER_ADDR_NULL;
629 int ronly = 0;
630 int mntalloc = 0;
631 boolean_t vfsp_ref = FALSE;
632 boolean_t is_rwlock_locked = FALSE;
633 boolean_t did_rele = FALSE;
634 boolean_t have_usecount = FALSE;
635
636 #if CONFIG_ROSV_STARTUP || CONFIG_MOUNT_VM
637 /* Check for mutually-exclusive flag bits */
638 uint32_t checkflags = (internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL));
639 int bitcount = 0;
640 while (checkflags != 0) {
641 checkflags &= (checkflags - 1);
642 bitcount++;
643 }
644
645 if (bitcount > 1) {
646 //not allowed to request multiple mount-by-role flags
647 error = EINVAL;
648 goto out1;
649 }
650 #endif
651
652 /*
653 * Process an update for an existing mount
654 */
655 if (flags & MNT_UPDATE) {
656 if ((vp->v_flag & VROOT) == 0) {
657 error = EINVAL;
658 goto out1;
659 }
660 mp = vp->v_mount;
661
662 /* unmount in progress return error */
663 mount_lock_spin(mp);
664 if (mp->mnt_lflag & MNT_LUNMOUNT) {
665 mount_unlock(mp);
666 error = EBUSY;
667 goto out1;
668 }
669 mount_unlock(mp);
670 lck_rw_lock_exclusive(&mp->mnt_rwlock);
671 is_rwlock_locked = TRUE;
672 /*
673 * We only allow the filesystem to be reloaded if it
674 * is currently mounted read-only.
675 */
676 if ((flags & MNT_RELOAD) &&
677 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
678 error = ENOTSUP;
679 goto out1;
680 }
681
682 /*
683 * If content protection is enabled, update mounts are not
684 * allowed to turn it off.
685 */
686 if ((mp->mnt_flag & MNT_CPROTECT) &&
687 ((flags & MNT_CPROTECT) == 0)) {
688 error = EINVAL;
689 goto out1;
690 }
691
692 /*
693 * can't turn off MNT_REMOVABLE either but it may be an unexpected
694 * failure to return an error for this so we'll just silently
695 * add it if it is not passed in.
696 */
697 if ((mp->mnt_flag & MNT_REMOVABLE) &&
698 ((flags & MNT_REMOVABLE) == 0)) {
699 flags |= MNT_REMOVABLE;
700 }
701
702 #ifdef CONFIG_IMGSRC_ACCESS
703 /* Can't downgrade the backer of the root FS */
704 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
705 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
706 error = ENOTSUP;
707 goto out1;
708 }
709 #endif /* CONFIG_IMGSRC_ACCESS */
710
711 /*
712 * Only root, or the user that did the original mount is
713 * permitted to update it.
714 */
715 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
716 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
717 goto out1;
718 }
719 #if CONFIG_MACF
720 error = mac_mount_check_remount(ctx, mp);
721 if (error != 0) {
722 goto out1;
723 }
724 #endif
725 /*
726 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
727 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
728 */
729 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
730 flags |= MNT_NOSUID | MNT_NODEV;
731 if (mp->mnt_flag & MNT_NOEXEC) {
732 flags |= MNT_NOEXEC;
733 }
734 }
735 flag = mp->mnt_flag;
736
737
738
739 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
740
741 vfsp = mp->mnt_vtable;
742 goto update;
743 } // MNT_UPDATE
744
745 /*
746 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
747 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
748 */
749 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
750 flags |= MNT_NOSUID | MNT_NODEV;
751 if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
752 flags |= MNT_NOEXEC;
753 }
754 }
755
756 /* XXXAUDIT: Should we capture the type on the error path as well? */
757 AUDIT_ARG(text, fstypename);
758 mount_list_lock();
759 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
760 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
761 vfsp->vfc_refcount++;
762 vfsp_ref = TRUE;
763 break;
764 }
765 }
766 mount_list_unlock();
767 if (vfsp == NULL) {
768 error = ENODEV;
769 goto out1;
770 }
771
772 /*
773 * VFC_VFSLOCALARGS is not currently supported for kernel mounts,
774 * except in ROSV configs.
775 */
776 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) &&
777 ((internal_flags & (KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL)) == 0)) {
778 error = EINVAL; /* unsupported request */
779 goto out1;
780 }
781
782 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
783 if (error != 0) {
784 goto out1;
785 }
786
787 /*
788 * Allocate and initialize the filesystem (mount_t)
789 */
790 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
791 M_MOUNT, M_WAITOK);
792 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
793 mntalloc = 1;
794
795 /* Initialize the default IO constraints */
796 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
797 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
798 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
799 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
800 mp->mnt_devblocksize = DEV_BSIZE;
801 mp->mnt_alignmentmask = PAGE_MASK;
802 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
803 mp->mnt_ioscale = 1;
804 mp->mnt_ioflags = 0;
805 mp->mnt_realrootvp = NULLVP;
806 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
807
808 TAILQ_INIT(&mp->mnt_vnodelist);
809 TAILQ_INIT(&mp->mnt_workerqueue);
810 TAILQ_INIT(&mp->mnt_newvnodes);
811 mount_lock_init(mp);
812 lck_rw_lock_exclusive(&mp->mnt_rwlock);
813 is_rwlock_locked = TRUE;
814 mp->mnt_op = vfsp->vfc_vfsops;
815 mp->mnt_vtable = vfsp;
816 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
817 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
818 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
819 do {
820 int pathlen = MAXPATHLEN;
821
822 if (vn_getpath_ext(vp, pvp, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER)) {
823 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
824 }
825 } while (0);
826 mp->mnt_vnodecovered = vp;
827 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
828 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
829 mp->mnt_devbsdunit = 0;
830
831 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
832 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
833
834 #if CONFIG_NFS_CLIENT || DEVFS || ROUTEFS
835 if (kernelmount) {
836 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
837 }
838 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) {
839 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
840 }
841 #endif /* CONFIG_NFS_CLIENT || DEVFS */
842
843 update:
844
845 /*
846 * Set the mount level flags.
847 */
848 if (flags & MNT_RDONLY) {
849 mp->mnt_flag |= MNT_RDONLY;
850 } else if (mp->mnt_flag & MNT_RDONLY) {
851 // disallow read/write upgrades of file systems that
852 // had the TYPENAME_OVERRIDE feature set.
853 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
854 error = EPERM;
855 goto out1;
856 }
857 mp->mnt_kern_flag |= MNTK_WANTRDWR;
858 }
859 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
860 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
861 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
862 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
863 MNT_QUARANTINE | MNT_CPROTECT);
864
865 #if SECURE_KERNEL
866 #if !CONFIG_MNT_SUID
867 /*
868 * On release builds of iOS based platforms, always enforce NOSUID on
869 * all mounts. We do this here because we can catch update mounts as well as
870 * non-update mounts in this case.
871 */
872 mp->mnt_flag |= (MNT_NOSUID);
873 #endif
874 #endif
875
876 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
877 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
878 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
879 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | MNT_STRICTATIME |
880 MNT_QUARANTINE | MNT_CPROTECT);
881
882 #if CONFIG_MACF
883 if (flags & MNT_MULTILABEL) {
884 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
885 error = EINVAL;
886 goto out1;
887 }
888 mp->mnt_flag |= MNT_MULTILABEL;
889 }
890 #endif
891 /*
892 * Process device path for local file systems if requested
893 */
894 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
895 !(internal_flags & (KERNEL_MOUNT_SNAPSHOT | KERNEL_MOUNT_DATAVOL | KERNEL_MOUNT_VMVOL))) {
896 //snapshot, vm, datavolume mounts are special
897 if (vfs_context_is64bit(ctx)) {
898 if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
899 goto out1;
900 }
901 fsmountargs += sizeof(devpath);
902 } else {
903 user32_addr_t tmp;
904 if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
905 goto out1;
906 }
907 /* munge into LP64 addr */
908 devpath = CAST_USER_ADDR_T(tmp);
909 fsmountargs += sizeof(tmp);
910 }
911
912 /* Lookup device and authorize access to it */
913 if ((devpath)) {
914 struct nameidata nd;
915
916 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
917 if ((error = namei(&nd))) {
918 goto out1;
919 }
920
921 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
922 devvp = nd.ni_vp;
923
924 nameidone(&nd);
925
926 if (devvp->v_type != VBLK) {
927 error = ENOTBLK;
928 goto out2;
929 }
930 if (major(devvp->v_rdev) >= nblkdev) {
931 error = ENXIO;
932 goto out2;
933 }
934 /*
935 * If mount by non-root, then verify that user has necessary
936 * permissions on the device.
937 */
938 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
939 mode_t accessmode = KAUTH_VNODE_READ_DATA;
940
941 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
942 accessmode |= KAUTH_VNODE_WRITE_DATA;
943 }
944 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) {
945 goto out2;
946 }
947 }
948 }
949 /* On first mount, preflight and open device */
950 if (devpath && ((flags & MNT_UPDATE) == 0)) {
951 if ((error = vnode_ref(devvp))) {
952 goto out2;
953 }
954 /*
955 * Disallow multiple mounts of the same device.
956 * Disallow mounting of a device that is currently in use
957 * (except for root, which might share swap device for miniroot).
958 * Flush out any old buffers remaining from a previous use.
959 */
960 if ((error = vfs_mountedon(devvp))) {
961 goto out3;
962 }
963
964 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
965 error = EBUSY;
966 goto out3;
967 }
968 if ((error = VNOP_FSYNC(devvp, MNT_WAIT, ctx))) {
969 error = ENOTBLK;
970 goto out3;
971 }
972 if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0))) {
973 goto out3;
974 }
975
976 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
977 #if CONFIG_MACF
978 error = mac_vnode_check_open(ctx,
979 devvp,
980 ronly ? FREAD : FREAD | FWRITE);
981 if (error) {
982 goto out3;
983 }
984 #endif /* MAC */
985 if ((error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, ctx))) {
986 goto out3;
987 }
988
989 mp->mnt_devvp = devvp;
990 device_vnode = devvp;
991 } else if ((mp->mnt_flag & MNT_RDONLY) &&
992 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
993 (device_vnode = mp->mnt_devvp)) {
994 dev_t dev;
995 int maj;
996 /*
997 * If upgrade to read-write by non-root, then verify
998 * that user has necessary permissions on the device.
999 */
1000 vnode_getalways(device_vnode);
1001
1002 if (suser(vfs_context_ucred(ctx), NULL) &&
1003 (error = vnode_authorize(device_vnode, NULL,
1004 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
1005 ctx)) != 0) {
1006 vnode_put(device_vnode);
1007 goto out2;
1008 }
1009
1010 /* Tell the device that we're upgrading */
1011 dev = (dev_t)device_vnode->v_rdev;
1012 maj = major(dev);
1013
1014 if ((u_int)maj >= (u_int)nblkdev) {
1015 panic("Volume mounted on a device with invalid major number.");
1016 }
1017
1018 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
1019 vnode_put(device_vnode);
1020 device_vnode = NULLVP;
1021 if (error != 0) {
1022 goto out2;
1023 }
1024 }
1025 } // localargs && !(snapshot | data | vm)
1026
1027 #if CONFIG_MACF
1028 if ((flags & MNT_UPDATE) == 0) {
1029 mac_mount_label_init(mp);
1030 mac_mount_label_associate(ctx, mp);
1031 }
1032 if (labelstr) {
1033 if ((flags & MNT_UPDATE) != 0) {
1034 error = mac_mount_check_label_update(ctx, mp);
1035 if (error != 0) {
1036 goto out3;
1037 }
1038 }
1039 }
1040 #endif
1041 /*
1042 * Mount the filesystem. We already asserted that internal_flags
1043 * cannot have more than one mount-by-role bit set.
1044 */
1045 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
1046 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
1047 (caddr_t)fsmountargs, 0, ctx);
1048 } else if (internal_flags & KERNEL_MOUNT_DATAVOL) {
1049 #if CONFIG_ROSV_STARTUP
1050 struct mount *origin_mp = (struct mount*)fsmountargs;
1051 fs_role_mount_args_t frma = {origin_mp, VFS_DATA_ROLE};
1052 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1053 if (error) {
1054 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_DATA_ROLE, error);
1055 } else {
1056 /* Mark volume associated with system volume */
1057 mp->mnt_kern_flag |= MNTK_SYSTEM;
1058
1059 /* Attempt to acquire the mnt_devvp and set it up */
1060 struct vnode *mp_devvp = NULL;
1061 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1062 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1063 0, &mp_devvp, vfs_context_kernel());
1064 if (!lerr) {
1065 mp->mnt_devvp = mp_devvp;
1066 //vnode_lookup took an iocount, need to drop it.
1067 vnode_put(mp_devvp);
1068 // now set `device_vnode` to the devvp that was acquired.
1069 // this is needed in order to ensure vfs_init_io_attributes is invoked.
1070 // note that though the iocount above was dropped, the mount acquires
1071 // an implicit reference against the device.
1072 device_vnode = mp_devvp;
1073 }
1074 }
1075 }
1076 #else
1077 error = EINVAL;
1078 #endif
1079 } else if (internal_flags & KERNEL_MOUNT_VMVOL) {
1080 #if CONFIG_MOUNT_VM
1081 struct mount *origin_mp = (struct mount*)fsmountargs;
1082 fs_role_mount_args_t frma = {origin_mp, VFS_VM_ROLE};
1083 error = VFS_IOCTL(mp, VFSIOC_MOUNT_BYROLE, (caddr_t)&frma, 0, ctx);
1084 if (error) {
1085 printf("MOUNT-BY-ROLE (%d) failed! (%d)", VFS_VM_ROLE, error);
1086 } else {
1087 /* Mark volume associated with system volume and a swap mount */
1088 mp->mnt_kern_flag |= (MNTK_SYSTEM | MNTK_SWAP_MOUNT);
1089 /* Attempt to acquire the mnt_devvp and set it up */
1090 struct vnode *mp_devvp = NULL;
1091 if (mp->mnt_vfsstat.f_mntfromname[0] != 0) {
1092 errno_t lerr = vnode_lookup(mp->mnt_vfsstat.f_mntfromname,
1093 0, &mp_devvp, vfs_context_kernel());
1094 if (!lerr) {
1095 mp->mnt_devvp = mp_devvp;
1096 //vnode_lookup took an iocount, need to drop it.
1097 vnode_put(mp_devvp);
1098
1099 // now set `device_vnode` to the devvp that was acquired.
1100 // note that though the iocount above was dropped, the mount acquires
1101 // an implicit reference against the device.
1102 device_vnode = mp_devvp;
1103 }
1104 }
1105 }
1106 #else
1107 error = EINVAL;
1108 #endif
1109 } else {
1110 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
1111 }
1112
1113 if (flags & MNT_UPDATE) {
1114 if (mp->mnt_kern_flag & MNTK_WANTRDWR) {
1115 mp->mnt_flag &= ~MNT_RDONLY;
1116 }
1117 mp->mnt_flag &= ~
1118 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
1119 mp->mnt_kern_flag &= ~MNTK_WANTRDWR;
1120 if (error) {
1121 mp->mnt_flag = flag; /* restore flag value */
1122 }
1123 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
1124 lck_rw_done(&mp->mnt_rwlock);
1125 is_rwlock_locked = FALSE;
1126 if (!error) {
1127 enablequotas(mp, ctx);
1128 }
1129 goto exit;
1130 }
1131
1132 /*
1133 * Put the new filesystem on the mount list after root.
1134 */
1135 if (error == 0) {
1136 struct vfs_attr vfsattr;
1137 #if CONFIG_MACF
1138 error = mac_mount_check_mount_late(ctx, mp);
1139 if (error != 0) {
1140 goto out3;
1141 }
1142
1143 if (vfs_flags(mp) & MNT_MULTILABEL) {
1144 error = VFS_ROOT(mp, &rvp, ctx);
1145 if (error) {
1146 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1147 goto out3;
1148 }
1149 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1150 /*
1151 * drop reference provided by VFS_ROOT
1152 */
1153 vnode_put(rvp);
1154
1155 if (error) {
1156 goto out3;
1157 }
1158 }
1159 #endif /* MAC */
1160
1161 vnode_lock_spin(vp);
1162 CLR(vp->v_flag, VMOUNT);
1163 vp->v_mountedhere = mp;
1164 vnode_unlock(vp);
1165
1166 /*
1167 * taking the name_cache_lock exclusively will
1168 * insure that everyone is out of the fast path who
1169 * might be trying to use a now stale copy of
1170 * vp->v_mountedhere->mnt_realrootvp
1171 * bumping mount_generation causes the cached values
1172 * to be invalidated
1173 */
1174 name_cache_lock();
1175 mount_generation++;
1176 name_cache_unlock();
1177
1178 error = vnode_ref(vp);
1179 if (error != 0) {
1180 goto out4;
1181 }
1182
1183 have_usecount = TRUE;
1184
1185 error = checkdirs(vp, ctx);
1186 if (error != 0) {
1187 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1188 goto out4;
1189 }
1190 /*
1191 * there is no cleanup code here so I have made it void
1192 * we need to revisit this
1193 */
1194 (void)VFS_START(mp, 0, ctx);
1195
1196 if (mount_list_add(mp) != 0) {
1197 /*
1198 * The system is shutting down trying to umount
1199 * everything, so fail with a plausible errno.
1200 */
1201 error = EBUSY;
1202 goto out4;
1203 }
1204 lck_rw_done(&mp->mnt_rwlock);
1205 is_rwlock_locked = FALSE;
1206
1207 /* Check if this mounted file system supports EAs or named streams. */
1208 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1209 VFSATTR_INIT(&vfsattr);
1210 VFSATTR_WANTED(&vfsattr, f_capabilities);
1211 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1212 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1213 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1214 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1215 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1216 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1217 }
1218 #if NAMEDSTREAMS
1219 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1220 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1221 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1222 }
1223 #endif
1224 /* Check if this file system supports path from id lookups. */
1225 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1226 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1227 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1228 } else if (mp->mnt_flag & MNT_DOVOLFS) {
1229 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1230 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1231 }
1232
1233 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1234 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1235 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1236 }
1237 }
1238 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1239 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1240 }
1241 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1242 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1243 }
1244 /* increment the operations count */
1245 OSAddAtomic(1, &vfs_nummntops);
1246 enablequotas(mp, ctx);
1247
1248 if (device_vnode) {
1249 device_vnode->v_specflags |= SI_MOUNTEDON;
1250
1251 /*
1252 * cache the IO attributes for the underlying physical media...
1253 * an error return indicates the underlying driver doesn't
1254 * support all the queries necessary... however, reasonable
1255 * defaults will have been set, so no reason to bail or care
1256 */
1257 vfs_init_io_attributes(device_vnode, mp);
1258 }
1259
1260 /* Now that mount is setup, notify the listeners */
1261 vfs_notify_mount(pvp);
1262 IOBSDMountChange(mp, kIOMountChangeMount);
1263 } else {
1264 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1265 if (mp->mnt_vnodelist.tqh_first != NULL) {
1266 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1267 mp->mnt_vtable->vfc_name, error);
1268 }
1269
1270 vnode_lock_spin(vp);
1271 CLR(vp->v_flag, VMOUNT);
1272 vnode_unlock(vp);
1273 mount_list_lock();
1274 mp->mnt_vtable->vfc_refcount--;
1275 mount_list_unlock();
1276
1277 if (device_vnode) {
1278 vnode_rele(device_vnode);
1279 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD | FWRITE, ctx);
1280 }
1281 lck_rw_done(&mp->mnt_rwlock);
1282 is_rwlock_locked = FALSE;
1283
1284 /*
1285 * if we get here, we have a mount structure that needs to be freed,
1286 * but since the coveredvp hasn't yet been updated to point at it,
1287 * no need to worry about other threads holding a crossref on this mp
1288 * so it's ok to just free it
1289 */
1290 mount_lock_destroy(mp);
1291 #if CONFIG_MACF
1292 mac_mount_label_destroy(mp);
1293 #endif
1294 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1295 }
1296 exit:
1297 /*
1298 * drop I/O count on the device vp if there was one
1299 */
1300 if (devpath && devvp) {
1301 vnode_put(devvp);
1302 }
1303
1304 return error;
1305
1306 /* Error condition exits */
1307 out4:
1308 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1309
1310 /*
1311 * If the mount has been placed on the covered vp,
1312 * it may have been discovered by now, so we have
1313 * to treat this just like an unmount
1314 */
1315 mount_lock_spin(mp);
1316 mp->mnt_lflag |= MNT_LDEAD;
1317 mount_unlock(mp);
1318
1319 if (device_vnode != NULLVP) {
1320 vnode_rele(device_vnode);
1321 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
1322 ctx);
1323 did_rele = TRUE;
1324 }
1325
1326 vnode_lock_spin(vp);
1327
1328 mp->mnt_crossref++;
1329 vp->v_mountedhere = (mount_t) 0;
1330
1331 vnode_unlock(vp);
1332
1333 if (have_usecount) {
1334 vnode_rele(vp);
1335 }
1336 out3:
1337 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) {
1338 vnode_rele(devvp);
1339 }
1340 out2:
1341 if (devpath && devvp) {
1342 vnode_put(devvp);
1343 }
1344 out1:
1345 /* Release mnt_rwlock only when it was taken */
1346 if (is_rwlock_locked == TRUE) {
1347 lck_rw_done(&mp->mnt_rwlock);
1348 }
1349
1350 if (mntalloc) {
1351 if (mp->mnt_crossref) {
1352 mount_dropcrossref(mp, vp, 0);
1353 } else {
1354 mount_lock_destroy(mp);
1355 #if CONFIG_MACF
1356 mac_mount_label_destroy(mp);
1357 #endif
1358 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
1359 }
1360 }
1361 if (vfsp_ref) {
1362 mount_list_lock();
1363 vfsp->vfc_refcount--;
1364 mount_list_unlock();
1365 }
1366
1367 return error;
1368 }
1369
/*
 * Flush in-core data, check for competing mount attempts,
 * and set VMOUNT
 *
 * On success the covered vnode `vp` is left with VMOUNT set (under the
 * vnode lock) to advertise the in-progress mount; the caller is
 * responsible for clearing it on its own failure paths.
 */
int
prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
{
#if !CONFIG_MACF
#pragma unused(cnp,fsname)
#endif
	struct vnode_attr va;
	int error;

	if (!skip_auth) {
		/*
		 * If the user is not root, ensure that they own the directory
		 * onto which we are attempting to mount.
		 * Note: a vnode_getattr() failure is also reported as EPERM.
		 */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		if ((error = vnode_getattr(vp, &va, ctx)) ||
		    (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
		    (!vfs_context_issuser(ctx)))) {
			error = EPERM;
			goto out;
		}
	}

	/* Push any dirty data for the soon-to-be-covered vnode to disk */
	if ((error = VNOP_FSYNC(vp, MNT_WAIT, ctx))) {
		goto out;
	}

	/* Invalidate cached buffers, writing out remaining dirty ones */
	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0))) {
		goto out;
	}

	/* Mounts may only cover directories */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/*
	 * Refuse if a mount is already in progress here AND something is
	 * already mounted on this vnode.
	 */
	if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	/* MAC hook: may veto mounting `fsname` on this directory */
	error = mac_mount_check_mount(ctx, vp,
	    cnp, fsname);
	if (error != 0) {
		goto out;
	}
#endif

	/* Advertise the in-progress mount on the covered vnode */
	vnode_lock_spin(vp);
	SET(vp->v_flag, VMOUNT);
	vnode_unlock(vp);

out:
	return error;
}
1431
1432 #if CONFIG_IMGSRC_ACCESS
1433
1434 #define DEBUG_IMGSRC 0
1435
1436 #if DEBUG_IMGSRC
1437 #define IMGSRC_DEBUG(args...) printf("imgsrc: " args)
1438 #else
1439 #define IMGSRC_DEBUG(args...) do { } while(0)
1440 #endif
1441
/*
 * Validate a device path supplied for an imageboot-source relocation.
 * The path must resolve to a block device whose dev_t matches the
 * device already backing `mp`, and non-root callers must be authorized
 * on it.  On success, the resolved path is copied into
 * mp->mnt_vfsstat.f_mntfromname and *devvpp receives the device vnode
 * with the iocount taken by namei() (caller must vnode_put()).  On
 * failure that iocount is dropped here.
 */
static int
authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
{
	struct nameidata nd;
	vnode_t vp, realdevvp;
	mode_t accessmode;
	int error;
	enum uio_seg uio = UIO_USERSPACE;

	/* Kernel-context callers pass a kernel-space path (see mount_locker_protoboot) */
	if (ctx == vfs_context_kernel()) {
		uio = UIO_SYSSPACE;
	}

	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, uio, devpath, ctx);
	if ((error = namei(&nd))) {
		IMGSRC_DEBUG("namei() failed with %d\n", error);
		return error;
	}

	vp = nd.ni_vp;

	if (!vnode_isblk(vp)) {
		IMGSRC_DEBUG("Not block device.\n");
		error = ENOTBLK;
		goto out;
	}

	realdevvp = mp->mnt_devvp;
	if (realdevvp == NULLVP) {
		IMGSRC_DEBUG("No device backs the mount.\n");
		error = ENXIO;
		goto out;
	}

	error = vnode_getwithref(realdevvp);
	if (error != 0) {
		IMGSRC_DEBUG("Coudn't get iocount on device.\n");
		goto out;
	}

	/* The supplied path must name the same underlying device as the mount */
	if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
		IMGSRC_DEBUG("Wrong dev_t.\n");
		error = ENXIO;
		goto out1;
	}

	strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (!vfs_context_issuser(ctx)) {
		accessmode = KAUTH_VNODE_READ_DATA;
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			accessmode |= KAUTH_VNODE_WRITE_DATA;
		}
		if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
			IMGSRC_DEBUG("Access denied.\n");
			goto out1;
		}
	}

	/* Success: transfer the namei() iocount on vp to the caller */
	*devvpp = vp;

out1:
	vnode_put(realdevvp);

out:
	nameidone(&nd);

	/* On failure, release the iocount namei() took on vp */
	if (error) {
		vnode_put(vp);
	}

	return error;
}
1519
/*
 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
 * and call checkdirs()
 *
 * On success, `vp` holds a usecount for the lifetime of the mount.
 */
static int
place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
{
	int error;

	mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */

	IMGSRC_DEBUG("placing: fsname = %s, vp = %s\n",
	    mp->mnt_vtable->vfc_name, vnode_getname(vp));

	/* Publish the mount on the covered vnode; VMOUNT phase is over */
	vnode_lock_spin(vp);
	CLR(vp->v_flag, VMOUNT);
	vp->v_mountedhere = mp;
	vnode_unlock(vp);

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();

	/* Hold a usecount on the covered vnode for the duration of the mount */
	error = vnode_ref(vp);
	if (error != 0) {
		goto out;
	}

	error = checkdirs(vp, ctx);
	if (error != 0) {
		/* Unmount the filesystem as cdir/rdirs cannot be updated */
		vnode_rele(vp);
		goto out;
	}

out:
	/*
	 * NOTE(review): on failure mnt_vnodecovered is reset here, but
	 * vp->v_mountedhere is left pointing at mp — presumably the
	 * caller's cleanup path is expected to tear that down; verify
	 * against relocate_imageboot_source()'s out2 handling.
	 */
	if (error != 0) {
		mp->mnt_vnodecovered = NULLVP;
	}
	return error;
}
1569
/*
 * Undo place_mount_and_checkdirs(): drop the usecount it took on the
 * covered vnode and detach the mount from it again.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
	vnode_rele(vp);
	vnode_lock_spin(vp);
	vp->v_mountedhere = (mount_t)NULL;
	vnode_unlock(vp);

	mp->mnt_vnodecovered = NULLVP;
}
1580
1581 static int
1582 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1583 {
1584 int error;
1585
1586 /* unmount in progress return error */
1587 mount_lock_spin(mp);
1588 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1589 mount_unlock(mp);
1590 return EBUSY;
1591 }
1592 mount_unlock(mp);
1593 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1594
1595 /*
1596 * We only allow the filesystem to be reloaded if it
1597 * is currently mounted read-only.
1598 */
1599 if ((flags & MNT_RELOAD) &&
1600 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1601 error = ENOTSUP;
1602 goto out;
1603 }
1604
1605 /*
1606 * Only root, or the user that did the original mount is
1607 * permitted to update it.
1608 */
1609 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1610 (!vfs_context_issuser(ctx))) {
1611 error = EPERM;
1612 goto out;
1613 }
1614 #if CONFIG_MACF
1615 error = mac_mount_check_remount(ctx, mp);
1616 if (error != 0) {
1617 goto out;
1618 }
1619 #endif
1620
1621 out:
1622 if (error) {
1623 lck_rw_done(&mp->mnt_rwlock);
1624 }
1625
1626 return error;
1627 }
1628
/*
 * Counterpart to mount_begin_update(): release the mount rwlock taken
 * there.  Call only after a successful mount_begin_update().
 */
static void
mount_end_update(mount_t mp)
{
	lck_rw_done(&mp->mnt_rwlock);
}
1634
1635 static int
1636 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1637 {
1638 vnode_t vp;
1639
1640 if (height >= MAX_IMAGEBOOT_NESTING) {
1641 return EINVAL;
1642 }
1643
1644 vp = imgsrc_rootvnodes[height];
1645 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1646 *rvpp = vp;
1647 return 0;
1648 } else {
1649 return ENOENT;
1650 }
1651 }
1652
/*
 * Relocate the imageboot source filesystem onto the covered vnode `vp`,
 * making it visible at that path.  Only root may do this, and a given
 * mount may only be moved once (guarded by MNTK_HAS_MOVED).
 *
 * fsmountargs is interpreted either as a mnt_imgsrc_args structure
 * (by_index == TRUE, selecting the nesting height) or, for binary
 * compatibility, as a bare device path implying one level of nesting.
 * `pvp` is the parent of the mount point, used only for the mount
 * notification on success.
 */
static int
relocate_imageboot_source(vnode_t pvp, vnode_t vp,
    struct componentname *cnp, const char *fsname, vfs_context_t ctx,
    boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
{
	int error;
	mount_t mp;
	boolean_t placed = FALSE;
	struct vfstable *vfsp;
	user_addr_t devpath;
	char *old_mntonname;
	vnode_t rvp;
	vnode_t devvp;
	uint32_t height;
	uint32_t flags;

	/* If we didn't imageboot, nothing to move */
	if (imgsrc_rootvnodes[0] == NULLVP) {
		return EINVAL;
	}

	/* Only root can do this */
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	IMGSRC_DEBUG("looking for root vnode.\n");

	/*
	 * Get root vnode of filesystem we're moving.
	 */
	if (by_index) {
		/* Height/flags/devpath come from a mnt_imgsrc_args structure */
		if (is64bit) {
			struct user64_mnt_imgsrc_args mia64;
			error = copyin(fsmountargs, &mia64, sizeof(mia64));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia64.mi_height;
			flags = mia64.mi_flags;
			devpath = mia64.mi_devpath;
		} else {
			struct user32_mnt_imgsrc_args mia32;
			error = copyin(fsmountargs, &mia32, sizeof(mia32));
			if (error != 0) {
				IMGSRC_DEBUG("Failed to copy in arguments.\n");
				return error;
			}

			height = mia32.mi_height;
			flags = mia32.mi_flags;
			devpath = mia32.mi_devpath;
		}
	} else {
		/*
		 * For binary compatibility--assumes one level of nesting.
		 */
		if (is64bit) {
			if ((error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath)))) {
				return error;
			}
		} else {
			user32_addr_t tmp;
			if ((error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp)))) {
				return error;
			}

			/* munge into LP64 addr */
			devpath = CAST_USER_ADDR_T(tmp);
		}

		height = 0;
		flags = 0;
	}

	/* No flag bits are currently defined for this operation */
	if (flags != 0) {
		IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
		return EINVAL;
	}

	/* On success, takes an iocount on rvp (dropped at out0/exit) */
	error = get_imgsrc_rootvnode(height, &rvp);
	if (error != 0) {
		IMGSRC_DEBUG("getting old root vnode failed with %d\n", error);
		return error;
	}

	IMGSRC_DEBUG("got old root vnode\n");

	/* M_WAITOK: blocks rather than failing, so no NULL check needed */
	MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);

	/* Can only move once */
	mp = vnode_mount(rvp);
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved.\n");
		error = EBUSY;
		goto out0;
	}

	IMGSRC_DEBUG("moving rvp: fsname = %s\n", mp->mnt_vtable->vfc_name);
	IMGSRC_DEBUG("Starting updated.\n");

	/* Get exclusive rwlock on mount, authorize update on mp */
	error = mount_begin_update(mp, ctx, 0);
	if (error != 0) {
		IMGSRC_DEBUG("Starting updated failed with %d\n", error);
		goto out0;
	}

	/*
	 * It can only be moved once. Flag is set under the rwlock,
	 * so we're now safe to proceed.
	 * NOTE(review): this path leaves `error` at 0, so a caller that
	 * loses the race reports success without moving anything — the
	 * earlier (unlocked) check returns EBUSY for the same condition;
	 * confirm whether that asymmetry is intended.
	 */
	if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
		IMGSRC_DEBUG("Already moved [2]\n");
		goto out1;
	}

	IMGSRC_DEBUG("Preparing coveredvp.\n");

	/* Mark covered vnode as mount in progress, authorize placing mount on top */
	error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
	if (error != 0) {
		IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
		goto out1;
	}

	IMGSRC_DEBUG("Covered vp OK.\n");

	/* Sanity check the name caller has provided */
	vfsp = mp->mnt_vtable;
	if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
		IMGSRC_DEBUG("Wrong fs name: actual = %s, expected = %s\n",
		    vfsp->vfc_name, fsname);
		error = EINVAL;
		goto out2;
	}

	/* Check the device vnode and update mount-from name, for local filesystems */
	if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
		IMGSRC_DEBUG("Local, doing device validation.\n");

		if (devpath != USER_ADDR_NULL) {
			error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
			if (error) {
				IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
				goto out2;
			}

			/* Only needed the identity/authorization check; drop the iocount */
			vnode_put(devvp);
		}
	}

	/*
	 * Place mp on top of vnode, ref the vnode, call checkdirs(),
	 * and increment the name cache's mount generation
	 */

	IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
	error = place_mount_and_checkdirs(mp, vp, ctx);
	if (error != 0) {
		goto out2;
	}

	placed = TRUE;

	/* Save the old mount-on name so it can be restored if list-add fails */
	strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
	strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);

	/* Forbid future moves */
	mount_lock(mp);
	mp->mnt_kern_flag |= MNTK_HAS_MOVED;
	mount_unlock(mp);

	/* Finally, add to mount list, completely ready to go */
	if (mount_list_add(mp) != 0) {
		/*
		 * The system is shutting down trying to umount
		 * everything, so fail with a plausible errno.
		 */
		error = EBUSY;
		goto out3;
	}

	mount_end_update(mp);
	vnode_put(rvp);
	FREE(old_mntonname, M_TEMP);

	vfs_notify_mount(pvp);

	return 0;
out3:
	/* Restore the previous mount-on name and allow another move attempt */
	strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);

	mount_lock(mp);
	mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
	mount_unlock(mp);

out2:
	/*
	 * Placing the mp on the vnode clears VMOUNT,
	 * so cleanup is different after that point
	 */
	if (placed) {
		/* Rele the vp, clear VMOUNT and v_mountedhere */
		undo_place_on_covered_vp(mp, vp);
	} else {
		vnode_lock_spin(vp);
		CLR(vp->v_flag, VMOUNT);
		vnode_unlock(vp);
	}
out1:
	mount_end_update(mp);

out0:
	vnode_put(rvp);
	FREE(old_mntonname, M_TEMP);
	return error;
}
1873
1874 #if CONFIG_LOCKERBOOT
/*
 * Mount the locker protoboot volume: a kernel-context wrapper that
 * looks up `mntpoint` and relocates the imageboot source filesystem
 * named `fsname` onto it, using `pbdevpath` as the backing device path.
 * All paths are kernel-space strings (ctx is the kernel context, so
 * the relocate path copies arguments with UIO_SYSSPACE semantics).
 */
__private_extern__
int
mount_locker_protoboot(const char *fsname, const char *mntpoint,
    const char *pbdevpath)
{
	int error = -1;
	struct nameidata nd;
	boolean_t cleanup_nd = FALSE;
	vfs_context_t ctx = vfs_context_kernel();
	boolean_t is64 = TRUE;
	boolean_t by_index = TRUE;
	struct user64_mnt_imgsrc_args mia64 = {
		.mi_height = 0,
		.mi_flags = 0,
		.mi_devpath = CAST_USER_ADDR_T(pbdevpath),
	};
	user_addr_t mia64addr = CAST_USER_ADDR_T(&mia64);

	/* WANTPARENT: relocate_imageboot_source needs the parent vnode too */
	NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(mntpoint), ctx);
	error = namei(&nd);
	if (error) {
		IMGSRC_DEBUG("namei: %d\n", error);
		goto out;
	}

	cleanup_nd = TRUE;
	error = relocate_imageboot_source(nd.ni_dvp, nd.ni_vp,
	    &nd.ni_cnd, fsname, ctx, is64, mia64addr, by_index);

out:
	if (cleanup_nd) {
		/* Preserve the relocate result across the vnode_put() calls */
		int stashed = error;

		error = vnode_put(nd.ni_vp);
		if (error) {
			panic("vnode_put() returned non-zero: %d", error);
		}

		if (nd.ni_dvp) {
			error = vnode_put(nd.ni_dvp);
			if (error) {
				panic("vnode_put() returned non-zero: %d", error);
			}
		}
		nameidone(&nd);

		error = stashed;
	}
	return error;
}
1926 #endif /* CONFIG_LOCKERBOOT */
1927 #endif /* CONFIG_IMGSRC_ACCESS */
1928
/*
 * Turn on disk quotas for a newly mounted (or updated) HFS filesystem
 * when the per-type quota trigger files exist at the mount's root.
 * Errors are deliberately ignored: quota setup must not interfere with
 * completing the mount.
 */
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
	struct nameidata qnd;
	int type;
	char qfpath[MAXPATHLEN];
	const char *qfname = QUOTAFILENAME;
	const char *qfopsname = QUOTAOPSNAME;
	const char *qfextension[] = INITQFNAMES;

	/* XXX Should be an MNTK_ flag, instead of strncmp()'s */
	if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0) {
		return;
	}
	/*
	 * Enable filesystem disk quotas if necessary.
	 * We ignore errors as this should not interfere with final mount
	 */
	for (type = 0; type < MAXQUOTAS; type++) {
		/* Probe for the ".opsname.<type>" trigger file; existence is the signal */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
		NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
		    CAST_USER_ADDR_T(qfpath), ctx);
		if (namei(&qnd) != 0) {
			continue; /* option file to trigger quotas is not present */
		}
		/* Drop the lookup's iocount and name buffer; only existence mattered */
		vnode_put(qnd.ni_vp);
		nameidone(&qnd);
		/* Now build the actual quota-file path and turn quotas on */
		snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);

		(void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
	}
	return;
}
1962
1963
/*
 * Per-process callback for checkdirs(): if this process's current or
 * root directory is the vnode just covered by a mount (cdrp->olddp),
 * redirect it to the new mount's root (cdrp->newdp), transferring
 * usecounts appropriately.  Always returns PROC_RETURNED so the
 * process iteration continues.
 */
static int
checkdirs_callback(proc_t p, void * arg)
{
	struct cdirargs * cdrp = (struct cdirargs *)arg;
	vnode_t olddp = cdrp->olddp;
	vnode_t newdp = cdrp->newdp;
	struct filedesc *fdp;
	vnode_t new_cvp = newdp;
	vnode_t new_rvp = newdp;
	vnode_t old_cvp = NULL;
	vnode_t old_rvp = NULL;

	/*
	 * XXX Also needs to iterate each thread in the process to see if it
	 * XXX is using a per-thread current working directory, and, if so,
	 * XXX update that as well.
	 */

	/*
	 * First, with the proc_fdlock held, check to see if we will need
	 * to do any work. If not, we will get out fast.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp == NULL ||
	    (fdp->fd_cdir != olddp && fdp->fd_rdir != olddp)) {
		proc_fdunlock(p);
		return PROC_RETURNED;
	}
	proc_fdunlock(p);

	/*
	 * Ok, we will have to do some work. Always take two refs
	 * because we might need that many. We'll dispose of whatever
	 * we ended up not using.
	 */
	if (vnode_ref(newdp) != 0) {
		return PROC_RETURNED;
	}
	if (vnode_ref(newdp) != 0) {
		vnode_rele(newdp);
		return PROC_RETURNED;
	}

	/*
	 * Now do the work. Note: we dropped the proc_fdlock, so we
	 * have to do all of the checks again.
	 */
	proc_fdlock(p);
	fdp = p->p_fd;
	if (fdp != NULL) {
		if (fdp->fd_cdir == olddp) {
			old_cvp = olddp;	/* olddp's cwd usecount is now ours to drop */
			fdp->fd_cdir = newdp;
			new_cvp = NULL;		/* this newdp ref is now owned by fd_cdir */
		}
		if (fdp->fd_rdir == olddp) {
			old_rvp = olddp;	/* olddp's rootdir usecount is now ours to drop */
			fdp->fd_rdir = newdp;
			new_rvp = NULL;		/* this newdp ref is now owned by fd_rdir */
		}
	}
	proc_fdunlock(p);

	/*
	 * Dispose of any references that are no longer needed.
	 */
	if (old_cvp != NULL) {
		vnode_rele(old_cvp);
	}
	if (old_rvp != NULL) {
		vnode_rele(old_rvp);
	}
	if (new_cvp != NULL) {
		vnode_rele(new_cvp);
	}
	if (new_rvp != NULL) {
		vnode_rele(new_rvp);
	}

	return PROC_RETURNED;
}
2046
2047
2048
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 */
static int
checkdirs(vnode_t olddp, vfs_context_t ctx)
{
	vnode_t newdp;
	vnode_t tvp;
	int err;
	struct cdirargs cdr;

	/* Sole usecount is the mount's own: no process can have olddp as cwd/root */
	if (olddp->v_usecount == 1) {
		return 0;
	}
	/* Get the new filesystem's root vnode (returned with an iocount) */
	err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);

	if (err != 0) {
#if DIAGNOSTIC
		panic("mount: lost mount: error %d", err);
#endif
		return err;
	}

	cdr.olddp = olddp;
	cdr.newdp = newdp;
	/* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);

	/*
	 * If the system root itself was just covered, swap the global
	 * rootvnode over to the new filesystem's root.
	 * NOTE(review): vnode_ref(newdp)'s return is ignored here —
	 * presumably it cannot fail while we hold an iocount; confirm.
	 */
	if (rootvnode == olddp) {
		vnode_ref(newdp);
		tvp = rootvnode;
		rootvnode = newdp;
		vnode_rele(tvp);
	}

	/* Drop the iocount from VFS_ROOT */
	vnode_put(newdp);
	return 0;
}
2089
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 */
/* ARGSUSED */
int
unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/* Resolve the user-supplied path to the mounted-on vnode */
	NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	/* MAC hook: may veto the unmount before any state changes */
	error = mac_mount_check_umount(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif
	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vnode_put(vp);
		return EINVAL;
	}
	/* Take a mount ref and drop the vnode iocount before unmounting */
	mount_ref(mp, 0);
	vnode_put(vp);
	/* safedounmount consumes the mount ref */
	return safedounmount(mp, uap->flags, ctx);
}
2135
2136 int
2137 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
2138 {
2139 mount_t mp;
2140
2141 mp = mount_list_lookupby_fsid(fsid, 0, 1);
2142 if (mp == (mount_t)0) {
2143 return ENOENT;
2144 }
2145 mount_ref(mp, 0);
2146 mount_iterdrop(mp);
2147 /* safedounmount consumes the mount ref */
2148 return safedounmount(mp, flags, ctx);
2149 }
2150
2151
/*
 * The mount struct comes with a mount ref which will be consumed.
 * Do the actual file system unmount, prevent some common foot shooting.
 *
 * On every early-failure path the mount ref is dropped here; on the
 * success path dounmount() takes over ownership of the ref.
 */
int
safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
{
	int error;
	proc_t p = vfs_context_proc(ctx);

	/*
	 * If the file system is not responding and MNT_NOBLOCK
	 * is set and not a forced unmount then return EBUSY.
	 * NOTE(review): MNT_LNOTRESP is an mnt_lflag bit but is tested
	 * against mnt_kern_flag here — confirm which flag word is intended.
	 */
	if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
	    (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Skip authorization if the mount is tagged as permissive and
	 * this is not a forced-unmount attempt.
	 */
	if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to unmount this filesystem.
		 */
		if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
		    (error = suser(kauth_cred_get(), &p->p_acflag))) {
			goto out;
		}
	}
	/*
	 * Don't allow unmounting the root file system (or the associated VM or DATA mounts) .
	 */
	if ((mp->mnt_flag & MNT_ROOTFS) || (mp->mnt_kern_flag & MNTK_SYSTEM)) {
		error = EBUSY; /* the root (or associated volumes) is always busy */
		goto out;
	}

	/* NOTE(review): `#ifdef` (not `#if`) — this block compiles in even if the config macro is defined to 0 */
#ifdef CONFIG_IMGSRC_ACCESS
	/* The imageboot backing mount may never be unmounted */
	if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
		error = EBUSY;
		goto out;
	}
#endif /* CONFIG_IMGSRC_ACCESS */

	/* dounmount() takes ownership of (and consumes) the mount ref */
	return dounmount(mp, flags, 1, ctx);

out:
	mount_drop(mp, 0);
	return error;
}
2207
/*
 * Do the actual file system unmount.
 *
 * mp		the mount to tear down
 * flags	MNT_FORCE / MNT_NOBLOCK / MNT_LNOSUB etc.
 * withref	non-zero if the caller's mount ref should be consumed here
 * ctx		calling context
 *
 * On success the mount is removed from the mount list and either destroyed
 * here (MNT_ROOTFS case) or via mount_dropcrossref() once the covered
 * vnode's cross reference drains. On failure the MNTK_UNMOUNT /
 * MNT_LUNMOUNT / MNT_LFORCE state is rolled back and the mount stays usable.
 */
int
dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
{
	vnode_t coveredvp = (vnode_t)0;
	int error;
	int needwakeup = 0;
	int forcedunmount = 0;
	int lflags = 0;
	struct vnode *devvp = NULLVP;
#if CONFIG_TRIGGERS
	proc_t p = vfs_context_proc(ctx);
	int did_vflush = 0;
	int pflags_save = 0;
#endif /* CONFIG_TRIGGERS */

#if CONFIG_FSE
	if (!(flags & MNT_FORCE)) {
		fsevent_unmount(mp, ctx); /* has to come first! */
	}
#endif

	mount_lock(mp);

	/*
	 * If already an unmount in progress just return EBUSY.
	 * Even a forced unmount cannot override.
	 */
	if (mp->mnt_lflag & MNT_LUNMOUNT) {
		if (withref != 0) {
			mount_drop(mp, 1);
		}
		mount_unlock(mp);
		return EBUSY;
	}

	if (flags & MNT_FORCE) {
		forcedunmount = 1;
		mp->mnt_lflag |= MNT_LFORCE;
	}

#if CONFIG_TRIGGERS
	/* Don't let trigger resolution hang this (non-kernel) process. */
	if (flags & MNT_NOBLOCK && p != kernproc) {
		pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
	}
#endif

	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	mp->mnt_lflag |= MNT_LUNMOUNT;
	mp->mnt_flag &= ~MNT_ASYNC;
	/*
	 * anyone currently in the fast path that
	 * trips over the cached rootvp will be
	 * dumped out and forced into the slow path
	 * to regenerate a new cached value
	 */
	mp->mnt_realrootvp = NULLVP;
	mount_unlock(mp);

	if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
		/*
		 * Force unmount any mounts in this filesystem.
		 * If any unmounts fail - just leave them dangling.
		 * Avoids recursion.
		 */
		(void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
	}

	/*
	 * taking the name_cache_lock exclusively will
	 * insure that everyone is out of the fast path who
	 * might be trying to use a now stale copy of
	 * vp->v_mountedhere->mnt_realrootvp
	 * bumping mount_generation causes the cached values
	 * to be invalidated
	 */
	name_cache_lock();
	mount_generation++;
	name_cache_unlock();


	lck_rw_lock_exclusive(&mp->mnt_rwlock);
	if (withref != 0) {
		/* consume the ref the caller passed in (see safedounmount) */
		mount_drop(mp, 0);
	}
	error = 0;
	if (forcedunmount == 0) {
		ubc_umount(mp); /* release cached vnodes */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			error = VFS_SYNC(mp, MNT_WAIT, ctx);
			if (error) {
				/* sync failed: roll back the unmount-in-progress state */
				mount_lock(mp);
				mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
				mp->mnt_lflag &= ~MNT_LUNMOUNT;
				mp->mnt_lflag &= ~MNT_LFORCE;
				goto out;
			}
		}
	}

	IOBSDMountChange(mp, kIOMountChangeUnmount);

#if CONFIG_TRIGGERS
	vfs_nested_trigger_unmounts(mp, flags, ctx);
	did_vflush = 1;
#endif
	if (forcedunmount) {
		lflags |= FORCECLOSE;
	}
	error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
	if ((forcedunmount == 0) && error) {
		/* vnodes still busy: roll back the unmount-in-progress state */
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* make sure no one is left in the mount iterations or lookup */
	mount_iterdrain(mp);

	error = VFS_UNMOUNT(mp, flags, ctx);
	if (error) {
		/* filesystem refused the unmount: undo and allow iteration again */
		mount_iterreset(mp);
		mount_lock(mp);
		mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
		mp->mnt_lflag &= ~MNT_LUNMOUNT;
		mp->mnt_lflag &= ~MNT_LFORCE;
		goto out;
	}

	/* increment the operations count */
	if (!error) {
		OSAddAtomic(1, &vfs_nummntops);
	}

	if (mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
		/* hold an io reference and drop the usecount before close */
		devvp = mp->mnt_devvp;
		vnode_getalways(devvp);
		vnode_rele(devvp);
		VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD | FWRITE,
		    ctx);
		vnode_clearmountedon(devvp);
		vnode_put(devvp);
	}
	lck_rw_done(&mp->mnt_rwlock);
	mount_list_remove(mp);
	lck_rw_lock_exclusive(&mp->mnt_rwlock);

	/* mark the mount point hook in the vp but not drop the ref yet */
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		/*
		 * The covered vnode needs special handling. Trying to get an
		 * iocount must not block here as this may lead to deadlocks
		 * if the Filesystem to which the covered vnode belongs is
		 * undergoing forced unmounts. Since we hold a usecount, the
		 * vnode cannot be reused (it can, however, still be terminated)
		 */
		vnode_getalways(coveredvp);
		vnode_lock_spin(coveredvp);

		/* keep the mount alive until mount_dropcrossref() below */
		mp->mnt_crossref++;
		coveredvp->v_mountedhere = (struct mount *)0;
		CLR(coveredvp->v_flag, VMOUNT);

		vnode_unlock(coveredvp);
		vnode_put(coveredvp);
	}

	mount_list_lock();
	mp->mnt_vtable->vfc_refcount--;
	mount_list_unlock();

	cache_purgevfs(mp); /* remove cache entries for this file sys */
	vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
	mount_lock(mp);
	mp->mnt_lflag |= MNT_LDEAD;

	if (mp->mnt_lflag & MNT_LWAIT) {
		/*
		 * do the wakeup here
		 * in case we block in mount_refdrain
		 * which will drop the mount lock
		 * and allow anyone blocked in vfs_busy
		 * to wakeup and see the LDEAD state
		 */
		mp->mnt_lflag &= ~MNT_LWAIT;
		wakeup((caddr_t)mp);
	}
	mount_refdrain(mp);

	/* free disk_conditioner_info structure for this mount */
	disk_conditioner_unmount(mp);

out:
	/* NOTE: on the error paths we arrive here holding the mount lock. */
	if (mp->mnt_lflag & MNT_LWAIT) {
		mp->mnt_lflag &= ~MNT_LWAIT;
		needwakeup = 1;
	}

#if CONFIG_TRIGGERS
	if (flags & MNT_NOBLOCK && p != kernproc) {
		// Restore P_NOREMOTEHANG bit to its previous value
		if ((pflags_save & P_NOREMOTEHANG) == 0) {
			OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
		}
	}

	/*
	 * Callback and context are set together under the mount lock, and
	 * never cleared, so we're safe to examine them here, drop the lock,
	 * and call out.
	 */
	if (mp->mnt_triggercallback != NULL) {
		mount_unlock(mp);
		if (error == 0) {
			mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
		} else if (did_vflush) {
			mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
		}
	} else {
		mount_unlock(mp);
	}
#else
	mount_unlock(mp);
#endif /* CONFIG_TRIGGERS */

	lck_rw_done(&mp->mnt_rwlock);

	if (needwakeup) {
		wakeup((caddr_t)mp);
	}

	if (!error) {
		if ((coveredvp != NULLVP)) {
			vnode_t pvp = NULLVP;

			/*
			 * The covered vnode needs special handling. Trying to
			 * get an iocount must not block here as this may lead
			 * to deadlocks if the Filesystem to which the covered
			 * vnode belongs is undergoing forced unmounts. Since we
			 * hold a usecount, the vnode cannot be reused
			 * (it can, however, still be terminated).
			 */
			vnode_getalways(coveredvp);

			/* may destroy mp if the crossref count hits zero */
			mount_dropcrossref(mp, coveredvp, 0);
			/*
			 * We'll _try_ to detect if this really needs to be
			 * done. The coveredvp can only be in termination (or
			 * terminated) if the coveredvp's mount point is in a
			 * forced unmount (or has been) since we still hold the
			 * ref.
			 */
			if (!vnode_isrecycled(coveredvp)) {
				pvp = vnode_getparent(coveredvp);
#if CONFIG_TRIGGERS
				if (coveredvp->v_resolve) {
					vnode_trigger_rearm(coveredvp, ctx);
				}
#endif
			}

			vnode_rele(coveredvp);
			vnode_put(coveredvp);
			coveredvp = NULLVP;

			if (pvp) {
				/* notify watchers of the parent directory */
				lock_vnode_and_post(pvp, NOTE_WRITE);
				vnode_put(pvp);
			}
		} else if (mp->mnt_flag & MNT_ROOTFS) {
			/* root has no covered vnode; tear the mount down directly */
			mount_lock_destroy(mp);
#if CONFIG_MACF
			mac_mount_label_destroy(mp);
#endif
			FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
		} else {
			panic("dounmount: no coveredvp");
		}
	}
	return error;
}
2495
/*
 * Unmount any mounts in this filesystem.
 *
 * Builds an array of the fsids of every mount that is (transitively)
 * mounted on top of mp, then unmounts them in reverse (deepest-first)
 * order. Unmount errors are deliberately ignored.
 */
void
dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
{
	mount_t smp;
	fsid_t *fsids, fsid;
	int fsids_sz;
	int count = 0, i, m = 0;
	vnode_t vp;

	mount_list_lock();

	// Get an array to hold the submounts fsids.
	TAILQ_FOREACH(smp, &mountlist, mnt_list)
	count++;
	fsids_sz = count * sizeof(fsid_t);
	/* M_NOWAIT: we hold the mount list lock, so we must not block here */
	MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
	if (fsids == NULL) {
		mount_list_unlock();
		goto out;
	}
	fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump

	/*
	 * Fill the array with submount fsids.
	 * Since mounts are always added to the tail of the mount list, the
	 * list is always in mount order.
	 * For each mount check if the mounted-on vnode belongs to a
	 * mount that's already added to our array of mounts to be unmounted.
	 * (m indexes the last entry collected so far; it grows as matches
	 * are found, which is what makes the closure transitive.)
	 */
	for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
		vp = smp->mnt_vnodecovered;
		if (vp == NULL) {
			continue;
		}
		fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
		for (i = 0; i <= m; i++) {
			if (fsids[i].val[0] == fsid.val[0] &&
			    fsids[i].val[1] == fsid.val[1]) {
				fsids[++m] = smp->mnt_vfsstat.f_fsid;
				break;
			}
		}
	}
	mount_list_unlock();

	// Unmount the submounts in reverse order. Ignore errors.
	// (Index 0 is mp itself, which the caller unmounts.)
	for (i = m; i > 0; i--) {
		smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
		if (smp) {
			/* take a ref for dounmount() to consume, drop the iter ref */
			mount_ref(smp, 0);
			mount_iterdrop(smp);
			(void) dounmount(smp, flags, 1, ctx);
		}
	}
out:
	if (fsids) {
		FREE(fsids, M_TEMP);
	}
}
2558
/*
 * Drop one cross reference on mp, taken against the covered vnode dp.
 *
 * If this was the last crossref and dp no longer points at mp (i.e. the
 * unmount has already disconnected v_mountedhere), the mount structure is
 * destroyed here. need_put additionally releases the caller's iocount on dp.
 */
void
mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
{
	vnode_lock(dp);
	mp->mnt_crossref--;

	if (mp->mnt_crossref < 0) {
		panic("mount cross refs -ve");
	}

	/* last crossref gone and mount already detached from dp: free it */
	if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
		if (need_put) {
			vnode_put_locked(dp);
		}
		vnode_unlock(dp);

		mount_lock_destroy(mp);
#if CONFIG_MACF
		mac_mount_label_destroy(mp);
#endif
		FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
		return;
	}
	if (need_put) {
		vnode_put_locked(dp);
	}
	vnode_unlock(dp);
}
2587
2588
2589 /*
2590 * Sync each mounted filesystem.
2591 */
#if DIAGNOSTIC
int syncprt = 0;        /* non-zero: sync paths also dump buffer statistics */
#endif

int print_vmpage_stat = 0;      /* non-zero: sync paths report dirty VM page counts */
2597
2598 /*
2599 * sync_callback: simple wrapper that calls VFS_SYNC() on volumes
2600 * mounted read-write with the passed waitfor value.
2601 *
2602 * Parameters: mp mount-point descriptor per mounted file-system instance.
2603 * arg user argument (please see below)
2604 *
2605 * User argument is a pointer to 32 bit unsigned integer which describes the
2606 * type of waitfor value to set for calling VFS_SYNC(). If user argument is
2607 * passed as NULL, VFS_SYNC() is called with MNT_NOWAIT set as the default
2608 * waitfor value.
2609 *
2610 * Returns: VFS_RETURNED
2611 */
2612 static int
2613 sync_callback(mount_t mp, void *arg)
2614 {
2615 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2616 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2617 unsigned waitfor = MNT_NOWAIT;
2618
2619 if (arg) {
2620 waitfor = *(uint32_t*)arg;
2621 }
2622
2623 /* Sanity check for flags - these are the only valid combinations for the flag bits*/
2624 if (waitfor != MNT_WAIT &&
2625 waitfor != (MNT_WAIT | MNT_VOLUME) &&
2626 waitfor != MNT_NOWAIT &&
2627 waitfor != (MNT_NOWAIT | MNT_VOLUME) &&
2628 waitfor != MNT_DWAIT &&
2629 waitfor != (MNT_DWAIT | MNT_VOLUME)) {
2630 panic("Passed inappropriate waitfor %u to "
2631 "sync_callback()", waitfor);
2632 }
2633
2634 mp->mnt_flag &= ~MNT_ASYNC;
2635 (void)VFS_SYNC(mp, waitfor, vfs_context_kernel());
2636 if (asyncflag) {
2637 mp->mnt_flag |= MNT_ASYNC;
2638 }
2639 }
2640
2641 return VFS_RETURNED;
2642 }
2643
2644 /* ARGSUSED */
2645 int
2646 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2647 {
2648 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2649
2650 if (print_vmpage_stat) {
2651 vm_countdirtypages();
2652 }
2653
2654 #if DIAGNOSTIC
2655 if (syncprt) {
2656 vfs_bufstats();
2657 }
2658 #endif /* DIAGNOSTIC */
2659 return 0;
2660 }
2661
/* Selects which class of mounts sync_internal_callback() should flush. */
typedef enum {
	SYNC_ALL = 0,                   /* no filtering */
	SYNC_ONLY_RELIABLE_MEDIA = 1,   /* only local, non-virtual-device mounts */
	SYNC_ONLY_UNRELIABLE_MEDIA = 2  /* only virtual-device or non-local mounts */
} sync_type_t;
2667
2668 static int
2669 sync_internal_callback(mount_t mp, void *arg)
2670 {
2671 if (arg) {
2672 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2673 (mp->mnt_flag & MNT_LOCAL);
2674 sync_type_t sync_type = *((sync_type_t *)arg);
2675
2676 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) {
2677 return VFS_RETURNED;
2678 } else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) {
2679 return VFS_RETURNED;
2680 }
2681 }
2682
2683 (void)sync_callback(mp, NULL);
2684
2685 return VFS_RETURNED;
2686 }
2687
int sync_thread_state = 0;      /* SYNC_THREAD_* bits; protected by sync_mtx_lck */
int sync_timeout_seconds = 5;   /* how long sync_internal() waits for the sync thread */

#define SYNC_THREAD_RUN         0x0001  /* more work queued for the sync thread */
#define SYNC_THREAD_RUNNING     0x0002  /* a sync thread currently exists */
2693
/*
 * Body of the kernel thread started by sync_internal(). Loops while new
 * work keeps being posted (SYNC_THREAD_RUN), syncing reliable media first
 * and then unreliable media, then signals any waiter and exits.
 */
static void
sync_thread(__unused void *arg, __unused wait_result_t wr)
{
	sync_type_t sync_type;

	lck_mtx_lock(sync_mtx_lck);
	while (sync_thread_state & SYNC_THREAD_RUN) {
		sync_thread_state &= ~SYNC_THREAD_RUN;
		lck_mtx_unlock(sync_mtx_lck);

		/* Reliable media first so the important data hits stable storage. */
		sync_type = SYNC_ONLY_RELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
		sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
		vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);

		lck_mtx_lock(sync_mtx_lck);
	}
	/*
	 * This wakeup _has_ to be issued before the lock is released otherwise
	 * we may end up waking up a thread in sync_internal which is
	 * expecting a wakeup from a thread it just created and not from this
	 * thread which is about to exit.
	 */
	wakeup(&sync_thread_state);
	sync_thread_state &= ~SYNC_THREAD_RUNNING;
	lck_mtx_unlock(sync_mtx_lck);

	if (print_vmpage_stat) {
		vm_countdirtypages();
	}

#if DIAGNOSTIC
	if (syncprt) {
		vfs_bufstats();
	}
#endif /* DIAGNOSTIC */
}
2731
2732 struct timeval sync_timeout_last_print = {.tv_sec = 0, .tv_usec = 0};
2733
/*
 * An in-kernel sync for power management to call.
 * This function always returns within sync_timeout_seconds: it posts work
 * to (and if necessary creates) the sync thread, then waits for either its
 * wakeup or the timeout. Always returns 0.
 */
__private_extern__ int
sync_internal(void)
{
	thread_t thd;
	int error;
	int thread_created = FALSE;
	struct timespec ts = {.tv_sec = sync_timeout_seconds, .tv_nsec = 0};

	lck_mtx_lock(sync_mtx_lck);
	sync_thread_state |= SYNC_THREAD_RUN;
	if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
		int kr;

		sync_thread_state |= SYNC_THREAD_RUNNING;
		kr = kernel_thread_start(sync_thread, NULL, &thd);
		if (kr != KERN_SUCCESS) {
			/* couldn't start the worker; undo state and bail */
			sync_thread_state &= ~SYNC_THREAD_RUNNING;
			lck_mtx_unlock(sync_mtx_lck);
			printf("sync_thread failed\n");
			return 0;
		}
		thread_created = TRUE;
	}

	/* PDROP releases sync_mtx_lck; timeout bounds the wait */
	error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
	    (PVFS | PDROP | PCATCH), "sync_thread", &ts);
	if (error) {
		struct timeval now;

		microtime(&now);
		if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
			printf("sync timed out: %d sec\n", sync_timeout_seconds);
			sync_timeout_last_print.tv_sec = now.tv_sec;
		}
	}

	if (thread_created) {
		thread_deallocate(thd);
	}

	return 0;
} /* end of sync_internal call */
2780
/*
 * Change filesystem quotas.
 *
 * Looks up the mount for uap->path, copies in whatever argument the quota
 * subcommand needs, dispatches to VFS_QUOTACTL(), then copies results back
 * out for the query subcommands.
 */
#if QUOTA
int
quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error, quota_cmd, quota_status = 0;
	caddr_t datap;
	size_t fnamelen;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	struct dqblk my_dqblk = {};

	AUDIT_ARG(uid, uap->uid);
	AUDIT_ARG(cmd, uap->cmd);
	NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	/* hold the mount, not the vnode, across the quota call */
	mp = nd.ni_vp->v_mount;
	mount_ref(mp, 0);
	vnode_put(nd.ni_vp);
	nameidone(&nd);

	/* copyin any data we will need for downstream code */
	quota_cmd = uap->cmd >> SUBCMDSHIFT;

	switch (quota_cmd) {
	case Q_QUOTAON:
		/* uap->arg specifies a file from which to take the quotas */
		fnamelen = MAXPATHLEN;
		datap = kalloc(MAXPATHLEN);
		error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		break;
	case Q_SETQUOTA:
	case Q_SETUSE:
		/* uap->arg is a pointer to a dqblk structure. */
		datap = (caddr_t) &my_dqblk;
		if (proc_is64bit(p)) {
			/* 64-bit user dqblk must be munged into the kernel layout */
			struct user_dqblk my_dqblk64;
			error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof(my_dqblk64));
			if (error == 0) {
				munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
			}
		} else {
			error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof(my_dqblk));
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		datap = (caddr_t) &quota_status;
		break;
	default:
		datap = NULL;
		break;
	} /* switch */

	if (error == 0) {
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
	}

	/* copy results back out / release per-command resources */
	switch (quota_cmd) {
	case Q_QUOTAON:
		if (datap != NULL) {
			kfree(datap, MAXPATHLEN);
		}
		break;
	case Q_GETQUOTA:
		/* uap->arg is a pointer to a dqblk structure we need to copy out to */
		if (error == 0) {
			if (proc_is64bit(p)) {
				struct user_dqblk my_dqblk64;

				memset(&my_dqblk64, 0, sizeof(my_dqblk64));
				munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
				error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof(my_dqblk64));
			} else {
				error = copyout(datap, uap->arg, sizeof(struct dqblk));
			}
		}
		break;
	case Q_QUOTASTAT:
		/* uap->arg is a pointer to an integer */
		if (error == 0) {
			error = copyout(datap, uap->arg, sizeof(quota_status));
		}
		break;
	default:
		break;
	} /* switch */

	mount_drop(mp, 0);
	return error;
}
#else
int
quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
{
	return EOPNOTSUPP;
}
#endif /* QUOTA */
2890
/*
 * Get filesystem statistics.
 *
 * Returns: 0 Success
 * namei:???
 * vfs_update_vfsstat:???
 * munge_statfs:EFAULT
 */
/* ARGSUSED */
int
statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;

	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error != 0) {
		return error;
	}
	/* keep the iocount on vp to pin mp while we read its statistics */
	vp = nd.ni_vp;
	mp = vp->v_mount;
	sp = &mp->mnt_vfsstat;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctx, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif

	/* refresh the cached statistics from the filesystem */
	error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}

	/* copy out in the 32/64-bit layout the calling process expects */
	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
	vnode_put(vp);
	return error;
}
2939
/*
 * Get filesystem statistics for the filesystem containing the file
 * referenced by descriptor uap->fd.
 */
/* ARGSUSED */
int
fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct mount *mp;
	struct vfsstatfs *sp;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* vnode is not attached to any mount (e.g. being recycled) */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	sp = &mp->mnt_vfsstat;
	/* refresh the cached statistics before copying them out */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
2992
/*
 * Fill a zeroed struct statfs64 from mp's cached vfsstat. Does not refresh
 * the cache; callers run vfs_update_vfsstat() first if they need fresh data.
 */
void
vfs_get_statfs64(struct mount *mp, struct statfs64 *sfs)
{
	struct vfsstatfs *vsfs = &mp->mnt_vfsstat;

	bzero(sfs, sizeof(*sfs));

	sfs->f_bsize = vsfs->f_bsize;
	sfs->f_iosize = (int32_t)vsfs->f_iosize;
	sfs->f_blocks = vsfs->f_blocks;
	sfs->f_bfree = vsfs->f_bfree;
	sfs->f_bavail = vsfs->f_bavail;
	sfs->f_files = vsfs->f_files;
	sfs->f_ffree = vsfs->f_ffree;
	sfs->f_fsid = vsfs->f_fsid;
	sfs->f_owner = vsfs->f_owner;
	sfs->f_type = mp->mnt_vtable->vfc_typenum;
	sfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	sfs->f_fssubtype = vsfs->f_fssubtype;
	/* flag the root DATA volume (system mount that is neither swap nor the root fs) */
	sfs->f_flags_ext = ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS)) ? MNT_EXT_ROOT_DATA_VOL : 0;
	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
		/* mount requested a substitute fs type name */
		strlcpy(&sfs->f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
	} else {
		strlcpy(&sfs->f_fstypename[0], &vsfs->f_fstypename[0], MFSTYPENAMELEN);
	}
	strlcpy(&sfs->f_mntonname[0], &vsfs->f_mntonname[0], MAXPATHLEN);
	strlcpy(&sfs->f_mntfromname[0], &vsfs->f_mntfromname[0], MAXPATHLEN);
}
3021
/*
 * Get file system statistics in 64-bit mode
 */
int
statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
{
	struct mount *mp;
	int error;
	struct nameidata nd;
	struct statfs64 sfs;
	vfs_context_t ctxp = vfs_context_current();
	vnode_t vp;

	NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctxp);
	error = namei(&nd);
	if (error != 0) {
		return error;
	}
	vp = nd.ni_vp;
	mp = vp->v_mount;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_stat(ctxp, mp);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}
#endif

	/* refresh the cached statistics from the filesystem */
	error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
	if (error != 0) {
		vnode_put(vp);
		return error;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));
	vnode_put(vp);

	return error;
}
3070
/*
 * Get file system statistics in 64-bit mode
 * (fd-based variant of statfs64).
 */
int
fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
{
	struct vnode *vp;
	struct mount *mp;
	struct statfs64 sfs;
	int error;

	AUDIT_ARG(fd, uap->fd);

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	error = vnode_getwithref(vp);
	if (error) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);

	mp = vp->v_mount;
	if (!mp) {
		/* vnode not attached to any mount */
		error = EBADF;
		goto out;
	}

#if CONFIG_MACF
	error = mac_mount_check_stat(vfs_context_current(), mp);
	if (error != 0) {
		goto out;
	}
#endif

	/* refresh the cached statistics before snapshotting them */
	if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
		goto out;
	}

	vfs_get_statfs64(mp, &sfs);
	if ((mp->mnt_kern_flag & MNTK_SYSTEM) && !(mp->mnt_kern_flag & MNTK_SWAP_MOUNT) && !(mp->mnt_flag & MNT_ROOTFS) &&
	    (p->p_vfs_iopolicy & P_VFS_IOPOLICY_STATFS_NO_DATA_VOLUME)) {
		/* This process does not want to see a separate data volume mountpoint */
		strlcpy(&sfs.f_mntonname[0], "/", sizeof("/"));
	}
	error = copyout(&sfs, uap->buf, sizeof(sfs));

out:
	file_drop(uap->fd);
	vnode_put(vp);

	return error;
}
3127
/* Shared cursor for the getfsstat{,64} vfs_iterate callbacks. */
struct getfsstat_struct {
	user_addr_t sfsp;       /* user buffer position for the next statfs record */
	user_addr_t *mp;        /* optional array of user MAC-label buffers (or NULL) */
	int count;              /* number of mounts visited so far */
	int maxcount;           /* capacity of the user buffer, in records */
	int flags;              /* MNT_NOWAIT / MNT_WAIT / MNT_DWAIT from the caller */
	int error;              /* first error encountered, 0 if none */
};
3136
3137
/*
 * vfs_iterate() callback for getfsstat(): copy one mount's statfs record
 * (and, if requested, its MAC label) to the user buffer in fstp. Counts
 * every mount even once the buffer is full, so the caller can report the
 * total number of mounts.
 */
static int
getfsstat_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	int error, my_size;
	vfs_context_t ctx = vfs_context_current();

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(ctx, mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the
		 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
		 * Dead or unmounting mounts are skipped (cached data only).
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		/*
		 * Need to handle LP64 version of struct statfs
		 */
		error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		/* advance by the size munge_statfs actually wrote */
		fstp->sfsp += my_size;

		if (fstp->mp) {
#if CONFIG_MACF
			error = mac_mount_label_get(mp, *fstp->mp);
			if (error) {
				fstp->error = error;
				return VFS_RETURNED_DONE;
			}
#endif
			fstp->mp++;
		}
	}
	fstp->count++;
	return VFS_RETURNED;
}
3191
3192 /*
3193 * Get statistics on all filesystems.
3194 */
3195 int
3196 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
3197 {
3198 struct __mac_getfsstat_args muap;
3199
3200 muap.buf = uap->buf;
3201 muap.bufsize = uap->bufsize;
3202 muap.mac = USER_ADDR_NULL;
3203 muap.macsize = 0;
3204 muap.flags = uap->flags;
3205
3206 return __mac_getfsstat(p, &muap, retval);
3207 }
3208
/*
 * __mac_getfsstat: Get MAC-related file system statistics
 *
 * Parameters: p (ignored)
 * uap User argument descriptor (see below)
 * retval Count of file system statistics (N stats)
 *
 * Indirect: uap->bufsize Buffer size
 * uap->macsize MAC info size
 * uap->buf Buffer where information will be returned
 * uap->mac MAC info
 * uap->flags File system flags
 *
 *
 * Returns: 0 Success
 * !0 Not success
 *
 */
int
__mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
{
	user_addr_t sfsp;
	user_addr_t *mp;
	size_t count, maxcount, bufsize, macsize;
	struct getfsstat_struct fst;

	/* reject absurd sizes before doing any arithmetic with them */
	if ((unsigned)uap->bufsize > INT_MAX || (unsigned)uap->macsize > INT_MAX) {
		return EINVAL;
	}

	bufsize = (size_t) uap->bufsize;
	macsize = (size_t) uap->macsize;

	/* record size depends on the caller's ABI */
	if (IS_64BIT_PROCESS(p)) {
		maxcount = bufsize / sizeof(struct user64_statfs);
	} else {
		maxcount = bufsize / sizeof(struct user32_statfs);
	}
	sfsp = uap->buf;
	count = 0;

	mp = NULL;

#if CONFIG_MACF
	if (uap->mac != USER_ADDR_NULL) {
		u_int32_t *mp0;
		int error;
		unsigned int i;

		/* the MAC pointer array must have one entry per statfs slot */
		count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
		if (count != maxcount) {
			return EINVAL;
		}

		/* Copy in the array */
		MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
		if (mp0 == NULL) {
			return ENOMEM;
		}

		error = copyin(uap->mac, mp0, macsize);
		if (error) {
			FREE(mp0, M_MACTEMP);
			return error;
		}

		/* Normalize to an array of user_addr_t */
		MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
		if (mp == NULL) {
			FREE(mp0, M_MACTEMP);
			return ENOMEM;
		}

		/* widen 32-bit user pointers; copy 64-bit ones through */
		for (i = 0; i < count; i++) {
			if (IS_64BIT_PROCESS(p)) {
				mp[i] = ((user_addr_t *)mp0)[i];
			} else {
				mp[i] = (user_addr_t)mp0[i];
			}
		}
		FREE(mp0, M_MACTEMP);
	}
#endif


	fst.sfsp = sfsp;
	fst.mp = mp;
	fst.flags = uap->flags;
	fst.count = 0;
	fst.error = 0;
	fst.maxcount = maxcount;


	vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat_callback, &fst);

	if (mp) {
		FREE(mp, M_MACTEMP);
	}

	if (fst.error) {
		KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
		return fst.error;
	}

	/* report at most maxcount when the buffer was too small */
	if (fst.sfsp && fst.count > fst.maxcount) {
		*retval = fst.maxcount;
	} else {
		*retval = fst.count;
	}
	return 0;
}
3320
/*
 * vfs_iterate() callback for getfsstat64(): copy one mount's statfs64
 * record to the user buffer in fstp. Counts every mount even once the
 * buffer is full, so the caller can report the total number of mounts.
 */
static int
getfsstat64_callback(mount_t mp, void * arg)
{
	struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
	struct vfsstatfs *sp;
	struct statfs64 sfs;
	int error;

	if (fstp->sfsp && fstp->count < fstp->maxcount) {
#if CONFIG_MACF
		error = mac_mount_check_stat(vfs_context_current(), mp);
		if (error != 0) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
#endif
		sp = &mp->mnt_vfsstat;
		/*
		 * If MNT_NOWAIT is specified, do not refresh the fsstat
		 * cache. MNT_WAIT overrides MNT_NOWAIT.
		 *
		 * We treat MNT_DWAIT as MNT_WAIT for all instances of
		 * getfsstat, since the constants are out of the same
		 * namespace.
		 */
		if ((mp->mnt_lflag & MNT_LDEAD) ||
		    ((((fstp->flags & MNT_NOWAIT) == 0) || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
		    (!(mp->mnt_lflag & MNT_LUNMOUNT)) &&
		    (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)))) {
			KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
			return VFS_RETURNED;
		}

		vfs_get_statfs64(mp, &sfs);
		error = copyout(&sfs, fstp->sfsp, sizeof(sfs));
		if (error) {
			fstp->error = error;
			return VFS_RETURNED_DONE;
		}
		fstp->sfsp += sizeof(sfs);
	}
	fstp->count++;
	return VFS_RETURNED;
}
3365
3366 /*
3367 * Get statistics on all file systems in 64 bit mode.
3368 */
3369 int
3370 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3371 {
3372 user_addr_t sfsp;
3373 int count, maxcount;
3374 struct getfsstat_struct fst;
3375
3376 maxcount = uap->bufsize / sizeof(struct statfs64);
3377
3378 sfsp = uap->buf;
3379 count = 0;
3380
3381 fst.sfsp = sfsp;
3382 fst.flags = uap->flags;
3383 fst.count = 0;
3384 fst.error = 0;
3385 fst.maxcount = maxcount;
3386
3387 vfs_iterate(VFS_ITERATE_NOSKIP_UNMOUNT, getfsstat64_callback, &fst);
3388
3389 if (fst.error) {
3390 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3391 return fst.error;
3392 }
3393
3394 if (fst.sfsp && fst.count > fst.maxcount) {
3395 *retval = fst.maxcount;
3396 } else {
3397 *retval = fst.count;
3398 }
3399
3400 return 0;
3401 }
3402
3403 /*
3404 * gets the associated vnode with the file descriptor passed.
3405 * as input
3406 *
3407 * INPUT
3408 * ctx - vfs context of caller
3409 * fd - file descriptor for which vnode is required.
3410 * vpp - Pointer to pointer to vnode to be returned.
3411 *
3412 * The vnode is returned with an iocount so any vnode obtained
3413 * by this call needs a vnode_put
3414 *
3415 */
3416 int
3417 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3418 {
3419 int error;
3420 vnode_t vp;
3421 struct fileproc *fp;
3422 proc_t p = vfs_context_proc(ctx);
3423
3424 *vpp = NULLVP;
3425
3426 error = fp_getfvp(p, fd, &fp, &vp);
3427 if (error) {
3428 return error;
3429 }
3430
3431 error = vnode_getwithref(vp);
3432 if (error) {
3433 (void)fp_drop(p, fd, fp, 0);
3434 return error;
3435 }
3436
3437 (void)fp_drop(p, fd, fp, 0);
3438 *vpp = vp;
3439 return error;
3440 }
3441
3442 /*
3443 * Wrapper function around namei to start lookup from a directory
3444 * specified by a file descriptor ni_dirfd.
3445 *
3446 * In addition to all the errors returned by namei, this call can
 * return ENOTDIR if the file descriptor does not refer to a directory,
 * and EBADF if the file descriptor is not valid.
3449 */
int
nameiat(struct nameidata *ndp, int dirfd)
{
	/*
	 * Only consult dirfd for a fresh lookup of a relative path: a
	 * continued lookup (NAMEI_CONTLOOKUP) or a caller-supplied starting
	 * directory (USEDVP) already carries its own starting point.
	 */
	if ((dirfd != AT_FDCWD) &&
	    !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
	    !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error = 0;
		char c;

		/* Peek at the first byte of the path to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: start the lookup at dirfd's vnode (with iocount). */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must refer to a directory (ENOTDIR otherwise). */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			/*
			 * Hand the directory to namei() via USEDVP, then clear
			 * the flag so the caller's nameidata is not left
			 * pointing at the vnode we are about to release.
			 */
			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = namei(ndp);
			ndp->ni_cnd.cn_flags &= ~USEDVP;
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path, AT_FDCWD, or continued lookup: plain namei(). */
	return namei(ndp);
}
3493
3494 /*
3495 * Change current working directory to a given file descriptor.
3496 */
3497 /* ARGSUSED */
static int
common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	vnode_t vp;
	vnode_t tdp;
	vnode_t tvp;		/* previous cwd, released after the swap */
	struct mount *mp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->fd);
	if (per_thread && uap->fd == -1) {
		/*
		 * Switching back from per-thread to per process CWD; verify we
		 * in fact have one before proceeding. The only success case
		 * for this code path is to return 0 preemptively after zapping
		 * the thread structure contents.
		 */
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = NULLVP;
			if (tvp != NULLVP) {
				vnode_rele(tvp);
				return 0;
			}
		}
		return EBADF;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* The descriptor must name a directory. */
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chdir(ctx, vp);
	if (error) {
		goto out;
	}
#endif
	/* The caller needs search permission on the target directory. */
	error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
	if (error) {
		goto out;
	}

	/*
	 * If the directory is a mount point, descend through any stacked
	 * mounts so the cwd ends up at the root of the topmost file system.
	 */
	while (!error && (mp = vp->v_mountedhere) != NULL) {
		if (vfs_busy(mp, LK_NOWAIT)) {
			error = EACCES;
			goto out;
		}
		error = VFS_ROOT(mp, &tdp, ctx);
		vfs_unbusy(mp);
		if (error) {
			break;
		}
		vnode_put(vp);
		vp = tdp;
	}
	if (error) {
		goto out;
	}
	/* Take a long-term usecount for the cwd, then drop our iocount. */
	if ((error = vnode_ref(vp))) {
		goto out;
	}
	vnode_put(vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = vp;
			/* Mark the process as having at least one per-thread cwd. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			/*
			 * NOTE(review): this path returns without file_drop(),
			 * which would leak the fd reference taken above;
			 * presumably unreachable since a syscall always runs on
			 * a thread — verify.
			 */
			vnode_rele(vp);
			return ENOENT;
		}
	} else {
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = vp;
		proc_fdunlock(p);
	}

	/* Release the previous cwd, if there was one. */
	if (tvp) {
		vnode_rele(tvp);
	}
	file_drop(uap->fd);

	return 0;
out:
	vnode_put(vp);
	file_drop(uap->fd);

	return error;
}
3607
3608 int
3609 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3610 {
3611 return common_fchdir(p, uap, 0);
3612 }
3613
3614 int
3615 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3616 {
3617 return common_fchdir(p, (void *)uap, 1);
3618 }
3619
3620
3621 /*
3622 * Change current working directory (".").
3623 *
3624 * Returns: 0 Success
3625 * change_dir:ENOTDIR
3626 * change_dir:???
3627 * vnode_ref:ENOENT No such file or directory
3628 */
3629 /* ARGSUSED */
int
chdir_internal(proc_t p, vfs_context_t ctx, struct nameidata *ndp, int per_thread)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	vnode_t tvp;		/* previous cwd, released after the swap */

	/* Resolve the path; on success ni_vp is a directory with an iocount. */
	error = change_dir(ndp, ctx);
	if (error) {
		return error;
	}
	/* Convert the transient iocount into a long-term usecount. */
	if ((error = vnode_ref(ndp->ni_vp))) {
		vnode_put(ndp->ni_vp);
		return error;
	}
	/*
	 * drop the iocount we picked up in change_dir
	 */
	vnode_put(ndp->ni_vp);

	if (per_thread) {
		thread_t th = vfs_context_thread(ctx);
		if (th) {
			uthread_t uth = get_bsdthread_info(th);
			tvp = uth->uu_cdir;
			uth->uu_cdir = ndp->ni_vp;
			/* Mark the process as having at least one per-thread cwd. */
			OSBitOrAtomic(P_THCWD, &p->p_flag);
		} else {
			vnode_rele(ndp->ni_vp);
			return ENOENT;
		}
	} else {
		proc_fdlock(p);
		tvp = fdp->fd_cdir;
		fdp->fd_cdir = ndp->ni_vp;
		proc_fdunlock(p);
	}

	/* Release the previous cwd, if any. */
	if (tvp) {
		vnode_rele(tvp);
	}

	return 0;
}
3674
3675
3676 /*
3677 * Change current working directory (".").
3678 *
3679 * Returns: 0 Success
3680 * chdir_internal:ENOTDIR
3681 * chdir_internal:ENOENT No such file or directory
3682 * chdir_internal:???
3683 */
3684 /* ARGSUSED */
3685 static int
3686 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3687 {
3688 struct nameidata nd;
3689 vfs_context_t ctx = vfs_context_current();
3690
3691 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3692 UIO_USERSPACE, uap->path, ctx);
3693
3694 return chdir_internal(p, ctx, &nd, per_thread);
3695 }
3696
3697
3698 /*
3699 * chdir
3700 *
3701 * Change current working directory (".") for the entire process
3702 *
3703 * Parameters: p Process requesting the call
3704 * uap User argument descriptor (see below)
3705 * retval (ignored)
3706 *
3707 * Indirect parameters: uap->path Directory path
3708 *
3709 * Returns: 0 Success
3710 * common_chdir: ENOTDIR
3711 * common_chdir: ENOENT No such file or directory
3712 * common_chdir: ???
3713 *
3714 */
3715 int
3716 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3717 {
3718 return common_chdir(p, (void *)uap, 0);
3719 }
3720
3721 /*
3722 * __pthread_chdir
3723 *
3724 * Change current working directory (".") for a single thread
3725 *
3726 * Parameters: p Process requesting the call
3727 * uap User argument descriptor (see below)
3728 * retval (ignored)
3729 *
3730 * Indirect parameters: uap->path Directory path
3731 *
3732 * Returns: 0 Success
3733 * common_chdir: ENOTDIR
3734 * common_chdir: ENOENT No such file or directory
3735 * common_chdir: ???
3736 *
3737 */
3738 int
3739 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3740 {
3741 return common_chdir(p, (void *)uap, 1);
3742 }
3743
3744
3745 /*
3746 * Change notion of root (``/'') directory.
3747 */
3748 /* ARGSUSED */
int
chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int error;
	struct nameidata nd;
	vnode_t tvp;		/* previous root, released after the swap */
	vfs_context_t ctx = vfs_context_current();

	/* Changing the root directory is restricted to the superuser. */
	if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
		return error;
	}

	NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	/* On success ni_vp is an authorized directory with an iocount. */
	error = change_dir(&nd, ctx);
	if (error) {
		return error;
	}

#if CONFIG_MACF
	error = mac_vnode_check_chroot(ctx, nd.ni_vp,
	    &nd.ni_cnd);
	if (error) {
		vnode_put(nd.ni_vp);
		return error;
	}
#endif

	/* Swap the iocount from change_dir for a long-term usecount. */
	if ((error = vnode_ref(nd.ni_vp))) {
		vnode_put(nd.ni_vp);
		return error;
	}
	vnode_put(nd.ni_vp);

	/* Install the new root under the fd lock and flag the chroot. */
	proc_fdlock(p);
	tvp = fdp->fd_rdir;
	fdp->fd_rdir = nd.ni_vp;
	fdp->fd_flags |= FD_CHROOT;
	proc_fdunlock(p);

	/* Release the previous root, if one was set. */
	if (tvp != NULL) {
		vnode_rele(tvp);
	}

	return 0;
}
3796
3797 /*
3798 * Common routine for chroot and chdir.
3799 *
3800 * Returns: 0 Success
3801 * ENOTDIR Not a directory
3802 * namei:??? [anything namei can return]
3803 * vnode_authorize:??? [anything vnode_authorize can return]
3804 */
3805 static int
3806 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3807 {
3808 vnode_t vp;
3809 int error;
3810
3811 if ((error = namei(ndp))) {
3812 return error;
3813 }
3814 nameidone(ndp);
3815 vp = ndp->ni_vp;
3816
3817 if (vp->v_type != VDIR) {
3818 vnode_put(vp);
3819 return ENOTDIR;
3820 }
3821
3822 #if CONFIG_MACF
3823 error = mac_vnode_check_chdir(ctx, vp);
3824 if (error) {
3825 vnode_put(vp);
3826 return error;
3827 }
3828 #endif
3829
3830 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3831 if (error) {
3832 vnode_put(vp);
3833 return error;
3834 }
3835
3836 return error;
3837 }
3838
/*
 * Allocate the per-file-descriptor vnode data used for directories.
 */
struct fd_vn_data *
fg_vn_data_alloc(void)
{
	struct fd_vn_data *fvdata;

	/* Allocate per fd vnode data */
	MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
	    M_FD_VN_DATA, M_WAITOK | M_ZERO);
	/* M_WAITOK allocations block rather than fail, so no NULL check here. */
	lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
	return fvdata;
}
3853
3854 /*
3855 * Free the vnode data (for directories) associated with the file glob.
3856 */
void
fg_vn_data_free(void *fgvndata)
{
	struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;

	/* Release the cached directory-read buffer, if one was allocated. */
	if (fvdata->fv_buf) {
		FREE(fvdata->fv_buf, M_FD_DIRBUF);
	}
	lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
	FREE(fvdata, M_FD_VN_DATA);
}
3868
3869 /*
3870 * Check permissions, allocate an open file structure,
3871 * and call the device open routine if any.
3872 *
3873 * Returns: 0 Success
3874 * EINVAL
3875 * EINTR
3876 * falloc:ENFILE
3877 * falloc:EMFILE
3878 * falloc:ENOMEM
3879 * vn_open_auth:???
3880 * dupfdopen:???
3881 * VNOP_ADVLOCK:???
3882 * vnode_setsize:???
3883 *
3884 * XXX Need to implement uid, gid
3885 */
int
open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
    int32_t *retval)
{
	proc_t p = vfs_context_proc(ctx);
	uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
	struct fileproc *fp;
	vnode_t vp;
	int flags, oflags;
	int type, indx, error;
	struct flock lf;
	struct vfs_context context;

	oflags = uflags;

	/* O_ACCMODE fully set (O_RDONLY|O_WRONLY|O_RDWR together) is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE) {
		return EINVAL;
	}

	/* Convert open(2) flags to kernel fflags; callers may not request
	 * the encryption flags directly, they are set by vn_open_auth. */
	flags = FFLAGS(uflags);
	CLR(flags, FENCRYPTED);
	CLR(flags, FUNENCRYPTED);

	AUDIT_ARG(fflags, oflags);
	AUDIT_ARG(mode, vap->va_mode);

	/* Reserve a descriptor slot and fileproc up front. */
	if ((error = falloc_withalloc(p,
	    &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
		return error;
	}
	/* Sentinel used by fdopen() (/dev/fd) to find the reserved slot. */
	uu->uu_dupfd = -indx - 1;

	if ((error = vn_open_auth(ndp, &flags, vap))) {
		if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)) {	/* XXX from fdopen */
			/* /dev/fd open: dup the existing descriptor instead. */
			if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
				fp_drop(p, indx, NULL, 0);
				*retval = indx;
				return 0;
			}
		}
		if (error == ERESTART) {
			error = EINTR;
		}
		fp_free(p, indx, fp);
		return error;
	}
	uu->uu_dupfd = 0;
	vp = ndp->ni_vp;

	/* Wire the vnode into the fileglob; vp holds an iocount from vn_open_auth. */
	fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
	fp->f_fglob->fg_ops = &vnops;
	fp->f_fglob->fg_data = (caddr_t)vp;

	/* BSD-style whole-file advisory lock requested at open time. */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK) {
			lf.l_type = F_WRLCK;
		} else {
			lf.l_type = F_RDLCK;
		}
		type = F_FLOCK;
		if ((flags & FNONBLOCK) == 0) {
			type |= F_WAIT;
		}
#if CONFIG_MACF
		error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
		    F_SETLK, &lf);
		if (error) {
			goto bad;
		}
#endif
		if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL))) {
			goto bad;
		}
		fp->f_fglob->fg_flag |= FHASLOCK;
	}

	/* try to truncate by setting the size attribute */
	if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0)) {
		goto bad;
	}

	/*
	 * For directories we hold some additional information in the fd.
	 */
	if (vnode_vtype(vp) == VDIR) {
		fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
	} else {
		fp->f_fglob->fg_vn_data = NULL;
	}

	/* Drop our iocount; NOTE(review): vp is still dereferenced below for
	 * the tty check — presumably safe because the fileglob keeps the
	 * vnode referenced after vn_open_auth — verify. */
	vnode_put(vp);

	/*
	 * The first terminal open (without a O_NOCTTY) by a session leader
	 * results in it being set as the controlling terminal.
	 */
	if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
	    !(flags & O_NOCTTY)) {
		int tmp = 0;

		(void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
		    (caddr_t)&tmp, ctx);
	}

	/* Publish the descriptor: set per-fd flags, then release the slot. */
	proc_fdlock(p);
	if (flags & O_CLOEXEC) {
		*fdflags(p, indx) |= UF_EXCLOSE;
	}
	if (flags & O_CLOFORK) {
		*fdflags(p, indx) |= UF_FORKCLOSE;
	}
	procfdtbl_releasefd(p, indx, NULL);

#if CONFIG_SECLUDED_MEMORY
	/* Tune the file's eligibility for the secluded page pool. */
	if (secluded_for_filecache &&
	    FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
	    vnode_vtype(vp) == VREG) {
		memory_object_control_t moc;

		moc = ubc_getobject(vp, UBC_FLAGS_NONE);

		if (moc == MEMORY_OBJECT_CONTROL_NULL) {
			/* nothing to do... */
		} else if (fp->f_fglob->fg_flag & FWRITE) {
			/* writable -> no longer eligible for secluded pages */
			memory_object_mark_eligible_for_secluded(moc,
			    FALSE);
		} else if (secluded_for_filecache == 1) {
			char pathname[32] = { 0, };
			size_t copied;
			/* XXX FBDP: better way to detect /Applications/ ? */
			if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
				(void)copyinstr(ndp->ni_dirp,
				    pathname,
				    sizeof(pathname),
				    &copied);
			} else {
				copystr(CAST_DOWN(void *, ndp->ni_dirp),
				    pathname,
				    sizeof(pathname),
				    &copied);
			}
			pathname[sizeof(pathname) - 1] = '\0';
			if (strncmp(pathname,
			    "/Applications/",
			    strlen("/Applications/")) == 0 &&
			    strncmp(pathname,
			    "/Applications/Camera.app/",
			    strlen("/Applications/Camera.app/")) != 0) {
				/*
				 * not writable
				 * AND from "/Applications/"
				 * AND not from "/Applications/Camera.app/"
				 * ==> eligible for secluded
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    TRUE);
			}
		} else if (secluded_for_filecache == 2) {
#if __arm64__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
#elif __arm__
#define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
#else
/* not implemented... */
#endif
			size_t len = strlen(vp->v_name);
			if (!strncmp(vp->v_name, DYLD_SHARED_CACHE_NAME, len) ||
			    !strncmp(vp->v_name, "dyld", len) ||
			    !strncmp(vp->v_name, "launchd", len) ||
			    !strncmp(vp->v_name, "Camera", len) ||
			    !strncmp(vp->v_name, "mediaserverd", len) ||
			    !strncmp(vp->v_name, "SpringBoard", len) ||
			    !strncmp(vp->v_name, "backboardd", len)) {
				/*
				 * This file matters when launching Camera:
				 * do not store its contents in the secluded
				 * pool that will be drained on Camera launch.
				 */
				memory_object_mark_eligible_for_secluded(moc,
				    FALSE);
			}
		}
	}
#endif /* CONFIG_SECLUDED_MEMORY */

	fp_drop(p, indx, fp, 1);
	proc_fdunlock(p);

	*retval = indx;

	return 0;
bad:
	/* Failure after vn_open_auth: unlock, close, and free the fd slot. */
	context = *vfs_context_current();
	context.vc_ucred = fp->f_fglob->fg_cred;

	if ((fp->f_fglob->fg_flag & FHASLOCK) &&
	    (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;

		(void)VNOP_ADVLOCK(
			vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
	}

	vn_close(vp, fp->f_fglob->fg_flag, &context);
	vnode_put(vp);
	fp_free(p, indx, fp);

	return error;
}
4103
4104 /*
4105 * While most of the *at syscall handlers can call nameiat() which
4106 * is a wrapper around namei, the use of namei and initialisation
4107 * of nameidata are far removed and in different functions - namei
4108 * gets called in vn_open_auth for open1. So we'll just do here what
4109 * nameiat() does.
4110 */
static int
open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
    struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
    int dirfd)
{
	/*
	 * Mirror nameiat(): when a dirfd is supplied and the path is
	 * relative, seed the lookup with dirfd's vnode via USEDVP.
	 */
	if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
		int error;
		char c;

		/* Peek at the first path byte to detect an absolute path. */
		if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
			error = copyin(ndp->ni_dirp, &c, sizeof(char));
			if (error) {
				return error;
			}
		} else {
			c = *((char *)(ndp->ni_dirp));
		}

		if (c != '/') {
			vnode_t dvp_at;

			/* Relative path: start the lookup at dirfd's vnode. */
			error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
			    &dvp_at);
			if (error) {
				return error;
			}

			/* dirfd must refer to a directory. */
			if (vnode_vtype(dvp_at) != VDIR) {
				vnode_put(dvp_at);
				return ENOTDIR;
			}

			ndp->ni_dvp = dvp_at;
			ndp->ni_cnd.cn_flags |= USEDVP;
			error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
			    retval);
			vnode_put(dvp_at);
			return error;
		}
	}

	/* Absolute path or AT_FDCWD: plain open1(). */
	return open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval);
}
4154
4155 /*
4156 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
4157 *
4158 * Parameters: p Process requesting the open
4159 * uap User argument descriptor (see below)
4160 * retval Pointer to an area to receive the
 * return value from the system call
4162 *
4163 * Indirect: uap->path Path to open (same as 'open')
4164 * uap->flags Flags to open (same as 'open'
4165 * uap->uid UID to set, if creating
4166 * uap->gid GID to set, if creating
4167 * uap->mode File mode, if creating (same as 'open')
4168 * uap->xsecurity ACL to set, if creating
4169 *
4170 * Returns: 0 Success
4171 * !0 errno value
4172 *
4173 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4174 *
 * XXX: We should enumerate the possible errno values here, and where
4176 * in the code they originated.
4177 */
int
open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	int ciferror;
	kauth_filesec_t xsecdst;
	struct vnode_attr va;
	struct nameidata nd;
	int cmode;

	AUDIT_ARG(owner, uap->uid, uap->gid);

	xsecdst = NULL;
	/* Copy in the caller-supplied ACL, if any (kept in host byte order). */
	if ((uap->xsecurity != USER_ADDR_NULL) &&
	    ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
		return ciferror;
	}

	VATTR_INIT(&va);
	/* Apply the process umask and strip the sticky bit from the create mode. */
	cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}
	if (xsecdst != NULL) {
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
	}

	NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, vfs_context_current());

	ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
	    fileproc_alloc_init, NULL, retval);
	/* open1 copies what it needs from va; the filesec can go now. */
	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}

	return ciferror;
}
4220
4221 /*
4222 * Go through the data-protected atomically controlled open (2)
4223 *
4224 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
4225 */
4226 int
4227 open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval)
4228 {
4229 int flags = uap->flags;
4230 int class = uap->class;
4231 int dpflags = uap->dpflags;
4232
4233 /*
4234 * Follow the same path as normal open(2)
4235 * Look up the item if it exists, and acquire the vnode.
4236 */
4237 struct filedesc *fdp = p->p_fd;
4238 struct vnode_attr va;
4239 struct nameidata nd;
4240 int cmode;
4241 int error;
4242
4243 VATTR_INIT(&va);
4244 /* Mask off all but regular access permissions */
4245 cmode = ((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4246 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4247
4248 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
4249 uap->path, vfs_context_current());
4250
4251 /*
4252 * Initialize the extra fields in vnode_attr to pass down our
4253 * extra fields.
4254 * 1. target cprotect class.
4255 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
4256 */
4257 if (flags & O_CREAT) {
4258 /* lower level kernel code validates that the class is valid before applying it. */
4259 if (class != PROTECTION_CLASS_DEFAULT) {
4260 /*
4261 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
4262 * file behave the same as open (2)
4263 */
4264 VATTR_SET(&va, va_dataprotect_class, class);
4265 }
4266 }
4267
4268 if (dpflags & (O_DP_GETRAWENCRYPTED | O_DP_GETRAWUNENCRYPTED)) {
4269 if (flags & (O_RDWR | O_WRONLY)) {
4270 /* Not allowed to write raw encrypted bytes */
4271 return EINVAL;
4272 }
4273 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
4274 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
4275 }
4276 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
4277 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
4278 }
4279 }
4280
4281 error = open1(vfs_context_current(), &nd, uap->flags, &va,
4282 fileproc_alloc_init, NULL, retval);
4283
4284 return error;
4285 }
4286
4287 static int
4288 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
4289 int fd, enum uio_seg segflg, int *retval)
4290 {
4291 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
4292 struct vnode_attr va;
4293 struct nameidata nd;
4294 int cmode;
4295
4296 VATTR_INIT(&va);
4297 /* Mask off all but regular access permissions */
4298 cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
4299 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
4300
4301 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
4302 segflg, path, ctx);
4303
4304 return open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
4305 retval, fd);
4306 }
4307
4308 int
4309 open(proc_t p, struct open_args *uap, int32_t *retval)
4310 {
4311 __pthread_testcancel(1);
4312 return open_nocancel(p, (struct open_nocancel_args *)uap, retval);
4313 }
4314
4315 int
4316 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
4317 int32_t *retval)
4318 {
4319 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4320 uap->mode, AT_FDCWD, UIO_USERSPACE, retval);
4321 }
4322
4323 int
4324 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
4325 int32_t *retval)
4326 {
4327 return openat_internal(vfs_context_current(), uap->path, uap->flags,
4328 uap->mode, uap->fd, UIO_USERSPACE, retval);
4329 }
4330
4331 int
4332 openat(proc_t p, struct openat_args *uap, int32_t *retval)
4333 {
4334 __pthread_testcancel(1);
4335 return openat_nocancel(p, (struct openat_nocancel_args *)uap, retval);
4336 }
4337
4338 /*
4339 * openbyid_np: open a file given a file system id and a file system object id
4340 * the hfs file system object id is an fsobj_id_t {uint32, uint32}
4341 * file systems that don't support object ids it is a node id (uint64_t).
4342 *
4343 * Parameters: p Process requesting the open
4344 * uap User argument descriptor (see below)
4345 * retval Pointer to an area to receive the
 * return value from the system call
4347 *
4348 * Indirect: uap->path Path to open (same as 'open')
4349 *
4350 * uap->fsid id of target file system
4351 * uap->objid id of target file system object
4352 * uap->flags Flags to open (same as 'open')
4353 *
4354 * Returns: 0 Success
4355 * !0 errno value
4356 *
4357 *
 * XXX: We should enumerate the possible errno values here, and where
4359 * in the code they originated.
4360 */
int
openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
{
	fsid_t fsid;
	uint64_t objid;
	int error;
	char *buf = NULL;
	int buflen = MAXPATHLEN;
	int pathlen = 0;
	vfs_context_t ctx = vfs_context_current();

	/* Opening by <fsid, objid> requires a dedicated privilege. */
	if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
		return error;
	}

	if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}

	/* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
	if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
		return error;
	}

	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);

	/*
	 * Resolve a path from <fsid, objid>, growing the buffer by
	 * MAXPATHLEN on each ENOSPC until the full path fits.
	 * NOTE(review): the retry loop has no upper bound on buflen;
	 * presumably fsgetpath_internal caps path lengths — verify.
	 */
	do {
		MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
		if (buf == NULL) {
			return ENOMEM;
		}

		error = fsgetpath_internal( ctx, fsid.val[0], objid, buflen,
		    buf, FSOPT_ISREALFSID, &pathlen);

		if (error) {
			FREE(buf, M_TEMP);
			buf = NULL;
		}
	} while (error == ENOSPC && (buflen += MAXPATHLEN));

	if (error) {
		return error;
	}

	/* NUL-terminate the resolved path. */
	buf[pathlen] = 0;

	/* Open the resolved path as a kernel-space string. */
	error = openat_internal(
		ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);

	FREE(buf, M_TEMP);

	return error;
}
4417
4418
4419 /*
4420 * Create a special file.
4421 */
4422 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4423
int
mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;

	VATTR_INIT(&va);
	VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
	VATTR_SET(&va, va_rdev, uap->dev);

	/* If it's a mknod() of a FIFO, call mkfifo1() instead */
	if ((uap->mode & S_IFMT) == S_IFIFO) {
		return mkfifo1(ctx, uap->path, &va);
	}

	AUDIT_ARG(mode, uap->mode);
	AUDIT_ARG(value32, uap->dev);

	/* Creating device nodes is restricted to the superuser. */
	if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		return error;
	}
	NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* The target must not already exist. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	/* Only char and block devices are valid here (FIFOs handled above). */
	switch (uap->mode & S_IFMT) {
	case S_IFCHR:
		VATTR_SET(&va, va_type, VCHR);
		break;
	case S_IFBLK:
		VATTR_SET(&va, va_type, VBLK);
		break;
	default:
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    nd.ni_dvp, &nd.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif

	/* The caller needs add-file permission in the parent directory. */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) {
		goto out;
	}

	if (vp) {
		int update_flags = 0;

		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4526
4527 /*
4528 * Create a named pipe.
4529 *
4530 * Returns: 0 Success
4531 * EEXIST
4532 * namei:???
4533 * vnode_authorize:???
4534 * vn_create:???
4535 */
static int
mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;

	/* Look up the path, keeping the parent locked for the create. */
	NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
	    UIO_USERSPACE, upath, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* check that this is a new file and authorize addition */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}
	VATTR_SET(vap, va_type, VFIFO);

	if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		goto out;
	}

	/* On success vp is returned with an iocount, released below. */
	error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);

	return error;
}
4578
4579
4580 /*
4581 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4582 *
4583 * Parameters: p Process requesting the open
4584 * uap User argument descriptor (see below)
4585 * retval (Ignored)
4586 *
4587 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4588 * uap->uid UID to set
4589 * uap->gid GID to set
4590 * uap->mode File mode to set (same as 'mkfifo')
4591 * uap->xsecurity ACL to set, if creating
4592 *
4593 * Returns: 0 Success
4594 * !0 errno value
4595 *
4596 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4597 *
 * XXX: We should enumerate the possible errno values here, and where
4599 * in the code they originated.
4600 */
4601 int
4602 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4603 {
4604 int ciferror;
4605 kauth_filesec_t xsecdst;
4606 struct vnode_attr va;
4607
4608 AUDIT_ARG(owner, uap->uid, uap->gid);
4609
4610 xsecdst = KAUTH_FILESEC_NONE;
4611 if (uap->xsecurity != USER_ADDR_NULL) {
4612 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
4613 return ciferror;
4614 }
4615 }
4616
4617 VATTR_INIT(&va);
4618 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4619 if (uap->uid != KAUTH_UID_NONE) {
4620 VATTR_SET(&va, va_uid, uap->uid);
4621 }
4622 if (uap->gid != KAUTH_GID_NONE) {
4623 VATTR_SET(&va, va_gid, uap->gid);
4624 }
4625 if (xsecdst != KAUTH_FILESEC_NONE) {
4626 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4627 }
4628
4629 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4630
4631 if (xsecdst != KAUTH_FILESEC_NONE) {
4632 kauth_filesec_free(xsecdst);
4633 }
4634 return ciferror;
4635 }
4636
4637 /* ARGSUSED */
4638 int
4639 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4640 {
4641 struct vnode_attr va;
4642
4643 VATTR_INIT(&va);
4644 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4645
4646 return mkfifo1(vfs_context_current(), uap->path, &va);
4647 }
4648
4649
/*
 * Return a pointer to the last occurrence of 'ch' in the NUL-terminated
 * string 'p', or NULL if it does not occur.  Like strrchr(), searching
 * for '\0' finds the terminator itself.
 */
static char *
my_strrchr(char *p, int ch)
{
	char *last = NULL;

	do {
		if (*p == ch) {
			last = p;
		}
	} while (*p++);

	return last;
}
4665
4666 extern int safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink);
4667 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4668 extern int safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4669
/*
 * Build the path to dvp (with leafname appended, when non-NULL) into the
 * caller-supplied buffer 'path' of size _len.  Returns the length of the
 * resulting string including its NUL.  *truncated_path is set to 1 whenever
 * the returned string does not fully identify the object (truncation, or a
 * fallback to an ancestor/mount-point path after a lookup failure).
 * 'firmlink' selects vn_getpath() vs. vn_getpath_no_firmlink().
 */
int
safe_getpath_new(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path, int firmlink)
{
	int ret, len = _len;

	*truncated_path = 0;

	if (firmlink) {
		ret = vn_getpath(dvp, path, &len);
	} else {
		ret = vn_getpath_no_firmlink(dvp, path, &len);
	}
	/* on success, len counts the directory string including its NUL */
	if (ret == 0 && len < (MAXPATHLEN - 1)) {
		if (leafname) {
			/* overwrite the NUL with '/', then append the leaf after it */
			path[len - 1] = '/';
			len += strlcpy(&path[len], leafname, MAXPATHLEN - len) + 1;
			if (len > MAXPATHLEN) {
				char *ptr;

				// the string got truncated!
				*truncated_path = 1;
				ptr = my_strrchr(path, '/');
				if (ptr) {
					*ptr = '\0'; // chop off the string at the last directory component
				}
				len = strlen(path) + 1;
			}
		}
	} else if (ret == 0) {
		/* directory path alone (nearly) fills the buffer: no room for a leaf */
		*truncated_path = 1;
	} else if (ret != 0) {
		struct vnode *mydvp = dvp;

		if (ret != ENOSPC) {
			printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
			    dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
		}
		*truncated_path = 1;

		/*
		 * Walk up the parent chain retrying until some ancestor's path
		 * fits, falling back to the mount point, and finally to "/".
		 */
		do {
			if (mydvp->v_parent != NULL) {
				mydvp = mydvp->v_parent;
			} else if (mydvp->v_mount) {
				strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
				break;
			} else {
				// no parent and no mount point?  only thing is to punt and say "/" changed
				strlcpy(path, "/", _len);
				len = 2;
				mydvp = NULL;
			}

			if (mydvp == NULL) {
				break;
			}

			len = _len;
			if (firmlink) {
				ret = vn_getpath(mydvp, path, &len);
			} else {
				ret = vn_getpath_no_firmlink(mydvp, path, &len);
			}
		} while (ret == ENOSPC);
	}

	return len;
}
4737
/* safe_getpath_new() with firmlink-crossing paths allowed (vn_getpath). */
int
safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 1);
}
4743
/* safe_getpath_new() restricted to non-firmlink paths (vn_getpath_no_firmlink). */
int
safe_getpath_no_firmlink(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
{
	return safe_getpath_new(dvp, leafname, path, _len, truncated_path, 0);
}
4749
4750 /*
4751 * Make a hard file link.
4752 *
4753 * Returns: 0 Success
4754 * EPERM
4755 * EEXIST
4756 * EXDEV
4757 * namei:???
4758 * vnode_authorize:???
4759 * VNOP_LINK:???
4760 */
/* ARGSUSED */
static int
linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
    user_addr_t link, int flag, enum uio_seg segflg)
{
	vnode_t vp, pvp, dvp, lvp;
	struct nameidata nd;
	int follow;
	int error;
#if CONFIG_FSE
	fse_info finfo;
#endif
	int need_event, has_listeners, need_kpath2;
	char *target_path = NULL;
	int truncated = 0;

	vp = dvp = lvp = NULLVP;

	/* look up the object we are linking to */
	follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
	NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
	    segflg, path, ctx);

	error = nameiat(&nd, fd1);
	if (error) {
		if (error == EPERM) {
			/* NOTE(review): the "XXX 54841485" printfs throughout this
			 * function appear to be temporary tracing for a bug report;
			 * confirm before relying on them. */
			printf("XXX 54841485: nameiat() src EPERM\n");
		}
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/*
	 * Normally, linking to directories is not supported.
	 * However, some file systems may have limited support.
	 */
	if (vp->v_type == VDIR) {
		if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
			error = EPERM;   /* POSIX */
			printf("XXX 54841485: VDIR EPERM\n");
			goto out;
		}

		/* Linking to a directory requires ownership. */
		if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
			struct vnode_attr dva;

			VATTR_INIT(&dva);
			VATTR_WANTED(&dva, va_uid);
			if (vnode_getattr(vp, &dva, ctx) != 0 ||
			    !VATTR_IS_SUPPORTED(&dva, va_uid) ||
			    (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
				error = EACCES;
				goto out;
			}
		}
	}

	/* lookup the target node */
#if CONFIG_TRIGGERS
	nd.ni_op = OP_LINK;
#endif
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
	nd.ni_dirp = link;
	error = nameiat(&nd, fd2);
	if (error != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: nameiat() dst EPERM\n");
		}
		goto out;
	}
	dvp = nd.ni_dvp;
	lvp = nd.ni_vp;

#if CONFIG_MACF
	if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: mac_vnode_check_link() EPERM\n");
		}
		goto out2;
	}
#endif

	/* or to anything that kauth doesn't want us to (eg. immutable items) */
	if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: vnode_authorize() LINKTARGET EPERM\n");
		}
		goto out2;
	}

	/* target node must not exist */
	if (lvp != NULLVP) {
		error = EEXIST;
		goto out2;
	}
	/* cannot link across mountpoints */
	if (vnode_mount(vp) != vnode_mount(dvp)) {
		error = EXDEV;
		goto out2;
	}

	/* authorize creation of the target node */
	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		if (error == EPERM) {
			printf("XXX 54841485: vnode_authorize() ADD_FILE EPERM\n");
		}
		goto out2;
	}

	/* and finally make the link */
	error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
	if (error) {
		if (error == EPERM) {
			printf("XXX 54841485: VNOP_LINK() EPERM\n");
		}
		goto out2;
	}

#if CONFIG_MACF
	(void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
#endif

#if CONFIG_FSE
	need_event = need_fsevent(FSE_CREATE_FILE, dvp);
#else
	need_event = 0;
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();

	need_kpath2 = 0;
#if CONFIG_AUDIT
	if (AUDIT_RECORD_EXISTS()) {
		need_kpath2 = 1;
	}
#endif

	/* path strings are only needed for fsevents, fileop listeners or audit */
	if (need_event || has_listeners || need_kpath2) {
		char *link_to_path = NULL;
		int len, link_name_len;

		/* build the path to the new link file */
		GET_PATH(target_path);
		if (target_path == NULL) {
			error = ENOMEM;
			goto out2;
		}

		len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);

		AUDIT_ARG(kpath, target_path, ARG_KPATH2);

		if (has_listeners) {
			/* build the path to file we are linking to */
			GET_PATH(link_to_path);
			if (link_to_path == NULL) {
				error = ENOMEM;
				goto out2;
			}

			link_name_len = MAXPATHLEN;
			if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
				/*
				 * Call out to allow 3rd party notification of rename.
				 * Ignore result of kauth_authorize_fileop call.
				 */
				kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
				    (uintptr_t)link_to_path,
				    (uintptr_t)target_path);
			}
			if (link_to_path != NULL) {
				RELEASE_PATH(link_to_path);
			}
		}
#if CONFIG_FSE
		if (need_event) {
			/* construct fsevent */
			if (get_fse_info(vp, &finfo, ctx) == 0) {
				if (truncated) {
					finfo.mode |= FSE_TRUNCATED_PATH;
				}

				// build the path to the destination of the link
				add_fsevent(FSE_CREATE_FILE, ctx,
				    FSE_ARG_STRING, len, target_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}

			pvp = vp->v_parent;
			// need an iocount on pvp in this case
			if (pvp && pvp != dvp) {
				error = vnode_get(pvp);
				if (error) {
					/* best-effort event: drop pvp, don't fail the link */
					pvp = NULLVP;
					error = 0;
				}
			}
			if (pvp) {
				add_fsevent(FSE_STAT_CHANGED, ctx,
				    FSE_ARG_VNODE, pvp, FSE_ARG_DONE);
			}
			if (pvp && pvp != dvp) {
				vnode_put(pvp);
			}
		}
#endif
	}
out2:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	if (target_path != NULL) {
		RELEASE_PATH(target_path);
	}
out:
	if (lvp) {
		vnode_put(lvp);
	}
	if (dvp) {
		vnode_put(dvp);
	}
	/* vp always holds an iocount here: all failures before it was set return directly */
	vnode_put(vp);
	return error;
}
4991
4992 int
4993 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4994 {
4995 return linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4996 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE);
4997 }
4998
4999 int
5000 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
5001 {
5002 if (uap->flag & ~AT_SYMLINK_FOLLOW) {
5003 return EINVAL;
5004 }
5005
5006 return linkat_internal(vfs_context_current(), uap->fd1, uap->path,
5007 uap->fd2, uap->link, uap->flag, UIO_USERSPACE);
5008 }
5009
5010 /*
5011 * Make a symbolic link.
5012 *
5013 * We could add support for ACLs here too...
5014 */
/* ARGSUSED */
static int
symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
    user_addr_t link, enum uio_seg segflg)
{
	struct vnode_attr va;
	char *path;
	int error;
	struct nameidata nd;
	vnode_t vp, dvp;
	size_t dummy = 0;
	proc_t p;

	error = 0;
	/* copy the symlink *contents* into a kernel buffer when they are in user space */
	if (UIO_SEG_IS_USER_SPACE(segflg)) {
		MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
		error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
	} else {
		path = (char *)path_data;
	}
	if (error) {
		goto out;
	}
	AUDIT_ARG(text, path);  /* This is the link string */

	/* look up (and lock) the parent directory of the symlink being created */
	NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
	    segflg, link, ctx);

	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* new symlinks get ACCESSPERMS filtered through the process umask */
	p = vfs_context_proc(ctx);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, VLNK);
	VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);

#if CONFIG_MACF
	error = mac_vnode_check_create(ctx,
	    dvp, &nd.ni_cnd, &va);
#endif
	if (error != 0) {
		goto skipit;
	}

	/* the target name must not already exist */
	if (vp != NULL) {
		error = EEXIST;
		goto skipit;
	}

	/* authorize */
	if (error == 0) {
		error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
	}
	/* get default ownership, etc. */
	if (error == 0) {
		error = vnode_authattr_new(dvp, &va, 0, ctx);
	}
	if (error == 0) {
		/* note: the filesystem may or may not return a vnode in vp */
		error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
	}

	/* do fallback attribute handling */
	if (error == 0 && vp) {
		error = vnode_setattr_fallback(vp, &va, ctx);
	}

#if CONFIG_MACF
	if (error == 0 && vp) {
		error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
	}
#endif

	if (error == 0) {
		int update_flags = 0;

		/*check if a new vnode was created, else try to get one*/
		if (vp == NULL) {
			/* redrive the lookup to obtain the vnode the FS just created */
			nd.ni_cnd.cn_nameiop = LOOKUP;
#if CONFIG_TRIGGERS
			nd.ni_op = OP_LOOKUP;
#endif
			nd.ni_cnd.cn_flags = 0;
			error = nameiat(&nd, fd);
			vp = nd.ni_vp;

			if (vp == NULL) {
				goto skipit;
			}
		}

#if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
		/* call out to allow 3rd party notification of rename.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (kauth_authorize_fileop_has_listeners() &&
		    namei(&nd) == 0) {
			char *new_link_path = NULL;
			int len;

			/* build the path to the new link file */
			new_link_path = get_pathbuff();
			len = MAXPATHLEN;
			vn_getpath(dvp, new_link_path, &len);
			if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
				new_link_path[len - 1] = '/';
				strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN - len);
			}

			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
			    (uintptr_t)path, (uintptr_t)new_link_path);
			if (new_link_path != NULL) {
				release_pathbuff(new_link_path);
			}
		}
#endif
		// Make sure the name & parent pointers are hooked up
		if (vp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (vp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
		}

#if CONFIG_FSE
		add_fsevent(FSE_CREATE_FILE, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
#endif
	}

skipit:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	vnode_put(dvp);
out:
	/* free the link-contents buffer only if we allocated it above */
	if (path && (path != (char *)path_data)) {
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
	}

	return error;
}
5171
5172 int
5173 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
5174 {
5175 return symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
5176 uap->link, UIO_USERSPACE);
5177 }
5178
5179 int
5180 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
5181 __unused int32_t *retval)
5182 {
5183 return symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
5184 uap->path2, UIO_USERSPACE);
5185 }
5186
5187 /*
5188 * Delete a whiteout from the filesystem.
5189 * No longer supported.
5190 */
int
undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;  /* whiteout removal support has been removed entirely */
}
5196
5197 /*
5198 * Delete a name from the filesystem.
5199 */
/* ARGSUSED */
static int
unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
    user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
{
	struct nameidata nd;
	vnode_t vp, dvp;
	int error;
	struct componentname *cnp;
	char *path = NULL;               /* full (possibly firmlink) path, for fileop listeners */
	char *no_firmlink_path = NULL;   /* non-firmlink path, for fsevents */
	int len_path = 0;
	int len_no_firmlink_path = 0;
#if CONFIG_FSE
	fse_info finfo;
	struct vnode_attr va;
#endif
	int flags;
	int need_event;
	int has_listeners;
	int truncated_path;
	int truncated_no_firmlink_path;
	int batched;                     /* nonzero when the FS supports compound remove */
	struct vnode_attr *vap;
	int do_retry;
	int retry_count = 0;
	int cn_flags;

	cn_flags = LOCKPARENT;
	if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) {
		cn_flags |= AUDITVNPATH1;
	}
	/* If a starting dvp is passed, it trumps any fd passed. */
	if (start_dvp) {
		cn_flags |= USEDVP;
	}

#if NAMEDRSRCFORK
	/* unlink or delete is allowed on rsrc forks and named streams */
	cn_flags |= CN_ALLOWRSRCFORK;
#endif

retry:
	/* per-attempt state: reset on every redrive of the whole operation */
	do_retry = 0;
	flags = 0;
	need_event = 0;
	has_listeners = 0;
	truncated_path = 0;
	truncated_no_firmlink_path = 0;
	vap = NULL;

	NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);

	nd.ni_dvp = start_dvp;
	nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
	cnp = &nd.ni_cnd;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}

	dvp = nd.ni_dvp;
	vp = nd.ni_vp;


	/* With Carbon delete semantics, busy files cannot be deleted */
	if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
		flags |= VNODE_REMOVE_NODELETEBUSY;
	}

	/* Skip any potential upcalls if told to. */
	if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
		flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
	}

	if (vp) {
		batched = vnode_compound_remove_available(vp);
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 */
		if ((vp->v_flag & VROOT) || (dvp->v_mount != vp->v_mount)) {
			error = EBUSY;
			goto out;
		}

#if DEVELOPMENT || DEBUG
		/*
		 * XXX VSWAP: Check for entitlements or special flag here
		 * so we can restrict access appropriately.
		 */
#else /* DEVELOPMENT || DEBUG */

		/* only the kernel itself may unlink a swap file */
		if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
			error = EPERM;
			goto out;
		}
#endif /* DEVELOPMENT || DEBUG */

		if (!batched) {
			error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
			if (error) {
				/* ENOENT here can be a stale cache hit; retry a bounded number of times */
				if (error == ENOENT) {
					if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
						do_retry = 1;
						retry_count++;
					}
				}
				goto out;
			}
		}
	} else {
		/* no vp: the FS will perform lookup+remove as one compound VNOP */
		batched = 1;

		if (!vnode_compound_remove_available(dvp)) {
			panic("No vp, but no compound remove?");
		}
	}

#if CONFIG_FSE
	need_event = need_fsevent(FSE_DELETE, dvp);
	if (need_event) {
		if (!batched) {
			if ((vp->v_flag & VISHARDLINK) == 0) {
				/* XXX need to get these data in batched VNOP */
				get_fse_info(vp, &finfo, ctx);
			}
		} else {
			error = vfs_get_notify_attributes(&va);
			if (error) {
				goto out;
			}

			vap = &va;
		}
	}
#endif
	has_listeners = kauth_authorize_fileop_has_listeners();
	if (need_event || has_listeners) {
		if (path == NULL) {
			GET_PATH(path);
			if (path == NULL) {
				error = ENOMEM;
				goto out;
			}
		}
		/* NOTE(review): len_path is computed but not consumed below */
		len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
		if (no_firmlink_path == NULL) {
			GET_PATH(no_firmlink_path);
			if (no_firmlink_path == NULL) {
				error = ENOMEM;
				goto out;
			}
		}
		len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
	}

#if NAMEDRSRCFORK
	if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK) {
		error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
	} else
#endif
	{
		error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
		vp = nd.ni_vp;
		if (error == EKEEPLOOKING) {
			/* compound remove needs another pass through the lookup */
			if (!batched) {
				panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
			}

			if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
				panic("EKEEPLOOKING, but continue flag not set?");
			}

			if (vnode_isdir(vp)) {
				error = EISDIR;
				goto out;
			}
			goto continue_lookup;
		} else if (error == ENOENT && batched) {
			if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback may
				 * return ENOENT in case of racing hardlink lookups
				 * hitting the name cache, redrive the lookup.
				 */
				do_retry = 1;
				retry_count += 1;
				goto out;
			}
		}
	}

	/*
	 * Call out to allow 3rd party notification of delete.
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (!error) {
		if (has_listeners) {
			kauth_authorize_fileop(vfs_context_ucred(ctx),
			    KAUTH_FILEOP_DELETE,
			    (uintptr_t)vp,
			    (uintptr_t)path);
		}

		if (vp->v_flag & VISHARDLINK) {
			//
			// if a hardlink gets deleted we want to blow away the
			// v_parent link because the path that got us to this
			// instance of the link is no longer valid. this will
			// force the next call to get the path to ask the file
			// system instead of just following the v_parent link.
			//
			vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
		}

#if CONFIG_FSE
		if (need_event) {
			if (vp->v_flag & VISHARDLINK) {
				get_fse_info(vp, &finfo, ctx);
			} else if (vap) {
				vnode_get_fse_info_from_vap(vp, &finfo, vap);
			}
			if (truncated_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
			add_fsevent(FSE_DELETE, ctx,
			    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
			    FSE_ARG_FINFO, &finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (path != NULL) {
		RELEASE_PATH(path);
		path = NULL;
	}

	if (no_firmlink_path != NULL) {
		RELEASE_PATH(no_firmlink_path);
		no_firmlink_path = NULL;
	}
#if NAMEDRSRCFORK
	/* recycle the deleted rsrc fork vnode to force a reclaim, which
	 * will cause its shadow file to go away if necessary.
	 */
	if (vp && (vnode_isnamedstream(vp)) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp)) {
		vnode_recycle(vp);
	}
#endif
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);
	vnode_put(dvp);
	if (vp) {
		vnode_put(vp);
	}

	if (do_retry) {
		goto retry;
	}

	return error;
}
5471
/*
 * Kernel-internal unlink entry point: remove path_arg, resolving a
 * relative path against start_dvp when one is supplied (start_dvp
 * trumps the fd — see unlinkat_internal).
 */
int
unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
    enum uio_seg segflg, int unlink_flags)
{
	return unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
	    unlink_flags);
}
5479
5480 /*
5481 * Delete a name from the filesystem using Carbon semantics.
5482 */
5483 int
5484 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5485 {
5486 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5487 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY);
5488 }
5489
5490 /*
5491 * Delete a name from the filesystem using POSIX semantics.
5492 */
5493 int
5494 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5495 {
5496 return unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5497 uap->path, UIO_USERSPACE, 0);
5498 }
5499
5500 int
5501 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5502 {
5503 if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5504 return EINVAL;
5505 }
5506
5507 if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) {
5508 int unlink_flags = 0;
5509
5510 if (uap->flag & AT_REMOVEDIR_DATALESS) {
5511 unlink_flags |= VNODE_REMOVE_DATALESS_DIR;
5512 }
5513 return rmdirat_internal(vfs_context_current(), uap->fd,
5514 uap->path, UIO_USERSPACE, unlink_flags);
5515 } else {
5516 return unlinkat_internal(vfs_context_current(), uap->fd,
5517 NULLVP, uap->path, UIO_USERSPACE, 0);
5518 }
5519 }
5520
5521 /*
5522 * Reposition read/write file offset.
5523 */
int
lseek(proc_t p, struct lseek_args *uap, off_t *retval)
{
	struct fileproc *fp;
	vnode_t vp;
	struct vfs_context *ctx;
	off_t offset = uap->offset, file_size;
	int error;

	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		/* non-vnode file types come back ENOTSUP; lseek reports ESPIPE */
		if (error == ENOTSUP) {
			return ESPIPE;
		}
		return error;
	}
	if (vnode_isfifo(vp)) {
		file_drop(uap->fd);
		return ESPIPE;
	}


	ctx = vfs_context_current();
#if CONFIG_MACF
	/* lseek(fd, 0, SEEK_CUR) only reads the offset; check accordingly */
	if (uap->whence == L_INCR && uap->offset == 0) {
		error = mac_file_check_get_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	} else {
		error = mac_file_check_change_offset(vfs_context_ucred(ctx),
		    fp->f_fglob);
	}
	if (error) {
		file_drop(uap->fd);
		return error;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	switch (uap->whence) {
	case L_INCR:            /* SEEK_CUR: relative to the current offset */
		offset += fp->f_fglob->fg_offset;
		break;
	case L_XTND:            /* SEEK_END: relative to the file size */
		if ((error = vnode_size(vp, &file_size, ctx)) != 0) {
			break;
		}
		offset += file_size;
		break;
	case L_SET:             /* SEEK_SET: offset used as-is */
		break;
	case SEEK_HOLE:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
		break;
	case SEEK_DATA:
		error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0) {
		if (uap->offset > 0 && offset < 0) {
			/* Incremented/relative move past max size */
			error = EOVERFLOW;
		} else {
			/*
			 * Allow negative offsets on character devices, per
			 * POSIX 1003.1-2001. Most likely for writing disk
			 * labels.
			 */
			if (offset < 0 && vp->v_type != VCHR) {
				/* Decremented/relative move before start */
				error = EINVAL;
			} else {
				/* Success */
				fp->f_fglob->fg_offset = offset;
				*retval = fp->f_fglob->fg_offset;
			}
		}
	}

	/*
	 * An lseek can affect whether data is "available to read." Use
	 * hint of NOTE_NONE so no EVFILT_VNODE events fire
	 */
	post_event_if_success(vp, error, NOTE_NONE);
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
5615
5616
5617 /*
5618 * Check access permissions.
5619 *
5620 * Returns: 0 Success
5621 * vnode_authorize:???
5622 */
5623 static int
5624 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5625 {
5626 kauth_action_t action;
5627 int error;
5628
5629 /*
5630 * If just the regular access bits, convert them to something
5631 * that vnode_authorize will understand.
5632 */
5633 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5634 action = 0;
5635 if (uflags & R_OK) {
5636 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
5637 }
5638 if (uflags & W_OK) {
5639 if (vnode_isdir(vp)) {
5640 action |= KAUTH_VNODE_ADD_FILE |
5641 KAUTH_VNODE_ADD_SUBDIRECTORY;
5642 /* might want delete rights here too */
5643 } else {
5644 action |= KAUTH_VNODE_WRITE_DATA;
5645 }
5646 }
5647 if (uflags & X_OK) {
5648 if (vnode_isdir(vp)) {
5649 action |= KAUTH_VNODE_SEARCH;
5650 } else {
5651 action |= KAUTH_VNODE_EXECUTE;
5652 }
5653 }
5654 } else {
5655 /* take advantage of definition of uflags */
5656 action = uflags >> 8;
5657 }
5658
5659 #if CONFIG_MACF
5660 error = mac_vnode_check_access(ctx, vp, uflags);
5661 if (error) {
5662 return error;
5663 }
5664 #endif /* MAC */
5665
5666 /* action == 0 means only check for existence */
5667 if (action != 0) {
5668 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5669 } else {
5670 error = 0;
5671 }
5672
5673 return error;
5674 }
5675
5676
5677
5678 /*
5679 * access_extended: Check access permissions in bulk.
5680 *
5681 * Description: uap->entries Pointer to an array of accessx
5682 * descriptor structs, plus one or
5683 * more NULL terminated strings (see
5684 * "Notes" section below).
5685 * uap->size Size of the area pointed to by
5686 * uap->entries.
5687 * uap->results Pointer to the results array.
5688 *
5689 * Returns: 0 Success
5690 * ENOMEM Insufficient memory
5691 * EINVAL Invalid arguments
5692 * namei:EFAULT Bad address
5693 * namei:ENAMETOOLONG Filename too long
5694 * namei:ENOENT No such file or directory
5695 * namei:ELOOP Too many levels of symbolic links
5696 * namei:EBADF Bad file descriptor
5697 * namei:ENOTDIR Not a directory
5698 * namei:???
5699 * access1:
5700 *
5701 * Implicit returns:
5702 * uap->results Array contents modified
5703 *
5704 * Notes: The uap->entries are structured as an arbitrary length array
5705 * of accessx descriptors, followed by one or more NULL terminated
5706 * strings
5707 *
5708 * struct accessx_descriptor[0]
5709 * ...
5710 * struct accessx_descriptor[n]
5711 * char name_data[0];
5712 *
5713 * We determine the entry count by walking the buffer containing
5714 * the uap->entries argument descriptor. For each descriptor we
5715 * see, the valid values for the offset ad_name_offset will be
5716 * in the byte range:
5717 *
5718 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5719 * to
5720 * [ uap->entries + uap->size - 2 ]
5721 *
5722 * since we must have at least one string, and the string must
5723 * be at least one character plus the NULL terminator in length.
5724 *
5725 * XXX: Need to support the check-as uid argument
5726 */
5727 int
5728 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5729 {
5730 struct accessx_descriptor *input = NULL;
5731 errno_t *result = NULL;
5732 errno_t error = 0;
5733 int wantdelete = 0;
5734 unsigned int desc_max, desc_actual, i, j;
5735 struct vfs_context context;
5736 struct nameidata nd;
5737 int niopts;
5738 vnode_t vp = NULL;
5739 vnode_t dvp = NULL;
5740 #define ACCESSX_MAX_DESCR_ON_STACK 10
5741 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5742
5743 context.vc_ucred = NULL;
5744
5745 /*
5746 * Validate parameters; if valid, copy the descriptor array and string
5747 * arguments into local memory. Before proceeding, the following
5748 * conditions must have been met:
5749 *
5750 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5751 * o There must be sufficient room in the request for at least one
 * descriptor and a one byte NUL terminated string.
5753 * o The allocation of local storage must not fail.
5754 */
5755 if (uap->size > ACCESSX_MAX_TABLESIZE) {
5756 return ENOMEM;
5757 }
5758 if (uap->size < (sizeof(struct accessx_descriptor) + 2)) {
5759 return EINVAL;
5760 }
5761 if (uap->size <= sizeof(stack_input)) {
5762 input = stack_input;
5763 } else {
5764 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5765 if (input == NULL) {
5766 error = ENOMEM;
5767 goto out;
5768 }
5769 }
5770 error = copyin(uap->entries, input, uap->size);
5771 if (error) {
5772 goto out;
5773 }
5774
5775 AUDIT_ARG(opaque, input, uap->size);
5776
5777 /*
 * Force NUL termination of the copyin buffer to avoid namei() running
5779 * off the end. If the caller passes us bogus data, they may get a
5780 * bogus result.
5781 */
5782 ((char *)input)[uap->size - 1] = 0;
5783
5784 /*
5785 * Access is defined as checking against the process' real identity,
5786 * even if operations are checking the effective identity. This
5787 * requires that we use a local vfs context.
5788 */
5789 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5790 context.vc_thread = current_thread();
5791
5792 /*
5793 * Find out how many entries we have, so we can allocate the result
5794 * array by walking the list and adjusting the count downward by the
5795 * earliest string offset we see.
5796 */
5797 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5798 desc_actual = desc_max;
5799 for (i = 0; i < desc_actual; i++) {
5800 /*
5801 * Take the offset to the name string for this entry and
5802 * convert to an input array index, which would be one off
5803 * the end of the array if this entry was the lowest-addressed
5804 * name string.
5805 */
5806 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5807
5808 /*
5809 * An offset greater than the max allowable offset is an error.
5810 * It is also an error for any valid entry to point
5811 * to a location prior to the end of the current entry, if
5812 * it's not a reference to the string of the previous entry.
5813 */
5814 if (j > desc_max || (j != 0 && j <= i)) {
5815 error = EINVAL;
5816 goto out;
5817 }
5818
5819 /* Also do not let ad_name_offset point to something beyond the size of the input */
5820 if (input[i].ad_name_offset >= uap->size) {
5821 error = EINVAL;
5822 goto out;
5823 }
5824
5825 /*
5826 * An offset of 0 means use the previous descriptor's offset;
5827 * this is used to chain multiple requests for the same file
5828 * to avoid multiple lookups.
5829 */
5830 if (j == 0) {
5831 /* This is not valid for the first entry */
5832 if (i == 0) {
5833 error = EINVAL;
5834 goto out;
5835 }
5836 continue;
5837 }
5838
5839 /*
5840 * If the offset of the string for this descriptor is before
5841 * what we believe is the current actual last descriptor,
5842 * then we need to adjust our estimate downward; this permits
5843 * the string table following the last descriptor to be out
5844 * of order relative to the descriptor list.
5845 */
5846 if (j < desc_actual) {
5847 desc_actual = j;
5848 }
5849 }
5850
5851 /*
5852 * We limit the actual number of descriptors we are willing to process
5853 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS. If the number being
5854 * requested does not exceed this limit,
5855 */
5856 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5857 error = ENOMEM;
5858 goto out;
5859 }
5860 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5861 if (result == NULL) {
5862 error = ENOMEM;
5863 goto out;
5864 }
5865
5866 /*
5867 * Do the work by iterating over the descriptor entries we know to
5868 * at least appear to contain valid data.
5869 */
5870 error = 0;
5871 for (i = 0; i < desc_actual; i++) {
5872 /*
5873 * If the ad_name_offset is 0, then we use the previous
5874 * results to make the check; otherwise, we are looking up
5875 * a new file name.
5876 */
5877 if (input[i].ad_name_offset != 0) {
5878 /* discard old vnodes */
5879 if (vp) {
5880 vnode_put(vp);
5881 vp = NULL;
5882 }
5883 if (dvp) {
5884 vnode_put(dvp);
5885 dvp = NULL;
5886 }
5887
5888 /*
5889 * Scan forward in the descriptor list to see if we
5890 * need the parent vnode. We will need it if we are
5891 * deleting, since we must have rights to remove
5892 * entries in the parent directory, as well as the
5893 * rights to delete the object itself.
5894 */
5895 wantdelete = input[i].ad_flags & _DELETE_OK;
5896 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++) {
5897 if (input[j].ad_flags & _DELETE_OK) {
5898 wantdelete = 1;
5899 }
5900 }
5901
5902 niopts = FOLLOW | AUDITVNPATH1;
5903
5904 /* need parent for vnode_authorize for deletion test */
5905 if (wantdelete) {
5906 niopts |= WANTPARENT;
5907 }
5908
5909 /* do the lookup */
5910 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5911 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5912 &context);
5913 error = namei(&nd);
5914 if (!error) {
5915 vp = nd.ni_vp;
5916 if (wantdelete) {
5917 dvp = nd.ni_dvp;
5918 }
5919 }
5920 nameidone(&nd);
5921 }
5922
5923 /*
5924 * Handle lookup errors.
5925 */
5926 switch (error) {
5927 case ENOENT:
5928 case EACCES:
5929 case EPERM:
5930 case ENOTDIR:
5931 result[i] = error;
5932 break;
5933 case 0:
5934 /* run this access check */
5935 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5936 break;
5937 default:
5938 /* fatal lookup error */
5939
5940 goto out;
5941 }
5942 }
5943
5944 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5945
5946 /* copy out results */
5947 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5948
5949 out:
5950 if (input && input != stack_input) {
5951 FREE(input, M_TEMP);
5952 }
5953 if (result) {
5954 FREE(result, M_TEMP);
5955 }
5956 if (vp) {
5957 vnode_put(vp);
5958 }
5959 if (dvp) {
5960 vnode_put(dvp);
5961 }
5962 if (IS_VALID_CRED(context.vc_ucred)) {
5963 kauth_cred_unref(&context.vc_ucred);
5964 }
5965 return error;
5966 }
5967
5968
5969 /*
5970 * Returns: 0 Success
5971 * namei:EFAULT Bad address
5972 * namei:ENAMETOOLONG Filename too long
5973 * namei:ENOENT No such file or directory
5974 * namei:ELOOP Too many levels of symbolic links
5975 * namei:EBADF Bad file descriptor
5976 * namei:ENOTDIR Not a directory
5977 * namei:???
5978 * access1:
5979 */
/*
 * Common implementation for access(2)/faccessat(2): resolve `path'
 * relative to `fd' and check it against the access bits in `amode'.
 * The AT_EACCESS and AT_SYMLINK_NOFOLLOW bits of `flag' are honored.
 */
static int
faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
    int flag, enum uio_seg segflg)
{
	int error;
	struct nameidata nd;
	int niopts;
	struct vfs_context context;	/* local context; may carry a substitute credential */
#if NAMEDRSRCFORK
	int is_namedstream = 0;		/* set when the target is a shadow stream file */
#endif

	/*
	 * Unless the AT_EACCESS option is used, Access is defined as checking
	 * against the process' real identity, even if operations are checking
	 * the effective identity.  So we need to tweak the credential
	 * in the context for that case.
	 */
	if (!(flag & AT_EACCESS)) {
		/* takes a reference; must be released at "out" below */
		context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
	} else {
		/* borrow the caller's credential; no reference taken */
		context.vc_ucred = ctx->vc_ucred;
	}
	context.vc_thread = ctx->vc_thread;


	niopts = (flag & AT_SYMLINK_NOFOLLOW ? NOFOLLOW : FOLLOW) | AUDITVNPATH1;
	/* need parent for vnode_authorize for deletion test */
	if (amode & _DELETE_OK) {
		niopts |= WANTPARENT;
	}
	NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
	    path, &context);

#if NAMEDRSRCFORK
	/* access(F_OK) calls are allowed for resource forks. */
	if (amode == F_OK) {
		nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
	}
#endif
	error = nameiat(&nd, fd);
	if (error) {
		goto out;
	}

#if NAMEDRSRCFORK
	/* Grab reference on the shadow stream file vnode to
	 * force an inactive on release which will mark it
	 * for recycle.
	 */
	if (vnode_isnamedstream(nd.ni_vp) &&
	    (nd.ni_vp->v_parent != NULLVP) &&
	    vnode_isshadow(nd.ni_vp)) {
		is_namedstream = 1;
		vnode_ref(nd.ni_vp);
	}
#endif

	/* the actual permission check, using the (possibly real-id) credential */
	error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);

#if NAMEDRSRCFORK
	if (is_namedstream) {
		vnode_rele(nd.ni_vp);
	}
#endif

	vnode_put(nd.ni_vp);
	/* ni_dvp only holds an iocount when WANTPARENT was requested above */
	if (amode & _DELETE_OK) {
		vnode_put(nd.ni_dvp);
	}
	nameidone(&nd);

out:
	/* drop the reference taken by kauth_cred_copy_real(), if any */
	if (!(flag & AT_EACCESS)) {
		kauth_cred_unref(&context.vc_ucred);
	}
	return error;
}
6058
6059 int
6060 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
6061 {
6062 return faccessat_internal(vfs_context_current(), AT_FDCWD,
6063 uap->path, uap->flags, 0, UIO_USERSPACE);
6064 }
6065
6066 int
6067 faccessat(__unused proc_t p, struct faccessat_args *uap,
6068 __unused int32_t *retval)
6069 {
6070 if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW)) {
6071 return EINVAL;
6072 }
6073
6074 return faccessat_internal(vfs_context_current(), uap->fd,
6075 uap->path, uap->amode, uap->flag, UIO_USERSPACE);
6076 }
6077
6078 /*
6079 * Returns: 0 Success
6080 * EFAULT
6081 * copyout:EFAULT
6082 * namei:???
6083 * vn_stat:???
6084 */
6085 static int
6086 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
6087 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
6088 enum uio_seg segflg, int fd, int flag)
6089 {
6090 struct nameidata nd;
6091 int follow;
6092 union {
6093 struct stat sb;
6094 struct stat64 sb64;
6095 } source = {};
6096 union {
6097 struct user64_stat user64_sb;
6098 struct user32_stat user32_sb;
6099 struct user64_stat64 user64_sb64;
6100 struct user32_stat64 user32_sb64;
6101 } dest = {};
6102 caddr_t sbp;
6103 int error, my_size;
6104 kauth_filesec_t fsec;
6105 size_t xsecurity_bufsize;
6106 void * statptr;
6107 struct fileproc *fp = NULL;
6108 int needsrealdev = 0;
6109
6110 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6111 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
6112 segflg, path, ctx);
6113
6114 #if NAMEDRSRCFORK
6115 int is_namedstream = 0;
6116 /* stat calls are allowed for resource forks. */
6117 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
6118 #endif
6119
6120 if (flag & AT_FDONLY) {
6121 vnode_t fvp;
6122
6123 error = fp_getfvp(vfs_context_proc(ctx), fd, &fp, &fvp);
6124 if (error) {
6125 return error;
6126 }
6127 if ((error = vnode_getwithref(fvp))) {
6128 file_drop(fd);
6129 return error;
6130 }
6131 nd.ni_vp = fvp;
6132 } else {
6133 error = nameiat(&nd, fd);
6134 if (error) {
6135 return error;
6136 }
6137 }
6138 fsec = KAUTH_FILESEC_NONE;
6139
6140 statptr = (void *)&source;
6141
6142 #if NAMEDRSRCFORK
6143 /* Grab reference on the shadow stream file vnode to
6144 * force an inactive on release which will mark it
6145 * for recycle.
6146 */
6147 if (vnode_isnamedstream(nd.ni_vp) &&
6148 (nd.ni_vp->v_parent != NULLVP) &&
6149 vnode_isshadow(nd.ni_vp)) {
6150 is_namedstream = 1;
6151 vnode_ref(nd.ni_vp);
6152 }
6153 #endif
6154
6155 needsrealdev = flag & AT_REALDEV ? 1 : 0;
6156 if (fp && (xsecurity == USER_ADDR_NULL)) {
6157 /*
6158 * If the caller has the file open, and is not
6159 * requesting extended security information, we are
6160 * going to let them get the basic stat information.
6161 */
6162 error = vn_stat_noauth(nd.ni_vp, statptr, NULL, isstat64, needsrealdev, ctx,
6163 fp->f_fglob->fg_cred);
6164 } else {
6165 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL),
6166 isstat64, needsrealdev, ctx);
6167 }
6168
6169 #if NAMEDRSRCFORK
6170 if (is_namedstream) {
6171 vnode_rele(nd.ni_vp);
6172 }
6173 #endif
6174 vnode_put(nd.ni_vp);
6175 nameidone(&nd);
6176 if (fp) {
6177 file_drop(fd);
6178 fp = NULL;
6179 }
6180
6181 if (error) {
6182 return error;
6183 }
6184 /* Zap spare fields */
6185 if (isstat64 != 0) {
6186 source.sb64.st_lspare = 0;
6187 source.sb64.st_qspare[0] = 0LL;
6188 source.sb64.st_qspare[1] = 0LL;
6189 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6190 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
6191 my_size = sizeof(dest.user64_sb64);
6192 sbp = (caddr_t)&dest.user64_sb64;
6193 } else {
6194 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
6195 my_size = sizeof(dest.user32_sb64);
6196 sbp = (caddr_t)&dest.user32_sb64;
6197 }
6198 /*
6199 * Check if we raced (post lookup) against the last unlink of a file.
6200 */
6201 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
6202 source.sb64.st_nlink = 1;
6203 }
6204 } else {
6205 source.sb.st_lspare = 0;
6206 source.sb.st_qspare[0] = 0LL;
6207 source.sb.st_qspare[1] = 0LL;
6208 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
6209 munge_user64_stat(&source.sb, &dest.user64_sb);
6210 my_size = sizeof(dest.user64_sb);
6211 sbp = (caddr_t)&dest.user64_sb;
6212 } else {
6213 munge_user32_stat(&source.sb, &dest.user32_sb);
6214 my_size = sizeof(dest.user32_sb);
6215 sbp = (caddr_t)&dest.user32_sb;
6216 }
6217
6218 /*
6219 * Check if we raced (post lookup) against the last unlink of a file.
6220 */
6221 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
6222 source.sb.st_nlink = 1;
6223 }
6224 }
6225 if ((error = copyout(sbp, ub, my_size)) != 0) {
6226 goto out;
6227 }
6228
6229 /* caller wants extended security information? */
6230 if (xsecurity != USER_ADDR_NULL) {
6231 /* did we get any? */
6232 if (fsec == KAUTH_FILESEC_NONE) {
6233 if (susize(xsecurity_size, 0) != 0) {
6234 error = EFAULT;
6235 goto out;
6236 }
6237 } else {
6238 /* find the user buffer size */
6239 xsecurity_bufsize = fusize(xsecurity_size);
6240
6241 /* copy out the actual data size */
6242 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
6243 error = EFAULT;
6244 goto out;
6245 }
6246
6247 /* if the caller supplied enough room, copy out to it */
6248 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec)) {
6249 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
6250 }
6251 }
6252 }
6253 out:
6254 if (fsec != KAUTH_FILESEC_NONE) {
6255 kauth_filesec_free(fsec);
6256 }
6257 return error;
6258 }
6259
6260 /*
6261 * stat_extended: Get file status; with extended security (ACL).
6262 *
6263 * Parameters: p (ignored)
6264 * uap User argument descriptor (see below)
6265 * retval (ignored)
6266 *
6267 * Indirect: uap->path Path of file to get status from
6268 * uap->ub User buffer (holds file status info)
6269 * uap->xsecurity ACL to get (extended security)
6270 * uap->xsecurity_size Size of ACL
6271 *
6272 * Returns: 0 Success
6273 * !0 errno value
6274 *
6275 */
6276 int
6277 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
6278 __unused int32_t *retval)
6279 {
6280 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6281 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6282 0);
6283 }
6284
6285 /*
6286 * Returns: 0 Success
6287 * fstatat_internal:??? [see fstatat_internal() in this file]
6288 */
6289 int
6290 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
6291 {
6292 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6293 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0);
6294 }
6295
6296 int
6297 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
6298 {
6299 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6300 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0);
6301 }
6302
6303 /*
6304 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
6305 *
6306 * Parameters: p (ignored)
6307 * uap User argument descriptor (see below)
6308 * retval (ignored)
6309 *
6310 * Indirect: uap->path Path of file to get status from
6311 * uap->ub User buffer (holds file status info)
6312 * uap->xsecurity ACL to get (extended security)
6313 * uap->xsecurity_size Size of ACL
6314 *
6315 * Returns: 0 Success
6316 * !0 errno value
6317 *
6318 */
6319 int
6320 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
6321 {
6322 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6323 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6324 0);
6325 }
6326
6327 /*
6328 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
6329 *
6330 * Parameters: p (ignored)
6331 * uap User argument descriptor (see below)
6332 * retval (ignored)
6333 *
6334 * Indirect: uap->path Path of file to get status from
6335 * uap->ub User buffer (holds file status info)
6336 * uap->xsecurity ACL to get (extended security)
6337 * uap->xsecurity_size Size of ACL
6338 *
6339 * Returns: 0 Success
6340 * !0 errno value
6341 *
6342 */
6343 int
6344 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
6345 {
6346 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6347 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
6348 AT_SYMLINK_NOFOLLOW);
6349 }
6350
6351 /*
6352 * Get file status; this version does not follow links.
6353 */
6354 int
6355 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
6356 {
6357 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6358 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6359 }
6360
6361 int
6362 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
6363 {
6364 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6365 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW);
6366 }
6367
6368 /*
6369 * lstat64_extended: Get file status; can handle large inode numbers; does not
6370 * follow links; with extended security (ACL).
6371 *
6372 * Parameters: p (ignored)
6373 * uap User argument descriptor (see below)
6374 * retval (ignored)
6375 *
6376 * Indirect: uap->path Path of file to get status from
6377 * uap->ub User buffer (holds file status info)
6378 * uap->xsecurity ACL to get (extended security)
6379 * uap->xsecurity_size Size of ACL
6380 *
6381 * Returns: 0 Success
6382 * !0 errno value
6383 *
6384 */
6385 int
6386 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
6387 {
6388 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6389 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
6390 AT_SYMLINK_NOFOLLOW);
6391 }
6392
6393 int
6394 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
6395 {
6396 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6397 return EINVAL;
6398 }
6399
6400 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6401 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag);
6402 }
6403
6404 int
6405 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
6406 __unused int32_t *retval)
6407 {
6408 if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY)) {
6409 return EINVAL;
6410 }
6411
6412 return fstatat_internal(vfs_context_current(), uap->path, uap->ub,
6413 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag);
6414 }
6415
6416 /*
6417 * Get configurable pathname variables.
6418 *
6419 * Returns: 0 Success
6420 * namei:???
6421 * vn_pathconf:???
6422 *
6423 * Notes: Global implementation constants are intended to be
6424 * implemented in this function directly; all other constants
6425 * are per-FS implementation, and therefore must be handled in
6426 * each respective FS, instead.
6427 *
6428 * XXX We implement some things globally right now that should actually be
6429 * XXX per-FS; we will need to deal with this at some point.
6430 */
6431 /* ARGSUSED */
6432 int
6433 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
6434 {
6435 int error;
6436 struct nameidata nd;
6437 vfs_context_t ctx = vfs_context_current();
6438
6439 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
6440 UIO_USERSPACE, uap->path, ctx);
6441 error = namei(&nd);
6442 if (error) {
6443 return error;
6444 }
6445
6446 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
6447
6448 vnode_put(nd.ni_vp);
6449 nameidone(&nd);
6450 return error;
6451 }
6452
6453 /*
6454 * Return target name of a symbolic link.
6455 */
6456 /* ARGSUSED */
6457 static int
6458 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
6459 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
6460 int *retval)
6461 {
6462 vnode_t vp;
6463 uio_t auio;
6464 int error;
6465 struct nameidata nd;
6466 char uio_buf[UIO_SIZEOF(1)];
6467
6468 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
6469 seg, path, ctx);
6470
6471 error = nameiat(&nd, fd);
6472 if (error) {
6473 return error;
6474 }
6475 vp = nd.ni_vp;
6476
6477 nameidone(&nd);
6478
6479 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
6480 &uio_buf[0], sizeof(uio_buf));
6481 uio_addiov(auio, buf, bufsize);
6482 if (vp->v_type != VLNK) {
6483 error = EINVAL;
6484 } else {
6485 #if CONFIG_MACF
6486 error = mac_vnode_check_readlink(ctx, vp);
6487 #endif
6488 if (error == 0) {
6489 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
6490 ctx);
6491 }
6492 if (error == 0) {
6493 error = VNOP_READLINK(vp, auio, ctx);
6494 }
6495 }
6496 vnode_put(vp);
6497
6498 *retval = bufsize - (int)uio_resid(auio);
6499 return error;
6500 }
6501
6502 int
6503 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
6504 {
6505 enum uio_seg procseg;
6506
6507 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6508 return readlinkat_internal(vfs_context_current(), AT_FDCWD,
6509 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
6510 uap->count, procseg, retval);
6511 }
6512
6513 int
6514 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
6515 {
6516 enum uio_seg procseg;
6517
6518 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
6519 return readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
6520 procseg, uap->buf, uap->bufsize, procseg, retval);
6521 }
6522
6523 /*
6524 * Change file flags, the deep inner layer.
6525 */
6526 static int
6527 chflags0(vnode_t vp, struct vnode_attr *va,
6528 int (*setattr)(vnode_t, void *, vfs_context_t),
6529 void *arg, vfs_context_t ctx)
6530 {
6531 kauth_action_t action = 0;
6532 int error;
6533
6534 #if CONFIG_MACF
6535 error = mac_vnode_check_setflags(ctx, vp, va->va_flags);
6536 if (error) {
6537 goto out;
6538 }
6539 #endif
6540
6541 /* request authorisation, disregard immutability */
6542 if ((error = vnode_authattr(vp, va, &action, ctx)) != 0) {
6543 goto out;
6544 }
6545 /*
6546 * Request that the auth layer disregard those file flags it's allowed to when
6547 * authorizing this operation; we need to do this in order to be able to
6548 * clear immutable flags.
6549 */
6550 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0)) {
6551 goto out;
6552 }
6553 error = (*setattr)(vp, arg, ctx);
6554
6555 #if CONFIG_MACF
6556 if (error == 0) {
6557 mac_vnode_notify_setflags(ctx, vp, va->va_flags);
6558 }
6559 #endif
6560
6561 out:
6562 return error;
6563 }
6564
6565 /*
6566 * Change file flags.
6567 *
6568 * NOTE: this will vnode_put() `vp'
6569 */
6570 static int
6571 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
6572 {
6573 struct vnode_attr va;
6574 int error;
6575
6576 VATTR_INIT(&va);
6577 VATTR_SET(&va, va_flags, flags);
6578
6579 error = chflags0(vp, &va, (void *)vnode_setattr, &va, ctx);
6580 vnode_put(vp);
6581
6582 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6583 error = ENOTSUP;
6584 }
6585
6586 return error;
6587 }
6588
6589 /*
6590 * Change flags of a file given a path name.
6591 */
6592 /* ARGSUSED */
6593 int
6594 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6595 {
6596 vnode_t vp;
6597 vfs_context_t ctx = vfs_context_current();
6598 int error;
6599 struct nameidata nd;
6600
6601 AUDIT_ARG(fflags, uap->flags);
6602 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6603 UIO_USERSPACE, uap->path, ctx);
6604 error = namei(&nd);
6605 if (error) {
6606 return error;
6607 }
6608 vp = nd.ni_vp;
6609 nameidone(&nd);
6610
6611 /* we don't vnode_put() here because chflags1 does internally */
6612 error = chflags1(vp, uap->flags, ctx);
6613
6614 return error;
6615 }
6616
6617 /*
6618 * Change flags of a file given a file descriptor.
6619 */
6620 /* ARGSUSED */
6621 int
6622 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6623 {
6624 vnode_t vp;
6625 int error;
6626
6627 AUDIT_ARG(fd, uap->fd);
6628 AUDIT_ARG(fflags, uap->flags);
6629 if ((error = file_vnode(uap->fd, &vp))) {
6630 return error;
6631 }
6632
6633 if ((error = vnode_getwithref(vp))) {
6634 file_drop(uap->fd);
6635 return error;
6636 }
6637
6638 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6639
6640 /* we don't vnode_put() here because chflags1 does internally */
6641 error = chflags1(vp, uap->flags, vfs_context_current());
6642
6643 file_drop(uap->fd);
6644 return error;
6645 }
6646
6647 /*
6648 * Change security information on a filesystem object.
6649 *
6650 * Returns: 0 Success
6651 * EPERM Operation not permitted
6652 * vnode_authattr:??? [anything vnode_authattr can return]
6653 * vnode_authorize:??? [anything vnode_authorize can return]
6654 * vnode_setattr:??? [anything vnode_setattr can return]
6655 *
6656 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
6657 * translated to EPERM before being returned.
6658 */
static int
chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
{
	kauth_action_t action;
	int error;

	AUDIT_ARG(mode, vap->va_mode);
	/* XXX audit new args */

#if NAMEDSTREAMS
	/* chmod calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		return EPERM;
	}
#endif

#if CONFIG_MACF
	/* MAC policy may veto each class of change independently. */
	if (VATTR_IS_ACTIVE(vap, va_mode) &&
	    (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0) {
		return error;
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		/* -1 signals "not being changed" to the MAC hook */
		if ((error = mac_vnode_check_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1))) {
			return error;
		}
	}

	if (VATTR_IS_ACTIVE(vap, va_acl) &&
	    (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl))) {
		return error;
	}
#endif

	/* make sure that the caller is allowed to set this security information */
	if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
	    ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* authorization failures are reported as EPERM, not EACCES */
		if (error == EACCES) {
			error = EPERM;
		}
		return error;
	}

	if ((error = vnode_setattr(vp, vap, ctx)) != 0) {
		return error;
	}

#if CONFIG_MACF
	/* notify MAC policies only after the change has actually been made */
	if (VATTR_IS_ACTIVE(vap, va_mode)) {
		mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
	}

	if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
		mac_vnode_notify_setowner(ctx, vp,
		    VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
		    VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
	}

	if (VATTR_IS_ACTIVE(vap, va_acl)) {
		mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
	}
#endif

	/* error is 0 here; returned as-is for clarity */
	return error;
}
6726
6727
6728 /*
6729 * Change mode of a file given a path name.
6730 *
6731 * Returns: 0 Success
6732 * namei:??? [anything namei can return]
6733 * chmod_vnode:??? [anything chmod_vnode can return]
6734 */
6735 static int
6736 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6737 int fd, int flag, enum uio_seg segflg)
6738 {
6739 struct nameidata nd;
6740 int follow, error;
6741
6742 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6743 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6744 segflg, path, ctx);
6745 if ((error = nameiat(&nd, fd))) {
6746 return error;
6747 }
6748 error = chmod_vnode(ctx, nd.ni_vp, vap);
6749 vnode_put(nd.ni_vp);
6750 nameidone(&nd);
6751 return error;
6752 }
6753
6754 /*
6755 * chmod_extended: Change the mode of a file given a path name; with extended
6756 * argument list (including extended security (ACL)).
6757 *
6758 * Parameters: p Process requesting the open
6759 * uap User argument descriptor (see below)
6760 * retval (ignored)
6761 *
6762 * Indirect: uap->path Path to object (same as 'chmod')
6763 * uap->uid UID to set
6764 * uap->gid GID to set
6765 * uap->mode File mode to set (same as 'chmod')
6766 * uap->xsecurity ACL to set (or delete)
6767 *
6768 * Returns: 0 Success
6769 * !0 errno value
6770 *
6771 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
6772 *
6773 * XXX: We should enummerate the possible errno values here, and where
6774 * in the code they originated.
6775 */
int
chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
{
	int error;
	struct vnode_attr va;
	kauth_filesec_t xsecdst;	/* ACL copied in from userspace, if any */

	AUDIT_ARG(owner, uap->uid, uap->gid);

	/* only attributes explicitly requested by the caller are set */
	VATTR_INIT(&va);
	if (uap->mode != -1) {
		VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
	}
	if (uap->uid != KAUTH_UID_NONE) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != KAUTH_GID_NONE) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

	/*
	 * uap->xsecurity is a user address doubling as a command:
	 * (void *)1 means "remove the ACL", NULL means "leave it alone",
	 * anything else is a pointer to a filesec to copy in.
	 */
	xsecdst = NULL;
	switch (uap->xsecurity) {
	/* explicit remove request */
	case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
		VATTR_SET(&va, va_acl, NULL);
		break;
	/* not being set */
	case USER_ADDR_NULL:
		break;
	default:
		if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
			return error;
		}
		/* va_acl points into xsecdst; freed after chmodat() below */
		VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
		KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
	}

	error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
	    UIO_USERSPACE);

	if (xsecdst != NULL) {
		kauth_filesec_free(xsecdst);
	}
	return error;
}
6821
6822 /*
6823 * Returns: 0 Success
6824 * chmodat:??? [anything chmodat can return]
6825 */
6826 static int
6827 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6828 int flag, enum uio_seg segflg)
6829 {
6830 struct vnode_attr va;
6831
6832 VATTR_INIT(&va);
6833 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6834
6835 return chmodat(ctx, path, &va, fd, flag, segflg);
6836 }
6837
6838 int
6839 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6840 {
6841 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6842 AT_FDCWD, 0, UIO_USERSPACE);
6843 }
6844
6845 int
6846 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6847 {
6848 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
6849 return EINVAL;
6850 }
6851
6852 return fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6853 uap->fd, uap->flag, UIO_USERSPACE);
6854 }
6855
6856 /*
6857 * Change mode of a file given a file descriptor.
6858 */
6859 static int
6860 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6861 {
6862 vnode_t vp;
6863 int error;
6864
6865 AUDIT_ARG(fd, fd);
6866
6867 if ((error = file_vnode(fd, &vp)) != 0) {
6868 return error;
6869 }
6870 if ((error = vnode_getwithref(vp)) != 0) {
6871 file_drop(fd);
6872 return error;
6873 }
6874 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6875
6876 error = chmod_vnode(vfs_context_current(), vp, vap);
6877 (void)vnode_put(vp);
6878 file_drop(fd);
6879
6880 return error;
6881 }
6882
6883 /*
6884 * fchmod_extended: Change mode of a file given a file descriptor; with
6885 * extended argument list (including extended security (ACL)).
6886 *
6887 * Parameters: p Process requesting to change file mode
6888 * uap User argument descriptor (see below)
6889 * retval (ignored)
6890 *
6891 * Indirect: uap->mode File mode to set (same as 'chmod')
6892 * uap->uid UID to set
6893 * uap->gid GID to set
6894 * uap->xsecurity ACL to set (or delete)
6895 * uap->fd File descriptor of file to change mode
6896 *
6897 * Returns: 0 Success
6898 * !0 errno value
6899 *
6900 */
6901 int
6902 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6903 {
6904 int error;
6905 struct vnode_attr va;
6906 kauth_filesec_t xsecdst;
6907
6908 AUDIT_ARG(owner, uap->uid, uap->gid);
6909
6910 VATTR_INIT(&va);
6911 if (uap->mode != -1) {
6912 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6913 }
6914 if (uap->uid != KAUTH_UID_NONE) {
6915 VATTR_SET(&va, va_uid, uap->uid);
6916 }
6917 if (uap->gid != KAUTH_GID_NONE) {
6918 VATTR_SET(&va, va_gid, uap->gid);
6919 }
6920
6921 xsecdst = NULL;
6922 switch (uap->xsecurity) {
6923 case USER_ADDR_NULL:
6924 VATTR_SET(&va, va_acl, NULL);
6925 break;
6926 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6927 VATTR_SET(&va, va_acl, NULL);
6928 break;
6929 /* not being set */
6930 case CAST_USER_ADDR_T(-1):
6931 break;
6932 default:
6933 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
6934 return error;
6935 }
6936 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6937 }
6938
6939 error = fchmod1(p, uap->fd, &va);
6940
6941
6942 switch (uap->xsecurity) {
6943 case USER_ADDR_NULL:
6944 case CAST_USER_ADDR_T(-1):
6945 break;
6946 default:
6947 if (xsecdst != NULL) {
6948 kauth_filesec_free(xsecdst);
6949 }
6950 }
6951 return error;
6952 }
6953
6954 int
6955 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6956 {
6957 struct vnode_attr va;
6958
6959 VATTR_INIT(&va);
6960 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6961
6962 return fchmod1(p, uap->fd, &va);
6963 }
6964
6965
6966 /*
6967 * Set ownership given a path name.
6968 */
6969 /* ARGSUSED */
/*
 * Common implementation for chown(2)/lchown(2)/fchownat(2): resolve
 * `path' relative to `fd' and change its owner and/or group.  A uid or
 * gid of VNOVAL means "not being changed".
 */
static int
fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
    gid_t gid, int flag, enum uio_seg segflg)
{
	vnode_t vp;
	struct vnode_attr va;
	int error;
	struct nameidata nd;
	int follow;
	kauth_action_t action;

	AUDIT_ARG(owner, uid, gid);

	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
	    path, ctx);
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* only set the attributes the caller actually asked to change */
	VATTR_INIT(&va);
	if (uid != (uid_t)VNOVAL) {
		VATTR_SET(&va, va_uid, uid);
	}
	if (gid != (gid_t)VNOVAL) {
		VATTR_SET(&va, va_gid, gid);
	}

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uid, gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	/* notify MAC policies only after a successful change */
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uid, gid);
	}
#endif

out:
	/*
	 * EACCES is only allowed from namei(); permissions failure should
	 * return EPERM, so we need to translate the error code.
	 */
	if (error == EACCES) {
		error = EPERM;
	}

	/* drop the iocount taken by nameiat() */
	vnode_put(vp);
	return error;
}
7036
7037 int
7038 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
7039 {
7040 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7041 uap->uid, uap->gid, 0, UIO_USERSPACE);
7042 }
7043
7044 int
7045 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
7046 {
7047 return fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
7048 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE);
7049 }
7050
7051 int
7052 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
7053 {
7054 if (uap->flag & ~AT_SYMLINK_NOFOLLOW) {
7055 return EINVAL;
7056 }
7057
7058 return fchownat_internal(vfs_context_current(), uap->fd, uap->path,
7059 uap->uid, uap->gid, uap->flag, UIO_USERSPACE);
7060 }
7061
/*
 * Set ownership given a file descriptor.
 *
 * Like chown(), but operates on the vnode already referenced by uap->fd.
 * A uid/gid equal to VNOVAL leaves that field unchanged.  Unlike
 * fchownat_internal(), only an EACCES from vnode_authorize() is
 * translated to EPERM here.
 */
/* ARGSUSED */
int
fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
{
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	vnode_t vp;
	int error;
	kauth_action_t action;

	AUDIT_ARG(owner, uap->uid, uap->gid);
	AUDIT_ARG(fd, uap->fd);

	/* file_vnode() takes a reference on the fd; dropped on all exits. */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}

	/* Need an iocount on the vnode before touching it. */
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	/* Only request the attribute changes the caller asked for. */
	VATTR_INIT(&va);
	if (uap->uid != VNOVAL) {
		VATTR_SET(&va, va_uid, uap->uid);
	}
	if (uap->gid != VNOVAL) {
		VATTR_SET(&va, va_gid, uap->gid);
	}

#if NAMEDSTREAMS
	/* chown calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
	if (error) {
		goto out;
	}
#endif

	/* preflight and authorize attribute changes */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		/* Permission failure on an open fd reports EPERM, not EACCES. */
		if (error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
	}
#endif

out:
	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7134
7135 static int
7136 getutimes(user_addr_t usrtvp, struct timespec *tsp)
7137 {
7138 int error;
7139
7140 if (usrtvp == USER_ADDR_NULL) {
7141 struct timeval old_tv;
7142 /* XXX Y2038 bug because of microtime argument */
7143 microtime(&old_tv);
7144 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
7145 tsp[1] = tsp[0];
7146 } else {
7147 if (IS_64BIT_PROCESS(current_proc())) {
7148 struct user64_timeval tv[2];
7149 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7150 if (error) {
7151 return error;
7152 }
7153 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7154 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7155 } else {
7156 struct user32_timeval tv[2];
7157 error = copyin(usrtvp, (void *)tv, sizeof(tv));
7158 if (error) {
7159 return error;
7160 }
7161 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
7162 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
7163 }
7164 }
7165 return 0;
7166 }
7167
/*
 * Apply access/modification times to a vnode.  ts[0] is the access time,
 * ts[1] the modification time.  'nullflag' is set when the caller passed a
 * NULL times pointer (i.e. "set to now"), which relaxes the permission
 * check via VA_UTIMES_NULL and suppresses the EACCES->EPERM translation.
 * The caller must hold an iocount on 'vp'.
 */
static int
setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
    int nullflag)
{
	int error;
	struct vnode_attr va;
	kauth_action_t action;

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts[0]);
	VATTR_SET(&va, va_modify_time, ts[1]);
	if (nullflag) {
		/* Let the FS apply the weaker "utimes(path, NULL)" check. */
		va.va_vaflags |= VA_UTIMES_NULL;
	}

#if NAMEDSTREAMS
	/* utimes calls are not allowed for resource forks. */
	if (vp->v_flag & VISNAMEDSTREAM) {
		error = EPERM;
		goto out;
	}
#endif

#if CONFIG_MACF
	error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		/* Explicit-times failure is a permissions error: EPERM. */
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}

	/* since we may not need to auth anything, check here */
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		if (!nullflag && error == EACCES) {
			error = EPERM;
		}
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
	}
#endif

out:
	return error;
}
7224
/*
 * Set the access and modification times of a file given its path.
 * uap->tptr may be USER_ADDR_NULL, meaning "set both times to now".
 */
/* ARGSUSED */
int
utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
{
	struct timespec ts[2];
	user_addr_t usrtvp;
	int error;
	struct nameidata nd;
	vfs_context_t ctx = vfs_context_current();

	/*
	 * AUDIT: Needed to change the order of operations to do the
	 * name lookup first because auditing wants the path.
	 */
	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	nameidone(&nd);

	/*
	 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
	 * the current time instead.
	 */
	usrtvp = uap->tptr;
	if ((error = getutimes(usrtvp, ts)) != 0) {
		goto out;
	}

	error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);

out:
	/* Release the iocount namei() took on the looked-up vnode. */
	vnode_put(nd.ni_vp);
	return error;
}
7265
7266 /*
7267 * Set the access and modification times of a file.
7268 */
7269 /* ARGSUSED */
7270 int
7271 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
7272 {
7273 struct timespec ts[2];
7274 vnode_t vp;
7275 user_addr_t usrtvp;
7276 int error;
7277
7278 AUDIT_ARG(fd, uap->fd);
7279 usrtvp = uap->tptr;
7280 if ((error = getutimes(usrtvp, ts)) != 0) {
7281 return error;
7282 }
7283 if ((error = file_vnode(uap->fd, &vp)) != 0) {
7284 return error;
7285 }
7286 if ((error = vnode_getwithref(vp))) {
7287 file_drop(uap->fd);
7288 return error;
7289 }
7290
7291 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
7292 vnode_put(vp);
7293 file_drop(uap->fd);
7294 return error;
7295 }
7296
/*
 * Truncate a file given its path name.
 *
 * Sets va_data_size on the looked-up vnode after MAC and kauth checks.
 * A negative length is rejected with EINVAL before any lookup happens.
 */
/* ARGSUSED */
int
truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;
	kauth_action_t action;

	if (uap->length < 0) {
		return EINVAL;
	}
	NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);

#if CONFIG_MACF
	/* No fd here, so the check runs with no file credential (NOCRED). */
	error = mac_vnode_check_truncate(ctx, NOCRED, vp);
	if (error) {
		goto out;
	}
#endif

	/* Preflight which rights the size change needs, then authorize. */
	if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
		goto out;
	}
	if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
		goto out;
	}
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, NOCRED, vp);
	}
#endif

out:
	vnode_put(vp);
	return error;
}
7351
/*
 * Truncate a file given a file descriptor.
 *
 * The descriptor must refer to a vnode opened for writing, or to a POSIX
 * shared-memory object (which is handed off to pshm_truncate()).  Unlike
 * truncate(), no kauth authorization is done here: having the fd open
 * with FWRITE is taken as sufficient.
 */
/* ARGSUSED */
int
ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	struct vnode_attr va;
	vnode_t vp;
	struct fileproc *fp;
	int error;
	int fd = uap->fd;

	AUDIT_ARG(fd, uap->fd);
	if (uap->length < 0) {
		return EINVAL;
	}

	/* Take a reference on the fileproc; dropped at 'out'. */
	if ((error = fp_lookup(p, fd, &fp, 0))) {
		return error;
	}

	switch (FILEGLOB_DTYPE(fp->f_fglob)) {
	case DTYPE_PSXSHM:
		/* POSIX shm objects have their own truncate path. */
		error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
		goto out;
	case DTYPE_VNODE:
		break;
	default:
		error = EINVAL;
		goto out;
	}

	vp = (vnode_t)fp->f_fglob->fg_data;

	/* Historical behavior: not-open-for-write is EINVAL, not EBADF. */
	if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EINVAL;
		goto out;
	}

	if ((error = vnode_getwithref(vp)) != 0) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

#if CONFIG_MACF
	error = mac_vnode_check_truncate(ctx,
	    fp->f_fglob->fg_cred, vp);
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}
#endif
	VATTR_INIT(&va);
	VATTR_SET(&va, va_data_size, uap->length);
	error = vnode_setattr(vp, &va, ctx);

#if CONFIG_MACF
	if (error == 0) {
		mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
	}
#endif

	(void)vnode_put(vp);
out:
	file_drop(fd);
	return error;
}
7423
7424
7425 /*
7426 * Sync an open file with synchronized I/O _file_ integrity completion
7427 */
7428 /* ARGSUSED */
7429 int
7430 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
7431 {
7432 __pthread_testcancel(1);
7433 return fsync_common(p, uap, MNT_WAIT);
7434 }
7435
7436
7437 /*
7438 * Sync an open file with synchronized I/O _file_ integrity completion
7439 *
7440 * Notes: This is a legacy support function that does not test for
7441 * thread cancellation points.
7442 */
7443 /* ARGSUSED */
7444 int
7445 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
7446 {
7447 return fsync_common(p, (struct fsync_args *)uap, MNT_WAIT);
7448 }
7449
7450
7451 /*
7452 * Sync an open file with synchronized I/O _data_ integrity completion
7453 */
7454 /* ARGSUSED */
7455 int
7456 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
7457 {
7458 __pthread_testcancel(1);
7459 return fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT);
7460 }
7461
7462
7463 /*
7464 * fsync_common
7465 *
7466 * Common fsync code to support both synchronized I/O file integrity completion
7467 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
7468 *
7469 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
7470 * will only guarantee that the file data contents are retrievable. If
 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
7472 * includes additional metadata unnecessary for retrieving the file data
7473 * contents, such as atime, mtime, ctime, etc., also be committed to stable
7474 * storage.
7475 *
7476 * Parameters: p The process
7477 * uap->fd The descriptor to synchronize
7478 * flags The data integrity flags
7479 *
7480 * Returns: int Success
7481 * fp_getfvp:EBADF Bad file descriptor
7482 * fp_getfvp:ENOTSUP fd does not refer to a vnode
7483 * VNOP_FSYNC:??? unspecified
7484 *
7485 * Notes: We use struct fsync_args because it is a short name, and all
7486 * caller argument structures are otherwise identical.
7487 */
static int
fsync_common(proc_t p, struct fsync_args *uap, int flags)
{
	vnode_t vp;
	struct fileproc *fp;
	vfs_context_t ctx = vfs_context_current();
	int error;

	AUDIT_ARG(fd, uap->fd);

	/* Takes a reference on the fd and returns its vnode. */
	if ((error = fp_getfvp(p, uap->fd, &fp, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

	error = VNOP_FSYNC(vp, flags, ctx);

#if NAMEDRSRCFORK
	/* Sync resource fork shadow file if necessary. */
	if ((error == 0) &&
	    (vp->v_flag & VISNAMEDSTREAM) &&
	    (vp->v_parent != NULLVP) &&
	    vnode_isshadow(vp) &&
	    (fp->f_flags & FP_WRITTEN)) {
		/* Best effort: a shadow-file flush failure is not reported. */
		(void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
	}
#endif

	(void)vnode_put(vp);
	file_drop(uap->fd);
	return error;
}
7525
/*
 * Duplicate files. Source must be a file, target must be a file or
 * must not exist.
 *
 * Without CPF_OVERWRITE an existing target fails with EEXIST; directories
 * on either side fail with EISDIR.  The actual copy is delegated to the
 * filesystem via VNOP_COPYFILE.
 *
 * XXX Copyfile authorisation checking is woefully inadequate, and will not
 * perform inheritance correctly.
 */
/* ARGSUSED */
int
copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
{
	vnode_t tvp, fvp, tdvp, sdvp;
	struct nameidata fromnd, tond;
	int error;
	vfs_context_t ctx = vfs_context_current();
#if CONFIG_MACF
	struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
	struct vnode_attr va;
#endif

	/* Check that the flags are valid. */

	if (uap->flags & ~CPF_MASK) {
		return EINVAL;
	}

	NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
	    UIO_USERSPACE, uap->from, ctx);
	if ((error = namei(&fromnd))) {
		return error;
	}
	fvp = fromnd.ni_vp;

	/*
	 * SAVESTART keeps tond.ni_startdir referenced so it can be released
	 * at 'out' along with tdvp/tvp.
	 */
	NDINIT(&tond, CREATE, OP_LINK,
	    LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
	    UIO_USERSPACE, uap->to, ctx);
	if ((error = namei(&tond))) {
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	if (tvp != NULL) {
		if (!(uap->flags & CPF_OVERWRITE)) {
			error = EEXIST;
			goto out;
		}
	}

	if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
		error = EISDIR;
		goto out;
	}

	/* This calls existing MAC hooks for open */
	if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
	    NULL))) {
		goto out;
	}

	if (tvp) {
		/*
		 * See unlinkat_internal for an explanation of the potential
		 * ENOENT from the MAC hook but the gist is that the MAC hook
		 * can fail because vn_getpath isn't able to return the full
		 * path. We choose to ignore this failure.
		 */
		error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
		if (error && error != ENOENT) {
			goto out;
		}
		error = 0;
	}

#if CONFIG_MACF
	VATTR_INIT(&va);
	VATTR_SET(&va, va_type, fvp->v_type);
	/* Mask off all but regular access permissions */
	VATTR_SET(&va, va_mode,
	    ((((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
	error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
	if (error) {
		goto out;
	}
#endif /* CONFIG_MACF */

	if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) {
		goto out;
	}

	/* Copying a file onto its own parent directory makes no sense. */
	if (fvp == tdvp) {
		error = EINVAL;
	}
	/*
	 * If source is the same as the destination (that is the
	 * same inode number) then there is nothing to do.
	 * (fixed to have POSIX semantics - CSM 3/2/98)
	 */
	if (fvp == tvp) {
		/* -1 is an internal "silent success" marker, see below. */
		error = -1;
	}
	if (!error) {
		error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
	}
out:
	sdvp = tond.ni_startdir;
	/*
	 * nameidone has to happen before we vnode_put(tdvp)
	 * since it may need to release the fs_nodelock on the tdvp
	 */
	nameidone(&tond);

	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	vnode_put(sdvp);
out1:
	vnode_put(fvp);

	nameidone(&fromnd);

	/* Translate the internal "nothing to do" marker into success. */
	if (error == -1) {
		return 0;
	}
	return error;
}
7653
7654 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7655
/*
 * Helper function for doing clones. The caller is expected to provide an
 * iocounted source vnode and release it.
 *
 * 'fvp' is the source (regular file, symlink, or non-root directory);
 * 'dst' names the clone target, looked up relative to 'dst_dirfd', and must
 * not already exist.  Source and target must be on the same mount (EXDEV
 * otherwise).  'data_read_authorised' lets fclonefileat() skip the
 * READ_DATA check when the source fd was already opened for reading.
 * CLONE_NOFOLLOW and CLONE_NOOWNERCOPY are honored from 'flags'.
 */
static int
clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
    user_addr_t dst, uint32_t flags, vfs_context_t ctx)
{
	vnode_t tvp, tdvp;
	struct nameidata tond;
	int error;
	int follow;
	boolean_t free_src_acl;
	boolean_t attr_cleanup;
	enum vtype v_type;
	kauth_action_t action;
	struct componentname *cnp;
	uint32_t defaulted;
	struct vnode_attr va;
	struct vnode_attr nva;
	uint32_t vnop_flags;

	/* Only files, symlinks and ordinary directories can be cloned. */
	v_type = vnode_vtype(fvp);
	switch (v_type) {
	case VLNK:
		/* FALLTHRU */
	case VREG:
		action = KAUTH_VNODE_ADD_FILE;
		break;
	case VDIR:
		/* Volume roots and mount points cannot be cloned. */
		if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
		    fvp->v_mountedhere) {
			return EINVAL;
		}
		action = KAUTH_VNODE_ADD_SUBDIRECTORY;
		break;
	default:
		return EINVAL;
	}

	AUDIT_ARG(fd2, dst_dirfd);
	AUDIT_ARG(value32, flags);

	follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
	NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
	    UIO_USERSPACE, dst, ctx);
	if ((error = nameiat(&tond, dst_dirfd))) {
		return error;
	}
	cnp = &tond.ni_cnd;
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;

	free_src_acl = FALSE;
	attr_cleanup = FALSE;

	/* The clone target must not already exist. */
	if (tvp != NULL) {
		error = EEXIST;
		goto out;
	}

	if (vnode_mount(tdvp) != vnode_mount(fvp)) {
		error = EXDEV;
		goto out;
	}

#if CONFIG_MACF
	if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp))) {
		goto out;
	}
#endif
	/* Authorize creating the new entry in the target directory... */
	if ((error = vnode_authorize(tdvp, NULL, action, ctx))) {
		goto out;
	}

	/* ...and reading the source (READ_DATA may be pre-authorized). */
	action = KAUTH_VNODE_GENERIC_READ_BITS;
	if (data_read_authorised) {
		action &= ~KAUTH_VNODE_READ_DATA;
	}
	if ((error = vnode_authorize(fvp, NULL, action, ctx))) {
		goto out;
	}

	/*
	 * certain attributes may need to be changed from the source, we ask for
	 * those here.
	 */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	VATTR_WANTED(&va, va_gid);
	VATTR_WANTED(&va, va_mode);
	VATTR_WANTED(&va, va_flags);
	VATTR_WANTED(&va, va_acl);

	if ((error = vnode_getattr(fvp, &va, ctx)) != 0) {
		goto out;
	}

	VATTR_INIT(&nva);
	VATTR_SET(&nva, va_type, v_type);
	if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
		/* The getattr-allocated ACL must be freed at 'out'. */
		VATTR_SET(&nva, va_acl, va.va_acl);
		free_src_acl = TRUE;
	}

	/* Handle ACL inheritance, initialize vap. */
	if (v_type == VLNK) {
		error = vnode_authattr_new(tdvp, &nva, 0, ctx);
	} else {
		error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
		if (error) {
			goto out;
		}
		attr_cleanup = TRUE;
	}

	vnop_flags = VNODE_CLONEFILE_DEFAULT;
	/*
	 * We've got initial values for all security parameters,
	 * If we are superuser, then we can change owners to be the
	 * same as the source. Both superuser and the owner have default
	 * WRITE_SECURITY privileges so all other fields can be taken
	 * from source as well.
	 */
	if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
		if (VATTR_IS_SUPPORTED(&va, va_uid)) {
			VATTR_SET(&nva, va_uid, va.va_uid);
		}
		if (VATTR_IS_SUPPORTED(&va, va_gid)) {
			VATTR_SET(&nva, va_gid, va.va_gid);
		}
	} else {
		vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
	}

	if (VATTR_IS_SUPPORTED(&va, va_mode)) {
		VATTR_SET(&nva, va_mode, va.va_mode);
	}
	if (VATTR_IS_SUPPORTED(&va, va_flags)) {
		VATTR_SET(&nva, va_flags,
		    ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
		    (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
	}

	error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);

	if (!error && tvp) {
		int update_flags = 0;
#if CONFIG_FSE
		int fsevent;
#endif /* CONFIG_FSE */

		/*
		 * If some of the requested attributes weren't handled by the
		 * VNOP, use our fallback code.
		 *
		 * NOTE(review): this tests &va (the attrs fetched from the
		 * source) rather than &nva (the attrs handed to the VNOP);
		 * confirm whether &nva was intended here.
		 */
		if (!VATTR_ALL_SUPPORTED(&va)) {
			(void)vnode_setattr_fallback(tvp, &nva, ctx);
		}

#if CONFIG_MACF
		(void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
		    VNODE_LABEL_CREATE, ctx);
#endif

		// Make sure the name & parent pointers are hooked up
		if (tvp->v_name == NULL) {
			update_flags |= VNODE_UPDATE_NAME;
		}
		if (tvp->v_parent == NULLVP) {
			update_flags |= VNODE_UPDATE_PARENT;
		}

		if (update_flags) {
			(void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_hash, update_flags);
		}

#if CONFIG_FSE
		switch (vnode_vtype(tvp)) {
		case VLNK:
			/* FALLTHRU */
		case VREG:
			fsevent = FSE_CREATE_FILE;
			break;
		case VDIR:
			fsevent = FSE_CREATE_DIR;
			break;
		default:
			goto out;
		}

		if (need_fsevent(fsevent, tvp)) {
			/*
			 * The following is a sequence of three explicit events.
			 * A pair of FSE_CLONE events representing the source and destination
			 * followed by an FSE_CREATE_[FILE | DIR] for the destination.
			 * fseventsd may coalesce the destination clone and create events
			 * into a single event resulting in the following sequence for a client
			 * FSE_CLONE (src)
			 * FSE_CLONE | FSE_CREATE (dst)
			 */
			add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
			add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
			    FSE_ARG_DONE);
		}
#endif /* CONFIG_FSE */
	}

out:
	if (attr_cleanup) {
		vn_attribute_cleanup(&nva, defaulted);
	}
	if (free_src_acl && va.va_acl) {
		kauth_acl_free(va.va_acl);
	}
	nameidone(&tond);
	if (tvp) {
		vnode_put(tvp);
	}
	vnode_put(tdvp);
	return error;
}
7880
7881 /*
7882 * clone files or directories, target must not exist.
7883 */
7884 /* ARGSUSED */
7885 int
7886 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7887 __unused int32_t *retval)
7888 {
7889 vnode_t fvp;
7890 struct nameidata fromnd;
7891 int follow;
7892 int error;
7893 vfs_context_t ctx = vfs_context_current();
7894
7895 /* Check that the flags are valid. */
7896 if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
7897 return EINVAL;
7898 }
7899
7900 AUDIT_ARG(fd, uap->src_dirfd);
7901
7902 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7903 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7904 UIO_USERSPACE, uap->src, ctx);
7905 if ((error = nameiat(&fromnd, uap->src_dirfd))) {
7906 return error;
7907 }
7908
7909 fvp = fromnd.ni_vp;
7910 nameidone(&fromnd);
7911
7912 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7913 uap->flags, ctx);
7914
7915 vnode_put(fvp);
7916 return error;
7917 }
7918
/*
 * Clone the file referenced by an open fd to a new path.  The source fd
 * must be open for reading (EBADF otherwise); since that proves read
 * access, clonefile_internal() is told to skip the READ_DATA check.
 */
int
fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
    __unused int32_t *retval)
{
	vnode_t fvp;
	struct fileproc *fp;
	int error;
	vfs_context_t ctx = vfs_context_current();

	/* Check that the flags are valid. */
	if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY)) {
		return EINVAL;
	}

	AUDIT_ARG(fd, uap->src_fd);
	/* Takes a reference on the fd; dropped at 'out'. */
	error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
	if (error) {
		return error;
	}

	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	if ((error = vnode_getwithref(fvp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, fvp, ARG_VNODE1);

	/* Read access already proven by the open fd: TRUE skips READ_DATA. */
	error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
	    uap->flags, ctx);

	vnode_put(fvp);
out:
	file_drop(uap->src_fd);
	return error;
}
7959
7960 static int
7961 rename_submounts_callback(mount_t mp, void *arg)
7962 {
7963 int error = 0;
7964 mount_t pmp = (mount_t)arg;
7965 int prefix_len = strlen(pmp->mnt_vfsstat.f_mntonname);
7966
7967 if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) {
7968 return 0;
7969 }
7970
7971 if (mp->mnt_vfsstat.f_mntonname[prefix_len] != '/') {
7972 return 0;
7973 }
7974
7975 if ((error = vfs_busy(mp, LK_NOWAIT))) {
7976 printf("vfs_busy failed with %d for %s\n", error, mp->mnt_vfsstat.f_mntonname);
7977 return -1;
7978 }
7979
7980 int pathlen = MAXPATHLEN;
7981 if ((error = vn_getpath_ext(mp->mnt_vnodecovered, NULL, mp->mnt_vfsstat.f_mntonname, &pathlen, VN_GETPATH_FSENTER))) {
7982 printf("vn_getpath_ext failed with %d for mnt_vnodecovered of %s\n", error, mp->mnt_vfsstat.f_mntonname);
7983 }
7984
7985 vfs_unbusy(mp);
7986
7987 return error;
7988 }
7989
7990 /*
7991 * Rename files. Source and destination must either both be directories,
7992 * or both not be directories. If target is a directory, it must be empty.
7993 */
7994 /* ARGSUSED */
7995 static int
7996 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7997 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7998 {
7999 if (flags & ~VFS_RENAME_FLAGS_MASK) {
8000 return EINVAL;
8001 }
8002
8003 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL)) {
8004 return EINVAL;
8005 }
8006
8007 vnode_t tvp, tdvp;
8008 vnode_t fvp, fdvp;
8009 struct nameidata *fromnd, *tond;
8010 int error;
8011 int do_retry;
8012 int retry_count;
8013 int mntrename;
8014 int need_event;
8015 int need_kpath2;
8016 int has_listeners;
8017 const char *oname = NULL;
8018 char *from_name = NULL, *to_name = NULL;
8019 char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL;
8020 int from_len = 0, to_len = 0;
8021 int from_len_no_firmlink = 0, to_len_no_firmlink = 0;
8022 int holding_mntlock;
8023 mount_t locked_mp = NULL;
8024 vnode_t oparent = NULLVP;
8025 #if CONFIG_FSE
8026 fse_info from_finfo, to_finfo;
8027 #endif
8028 int from_truncated = 0, to_truncated = 0;
8029 int from_truncated_no_firmlink = 0, to_truncated_no_firmlink = 0;
8030 int batched = 0;
8031 struct vnode_attr *fvap, *tvap;
8032 int continuing = 0;
8033 /* carving out a chunk for structs that are too big to be on stack. */
8034 struct {
8035 struct nameidata from_node, to_node;
8036 struct vnode_attr fv_attr, tv_attr;
8037 } * __rename_data;
8038 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
8039 fromnd = &__rename_data->from_node;
8040 tond = &__rename_data->to_node;
8041
8042 holding_mntlock = 0;
8043 do_retry = 0;
8044 retry_count = 0;
8045 retry:
8046 fvp = tvp = NULL;
8047 fdvp = tdvp = NULL;
8048 fvap = tvap = NULL;
8049 mntrename = FALSE;
8050
8051 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
8052 segflg, from, ctx);
8053 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
8054
8055 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
8056 segflg, to, ctx);
8057 tond->ni_flag = NAMEI_COMPOUNDRENAME;
8058
8059 continue_lookup:
8060 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8061 if ((error = nameiat(fromnd, fromfd))) {
8062 goto out1;
8063 }
8064 fdvp = fromnd->ni_dvp;
8065 fvp = fromnd->ni_vp;
8066
8067 if (fvp && fvp->v_type == VDIR) {
8068 tond->ni_cnd.cn_flags |= WILLBEDIR;
8069 }
8070 }
8071
8072 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
8073 if ((error = nameiat(tond, tofd))) {
8074 /*
8075 * Translate error code for rename("dir1", "dir2/.").
8076 */
8077 if (error == EISDIR && fvp->v_type == VDIR) {
8078 error = EINVAL;
8079 }
8080 goto out1;
8081 }
8082 tdvp = tond->ni_dvp;
8083 tvp = tond->ni_vp;
8084 }
8085
8086 #if DEVELOPMENT || DEBUG
8087 /*
8088 * XXX VSWAP: Check for entitlements or special flag here
8089 * so we can restrict access appropriately.
8090 */
8091 #else /* DEVELOPMENT || DEBUG */
8092
8093 if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
8094 error = EPERM;
8095 goto out1;
8096 }
8097
8098 if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
8099 error = EPERM;
8100 goto out1;
8101 }
8102 #endif /* DEVELOPMENT || DEBUG */
8103
8104 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
8105 error = ENOENT;
8106 goto out1;
8107 }
8108
8109 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
8110 error = EEXIST;
8111 goto out1;
8112 }
8113
8114 batched = vnode_compound_rename_available(fdvp);
8115
8116 #if CONFIG_FSE
8117 need_event = need_fsevent(FSE_RENAME, fdvp);
8118 if (need_event) {
8119 if (fvp) {
8120 get_fse_info(fvp, &from_finfo, ctx);
8121 } else {
8122 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
8123 if (error) {
8124 goto out1;
8125 }
8126
8127 fvap = &__rename_data->fv_attr;
8128 }
8129
8130 if (tvp) {
8131 get_fse_info(tvp, &to_finfo, ctx);
8132 } else if (batched) {
8133 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
8134 if (error) {
8135 goto out1;
8136 }
8137
8138 tvap = &__rename_data->tv_attr;
8139 }
8140 }
8141 #else
8142 need_event = 0;
8143 #endif /* CONFIG_FSE */
8144
8145 has_listeners = kauth_authorize_fileop_has_listeners();
8146
8147 need_kpath2 = 0;
8148 #if CONFIG_AUDIT
8149 if (AUDIT_RECORD_EXISTS()) {
8150 need_kpath2 = 1;
8151 }
8152 #endif
8153
8154 if (need_event || has_listeners) {
8155 if (from_name == NULL) {
8156 GET_PATH(from_name);
8157 if (from_name == NULL) {
8158 error = ENOMEM;
8159 goto out1;
8160 }
8161 }
8162
8163 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
8164
8165 if (from_name_no_firmlink == NULL) {
8166 GET_PATH(from_name_no_firmlink);
8167 if (from_name_no_firmlink == NULL) {
8168 error = ENOMEM;
8169 goto out1;
8170 }
8171 }
8172
8173 from_len_no_firmlink = safe_getpath_no_firmlink(fdvp, fromnd->ni_cnd.cn_nameptr, from_name_no_firmlink, MAXPATHLEN, &from_truncated_no_firmlink);
8174 }
8175
8176 if (need_event || need_kpath2 || has_listeners) {
8177 if (to_name == NULL) {
8178 GET_PATH(to_name);
8179 if (to_name == NULL) {
8180 error = ENOMEM;
8181 goto out1;
8182 }
8183 }
8184
8185 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
8186
8187 if (to_name_no_firmlink == NULL) {
8188 GET_PATH(to_name_no_firmlink);
8189 if (to_name_no_firmlink == NULL) {
8190 error = ENOMEM;
8191 goto out1;
8192 }
8193 }
8194
8195 to_len_no_firmlink = safe_getpath_no_firmlink(tdvp, tond->ni_cnd.cn_nameptr, to_name_no_firmlink, MAXPATHLEN, &to_truncated_no_firmlink);
8196 if (to_name && need_kpath2) {
8197 AUDIT_ARG(kpath, to_name, ARG_KPATH2);
8198 }
8199 }
8200 if (!fvp) {
8201 /*
8202 * Claim: this check will never reject a valid rename.
8203 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
8204 * Suppose fdvp and tdvp are not on the same mount.
8205 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
8206 * then you can't move it to within another dir on the same mountpoint.
8207 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
8208 *
8209 * If this check passes, then we are safe to pass these vnodes to the same FS.
8210 */
8211 if (fdvp->v_mount != tdvp->v_mount) {
8212 error = EXDEV;
8213 goto out1;
8214 }
8215 goto skipped_lookup;
8216 }
8217
8218 if (!batched) {
8219 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
8220 if (error) {
8221 if (error == ENOENT) {
8222 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8223 /*
8224 * We encountered a race where after doing the namei, tvp stops
8225 * being valid. If so, simply re-drive the rename call from the
8226 * top.
8227 */
8228 do_retry = 1;
8229 retry_count += 1;
8230 }
8231 }
8232 goto out1;
8233 }
8234 }
8235
8236 /*
8237 * If the source and destination are the same (i.e. they're
8238 * links to the same vnode) and the target file system is
8239 * case sensitive, then there is nothing to do.
8240 *
8241 * XXX Come back to this.
8242 */
8243 if (fvp == tvp) {
8244 int pathconf_val;
8245
8246 /*
8247 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
8248 * then assume that this file system is case sensitive.
8249 */
8250 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
8251 pathconf_val != 0) {
8252 goto out1;
8253 }
8254 }
8255
8256 /*
8257 * Allow the renaming of mount points.
8258 * - target must not exist
8259 * - target must reside in the same directory as source
8260 * - union mounts cannot be renamed
8261 * - "/" cannot be renamed
8262 *
8263 * XXX Handle this in VFS after a continued lookup (if we missed
8264 * in the cache to start off)
8265 *
8266 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
8267 * we'll skip past here. The file system is responsible for
8268 * checking that @tvp is not a descendent of @fvp and vice versa
8269 * so it should always return EINVAL if either @tvp or @fvp is the
8270 * root of a volume.
8271 */
8272 if ((fvp->v_flag & VROOT) &&
8273 (fvp->v_type == VDIR) &&
8274 (tvp == NULL) &&
8275 (fvp->v_mountedhere == NULL) &&
8276 (fdvp == tdvp) &&
8277 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
8278 ((fvp->v_mount->mnt_kern_flag & MNTK_SYSTEM) == 0) &&
8279 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
8280 vnode_t coveredvp;
8281
8282 /* switch fvp to the covered vnode */
8283 coveredvp = fvp->v_mount->mnt_vnodecovered;
8284 if ((vnode_getwithref(coveredvp))) {
8285 error = ENOENT;
8286 goto out1;
8287 }
8288 vnode_put(fvp);
8289
8290 fvp = coveredvp;
8291 mntrename = TRUE;
8292 }
8293 /*
8294 * Check for cross-device rename.
8295 */
8296 if ((fvp->v_mount != tdvp->v_mount) ||
8297 (tvp && (fvp->v_mount != tvp->v_mount))) {
8298 error = EXDEV;
8299 goto out1;
8300 }
8301
8302 /*
8303 * If source is the same as the destination (that is the
8304 * same inode number) then there is nothing to do...
8305 * EXCEPT if the underlying file system supports case
8306 * insensitivity and is case preserving. In this case
8307 * the file system needs to handle the special case of
8308 * getting the same vnode as target (fvp) and source (tvp).
8309 *
8310 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
8311 * and _PC_CASE_PRESERVING can have this exception, and they need to
8312 * handle the special case of getting the same vnode as target and
8313 * source. NOTE: Then the target is unlocked going into vnop_rename,
8314 * so not to cause locking problems. There is a single reference on tvp.
8315 *
8316 * NOTE - that fvp == tvp also occurs if they are hard linked and
8317 * that correct behaviour then is just to return success without doing
8318 * anything.
8319 *
8320 * XXX filesystem should take care of this itself, perhaps...
8321 */
8322 if (fvp == tvp && fdvp == tdvp) {
8323 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
8324 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
8325 fromnd->ni_cnd.cn_namelen)) {
8326 goto out1;
8327 }
8328 }
8329
8330 if (holding_mntlock && fvp->v_mount != locked_mp) {
8331 /*
8332 * we're holding a reference and lock
8333 * on locked_mp, but it no longer matches
8334 * what we want to do... so drop our hold
8335 */
8336 mount_unlock_renames(locked_mp);
8337 mount_drop(locked_mp, 0);
8338 holding_mntlock = 0;
8339 }
8340 if (tdvp != fdvp && fvp->v_type == VDIR) {
8341 /*
8342 * serialize renames that re-shape
8343 * the tree... if holding_mntlock is
8344 * set, then we're ready to go...
8345 * otherwise we
8346 * first need to drop the iocounts
8347 * we picked up, second take the
8348 * lock to serialize the access,
8349 * then finally start the lookup
8350 * process over with the lock held
8351 */
8352 if (!holding_mntlock) {
8353 /*
8354 * need to grab a reference on
8355 * the mount point before we
8356 * drop all the iocounts... once
8357 * the iocounts are gone, the mount
8358 * could follow
8359 */
8360 locked_mp = fvp->v_mount;
8361 mount_ref(locked_mp, 0);
8362
8363 /*
8364 * nameidone has to happen before we vnode_put(tvp)
8365 * since it may need to release the fs_nodelock on the tvp
8366 */
8367 nameidone(tond);
8368
8369 if (tvp) {
8370 vnode_put(tvp);
8371 }
8372 vnode_put(tdvp);
8373
8374 /*
8375 * nameidone has to happen before we vnode_put(fdvp)
8376 * since it may need to release the fs_nodelock on the fvp
8377 */
8378 nameidone(fromnd);
8379
8380 vnode_put(fvp);
8381 vnode_put(fdvp);
8382
8383 mount_lock_renames(locked_mp);
8384 holding_mntlock = 1;
8385
8386 goto retry;
8387 }
8388 } else {
8389 /*
8390 * when we dropped the iocounts to take
8391 * the lock, we allowed the identity of
8392 * the various vnodes to change... if they did,
8393 * we may no longer be dealing with a rename
8394 * that reshapes the tree... once we're holding
8395 * the iocounts, the vnodes can't change type
8396 * so we're free to drop the lock at this point
8397 * and continue on
8398 */
8399 if (holding_mntlock) {
8400 mount_unlock_renames(locked_mp);
8401 mount_drop(locked_mp, 0);
8402 holding_mntlock = 0;
8403 }
8404 }
8405
8406 // save these off so we can later verify that fvp is the same
8407 oname = fvp->v_name;
8408 oparent = fvp->v_parent;
8409
8410 skipped_lookup:
8411 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
8412 tdvp, &tvp, &tond->ni_cnd, tvap,
8413 flags, ctx);
8414
8415 if (holding_mntlock) {
8416 /*
8417 * we can drop our serialization
8418 * lock now
8419 */
8420 mount_unlock_renames(locked_mp);
8421 mount_drop(locked_mp, 0);
8422 holding_mntlock = 0;
8423 }
8424 if (error) {
8425 if (error == EDATALESS) {
8426 /*
8427 * If we've been here before, something has gone
8428 * horribly wrong and we should just get out lest
8429 * we spiral around the drain forever.
8430 */
8431 if (flags & VFS_RENAME_DATALESS) {
8432 error = EIO;
8433 goto out1;
8434 }
8435
8436 /*
8437 * The object we're renaming is dataless (or has a
8438 * dataless descendent) and requires materialization
8439 * before the rename occurs. But we're holding the
8440 * mount point's rename lock, so it's not safe to
8441 * make the upcall.
8442 *
8443 * In this case, we release the lock, perform the
8444 * materialization, and start the whole thing over.
8445 */
8446 error = vnode_materialize_dataless_file(fvp,
8447 NAMESPACE_HANDLER_RENAME_OP);
8448
8449 if (error == 0) {
8450 /*
8451 * The next time around we need to tell the
8452 * file system that the materialization has
8453 * been performed.
8454 */
8455 flags |= VFS_RENAME_DATALESS;
8456 do_retry = 1;
8457 }
8458 goto out1;
8459 }
8460 if (error == EKEEPLOOKING) {
8461 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8462 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
8463 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
8464 }
8465 }
8466
8467 fromnd->ni_vp = fvp;
8468 tond->ni_vp = tvp;
8469
8470 goto continue_lookup;
8471 }
8472
8473 /*
8474 * We may encounter a race in the VNOP where the destination didn't
8475 * exist when we did the namei, but it does by the time we go and
8476 * try to create the entry. In this case, we should re-drive this rename
8477 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
8478 * but other filesystems susceptible to this race could return it, too.
8479 */
8480 if (error == ERECYCLE) {
8481 do_retry = 1;
8482 }
8483
8484 /*
8485 * For compound VNOPs, the authorization callback may return
8486 * ENOENT in case of racing hardlink lookups hitting the name
8487 * cache, redrive the lookup.
8488 */
8489 if (batched && error == ENOENT) {
8490 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8491 do_retry = 1;
8492 retry_count += 1;
8493 }
8494 }
8495
8496 goto out1;
8497 }
8498
8499 /* call out to allow 3rd party notification of rename.
8500 * Ignore result of kauth_authorize_fileop call.
8501 */
8502 kauth_authorize_fileop(vfs_context_ucred(ctx),
8503 KAUTH_FILEOP_RENAME,
8504 (uintptr_t)from_name, (uintptr_t)to_name);
8505 if (flags & VFS_RENAME_SWAP) {
8506 kauth_authorize_fileop(vfs_context_ucred(ctx),
8507 KAUTH_FILEOP_RENAME,
8508 (uintptr_t)to_name, (uintptr_t)from_name);
8509 }
8510
8511 #if CONFIG_FSE
8512 if (from_name != NULL && to_name != NULL) {
8513 if (from_truncated || to_truncated) {
8514 // set it here since only the from_finfo gets reported up to user space
8515 from_finfo.mode |= FSE_TRUNCATED_PATH;
8516 }
8517
8518 if (tvap && tvp) {
8519 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
8520 }
8521 if (fvap) {
8522 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
8523 }
8524
8525 if (tvp) {
8526 add_fsevent(FSE_RENAME, ctx,
8527 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8528 FSE_ARG_FINFO, &from_finfo,
8529 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8530 FSE_ARG_FINFO, &to_finfo,
8531 FSE_ARG_DONE);
8532 if (flags & VFS_RENAME_SWAP) {
8533 /*
8534 * Strictly speaking, swap is the equivalent of
8535 * *three* renames. FSEvents clients should only take
8536 * the events as a hint, so we only bother reporting
8537 * two.
8538 */
8539 add_fsevent(FSE_RENAME, ctx,
8540 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8541 FSE_ARG_FINFO, &to_finfo,
8542 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8543 FSE_ARG_FINFO, &from_finfo,
8544 FSE_ARG_DONE);
8545 }
8546 } else {
8547 add_fsevent(FSE_RENAME, ctx,
8548 FSE_ARG_STRING, from_len_no_firmlink, from_name_no_firmlink,
8549 FSE_ARG_FINFO, &from_finfo,
8550 FSE_ARG_STRING, to_len_no_firmlink, to_name_no_firmlink,
8551 FSE_ARG_DONE);
8552 }
8553 }
8554 #endif /* CONFIG_FSE */
8555
8556 /*
8557 * update filesystem's mount point data
8558 */
8559 if (mntrename) {
8560 char *cp, *pathend, *mpname;
8561 char * tobuf;
8562 struct mount *mp;
8563 int maxlen;
8564 size_t len = 0;
8565
8566 mp = fvp->v_mountedhere;
8567
8568 if (vfs_busy(mp, LK_NOWAIT)) {
8569 error = EBUSY;
8570 goto out1;
8571 }
8572 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
8573
8574 if (UIO_SEG_IS_USER_SPACE(segflg)) {
8575 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
8576 } else {
8577 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
8578 }
8579 if (!error) {
8580 /* find current mount point prefix */
8581 pathend = &mp->mnt_vfsstat.f_mntonname[0];
8582 for (cp = pathend; *cp != '\0'; ++cp) {
8583 if (*cp == '/') {
8584 pathend = cp + 1;
8585 }
8586 }
8587 /* find last component of target name */
8588 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
8589 if (*cp == '/') {
8590 mpname = cp + 1;
8591 }
8592 }
8593
8594 /* Update f_mntonname of sub mounts */
8595 vfs_iterate(0, rename_submounts_callback, (void *)mp);
8596
8597 /* append name to prefix */
8598 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
8599 bzero(pathend, maxlen);
8600
8601 strlcpy(pathend, mpname, maxlen);
8602 }
8603 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
8604
8605 vfs_unbusy(mp);
8606
8607 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
8608 }
8609 /*
8610 * fix up name & parent pointers. note that we first
8611 * check that fvp has the same name/parent pointers it
8612 * had before the rename call... this is a 'weak' check
8613 * at best...
8614 *
8615 * XXX oparent and oname may not be set in the compound vnop case
8616 */
8617 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
8618 int update_flags;
8619
8620 update_flags = VNODE_UPDATE_NAME;
8621
8622 if (fdvp != tdvp) {
8623 update_flags |= VNODE_UPDATE_PARENT;
8624 }
8625
8626 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
8627 }
8628 out1:
8629 if (to_name != NULL) {
8630 RELEASE_PATH(to_name);
8631 to_name = NULL;
8632 }
8633 if (to_name_no_firmlink != NULL) {
8634 RELEASE_PATH(to_name_no_firmlink);
8635 to_name_no_firmlink = NULL;
8636 }
8637 if (from_name != NULL) {
8638 RELEASE_PATH(from_name);
8639 from_name = NULL;
8640 }
8641 if (from_name_no_firmlink != NULL) {
8642 RELEASE_PATH(from_name_no_firmlink);
8643 from_name_no_firmlink = NULL;
8644 }
8645 if (holding_mntlock) {
8646 mount_unlock_renames(locked_mp);
8647 mount_drop(locked_mp, 0);
8648 holding_mntlock = 0;
8649 }
8650 if (tdvp) {
8651 /*
8652 * nameidone has to happen before we vnode_put(tdvp)
8653 * since it may need to release the fs_nodelock on the tdvp
8654 */
8655 nameidone(tond);
8656
8657 if (tvp) {
8658 vnode_put(tvp);
8659 }
8660 vnode_put(tdvp);
8661 }
8662 if (fdvp) {
8663 /*
8664 * nameidone has to happen before we vnode_put(fdvp)
8665 * since it may need to release the fs_nodelock on the fdvp
8666 */
8667 nameidone(fromnd);
8668
8669 if (fvp) {
8670 vnode_put(fvp);
8671 }
8672 vnode_put(fdvp);
8673 }
8674
8675 /*
8676 * If things changed after we did the namei, then we will re-drive
8677 * this rename call from the top.
8678 */
8679 if (do_retry) {
8680 do_retry = 0;
8681 goto retry;
8682 }
8683
8684 FREE(__rename_data, M_TEMP);
8685 return error;
8686 }
8687
8688 int
8689 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
8690 {
8691 return renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
8692 AT_FDCWD, uap->to, UIO_USERSPACE, 0);
8693 }
8694
8695 int
8696 renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
8697 {
8698 return renameat_internal(
8699 vfs_context_current(),
8700 uap->fromfd, uap->from,
8701 uap->tofd, uap->to,
8702 UIO_USERSPACE, uap->flags);
8703 }
8704
8705 int
8706 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
8707 {
8708 return renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
8709 uap->tofd, uap->to, UIO_USERSPACE, 0);
8710 }
8711
8712 /*
8713 * Make a directory file.
8714 *
8715 * Returns: 0 Success
8716 * EEXIST
8717 * namei:???
8718 * vnode_authorize:???
8719 * vn_create:???
8720 */
8721 /* ARGSUSED */
static int
mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
    enum uio_seg segflg)
{
	vnode_t vp, dvp;
	int error;
	int update_flags = 0;
	int batched;          /* non-zero if the FS supports compound mkdir */
	struct nameidata nd;

	AUDIT_ARG(mode, vap->va_mode);
	/* Resolve the parent directory; the leaf is being created. */
	NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
	    path, ctx);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	/* Ask namei to attempt lookup+mkdir as a single compound VNOP. */
	nd.ni_flag = NAMEI_COMPOUNDMKDIR;

continue_lookup:
	error = nameiat(&nd, fd);
	if (error) {
		return error;
	}
	dvp = nd.ni_dvp;
	vp = nd.ni_vp;

	/* Lookup found an existing object at the target name. */
	if (vp != NULL) {
		error = EEXIST;
		goto out;
	}

	batched = vnode_compound_mkdir_available(dvp);

	VATTR_SET(vap, va_type, VDIR);

	/*
	 * XXX
	 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
	 * only get EXISTS or EISDIR for existing path components, and not that it could see
	 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
	 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
	 */
	if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
		if (error == EACCES || error == EPERM) {
			int error2;

			/* Drop the first lookup's state before re-driving. */
			nameidone(&nd);
			vnode_put(dvp);
			dvp = NULLVP;

			/*
			 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
			 * rather than EACCESS if the target exists.
			 */
			NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
			    path, ctx);
			error2 = nameiat(&nd, fd);
			if (error2) {
				/* Target really absent: keep the original EACCES/EPERM. */
				goto out;
			} else {
				/* Target exists after all: report EEXIST; vp is put in out:. */
				vp = nd.ni_vp;
				error = EEXIST;
				goto out;
			}
		}

		goto out;
	}

	/*
	 * make the directory
	 */
	if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
		if (error == EKEEPLOOKING) {
			/* Compound VNOP needs the lookup continued; loop back. */
			nd.ni_vp = vp;
			goto continue_lookup;
		}

		goto out;
	}

	// Make sure the name & parent pointers are hooked up
	if (vp->v_name == NULL) {
		update_flags |= VNODE_UPDATE_NAME;
	}
	if (vp->v_parent == NULLVP) {
		update_flags |= VNODE_UPDATE_PARENT;
	}

	if (update_flags) {
		vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
	}

#if CONFIG_FSE
	add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
#endif

out:
	/*
	 * nameidone has to happen before we vnode_put(dvp)
	 * since it may need to release the fs_nodelock on the dvp
	 */
	nameidone(&nd);

	if (vp) {
		vnode_put(vp);
	}
	if (dvp) {
		vnode_put(dvp);
	}

	return error;
}
8833
8834 /*
8835 * mkdir_extended: Create a directory; with extended security (ACL).
8836 *
8837 * Parameters: p Process requesting to create the directory
8838 * uap User argument descriptor (see below)
8839 * retval (ignored)
8840 *
8841 * Indirect: uap->path Path of directory to create
8842 * uap->mode Access permissions to set
8843 * uap->xsecurity ACL to set
8844 *
8845 * Returns: 0 Success
8846 * !0 Not success
8847 *
8848 */
8849 int
8850 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8851 {
8852 int ciferror;
8853 kauth_filesec_t xsecdst;
8854 struct vnode_attr va;
8855
8856 AUDIT_ARG(owner, uap->uid, uap->gid);
8857
8858 xsecdst = NULL;
8859 if ((uap->xsecurity != USER_ADDR_NULL) &&
8860 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)) {
8861 return ciferror;
8862 }
8863
8864 VATTR_INIT(&va);
8865 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8866 if (xsecdst != NULL) {
8867 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8868 }
8869
8870 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8871 UIO_USERSPACE);
8872 if (xsecdst != NULL) {
8873 kauth_filesec_free(xsecdst);
8874 }
8875 return ciferror;
8876 }
8877
8878 int
8879 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8880 {
8881 struct vnode_attr va;
8882
8883 VATTR_INIT(&va);
8884 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8885
8886 return mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8887 UIO_USERSPACE);
8888 }
8889
8890 int
8891 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8892 {
8893 struct vnode_attr va;
8894
8895 VATTR_INIT(&va);
8896 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8897
8898 return mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8899 UIO_USERSPACE);
8900 }
8901
/*
 * rmdirat_internal: common implementation behind rmdir(2) and
 * directory-removal via unlinkat(2).
 *
 * Looks up `dirpath' relative to `fd', authorizes the removal, emits
 * fsevents/kauth notifications when anyone is listening, and handles two
 * restart situations: compound-VNOP continuation (EKEEPLOOKING / racing
 * ENOENT) and the AppleDouble orphan-cleanup retry loop.
 *
 * Returns: 0 on success, otherwise an errno (EBUSY for a mount root,
 * EPERM for a swap file on non-DEBUG kernels, or whatever the lookup,
 * authorization, or VNOP layers report).
 */
static int
rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
    enum uio_seg segflg, int unlink_flags)
{
	vnode_t vp, dvp;
	int error;
	struct nameidata nd;
	char *path = NULL;                 /* pathname for notifications */
	char *no_firmlink_path = NULL;     /* same, with firmlinks unresolved */
	int len_path = 0;
	int len_no_firmlink_path = 0;
	int has_listeners = 0;
	int need_event = 0;
	int truncated_path = 0;
	int truncated_no_firmlink_path = 0;
#if CONFIG_FSE
	struct vnode_attr va;
#endif /* CONFIG_FSE */
	struct vnode_attr *vap = NULL;
	int restart_count = 0;             /* bounds ENOENT-race redrives */
	int batched;                       /* non-zero for compound rmdir */

	int restart_flag;

	/*
	 * This loop exists to restart rmdir in the unlikely case that two
	 * processes are simultaneously trying to remove the same directory
	 * containing orphaned appleDouble files.
	 */
	do {
		NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
		    segflg, dirpath, ctx);
		nd.ni_flag = NAMEI_COMPOUNDRMDIR;
continue_lookup:
		restart_flag = 0;
		vap = NULL;

		error = nameiat(&nd, fd);
		if (error) {
			return error;
		}

		dvp = nd.ni_dvp;
		vp = nd.ni_vp;

		if (vp) {
			batched = vnode_compound_rmdir_available(vp);

			if (vp->v_flag & VROOT) {
				/*
				 * The root of a mounted filesystem cannot be deleted.
				 */
				error = EBUSY;
				goto out;
			}

#if DEVELOPMENT || DEBUG
			/*
			 * XXX VSWAP: Check for entitlements or special flag here
			 * so we can restrict access appropriately.
			 */
#else /* DEVELOPMENT || DEBUG */

			/* Only the kernel may remove a swap-backing vnode. */
			if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
				error = EPERM;
				goto out;
			}
#endif /* DEVELOPMENT || DEBUG */

			/*
			 * Removed a check here; we used to abort if vp's vid
			 * was not the same as what we'd seen the last time around.
			 * I do not think that check was valid, because if we retry
			 * and all dirents are gone, the directory could legitimately
			 * be recycled but still be present in a situation where we would
			 * have had permission to delete. Therefore, we won't make
			 * an effort to preserve that check now that we may not have a
			 * vp here.
			 */

			if (!batched) {
				error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
				if (error) {
					if (error == ENOENT) {
						/* Raced with the entry disappearing: redrive (bounded). */
						if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
							restart_flag = 1;
							restart_count += 1;
						}
					}
					goto out;
				}
			}
		} else {
			/* No vp: the FS deferred lookup to the compound rmdir VNOP. */
			batched = 1;

			if (!vnode_compound_rmdir_available(dvp)) {
				panic("No error, but no compound rmdir?");
			}
		}

#if CONFIG_FSE
		fse_info finfo;

		need_event = need_fsevent(FSE_DELETE, dvp);
		if (need_event) {
			if (!batched) {
				get_fse_info(vp, &finfo, ctx);
			} else {
				/* Compound path: ask the FS to fill attrs during the VNOP. */
				error = vfs_get_notify_attributes(&va);
				if (error) {
					goto out;
				}

				vap = &va;
			}
		}
#endif
		has_listeners = kauth_authorize_fileop_has_listeners();
		if (need_event || has_listeners) {
			/* Build the pathnames the notifications will carry. */
			if (path == NULL) {
				GET_PATH(path);
				if (path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len_path = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);

			if (no_firmlink_path == NULL) {
				GET_PATH(no_firmlink_path);
				if (no_firmlink_path == NULL) {
					error = ENOMEM;
					goto out;
				}
			}

			len_no_firmlink_path = safe_getpath_no_firmlink(dvp, nd.ni_cnd.cn_nameptr, no_firmlink_path, MAXPATHLEN, &truncated_no_firmlink_path);
#if CONFIG_FSE
			if (truncated_no_firmlink_path) {
				finfo.mode |= FSE_TRUNCATED_PATH;
			}
#endif
		}

		error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
		nd.ni_vp = vp;
		if (vp == NULLVP) {
			/* Couldn't find a vnode */
			goto out;
		}

		if (error == EKEEPLOOKING) {
			/* Compound VNOP wants the lookup continued. */
			goto continue_lookup;
		} else if (batched && error == ENOENT) {
			if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
				/*
				 * For compound VNOPs, the authorization callback
				 * may return ENOENT in case of racing hard link lookups
				 * redrive the lookup.
				 */
				restart_flag = 1;
				restart_count += 1;
				goto out;
			}
		}

		/*
		 * XXX There's no provision for passing flags
		 * to VNOP_RMDIR(). So, if vn_rmdir() fails
		 * because it's not empty, then we try again
		 * with VNOP_REMOVE(), passing in a special
		 * flag that clever file systems will know
		 * how to handle.
		 */
		if (error == ENOTEMPTY &&
		    (unlink_flags & VNODE_REMOVE_DATALESS_DIR) != 0) {
			/*
			 * If this fails, we want to keep the original
			 * error.
			 */
			if (vn_remove(dvp, &vp, &nd,
			    VNODE_REMOVE_DATALESS_DIR, vap, ctx) == 0) {
				error = 0;
			}
		}

#if CONFIG_APPLEDOUBLE
		/*
		 * Special case to remove orphaned AppleDouble
		 * files. I don't like putting this in the kernel,
		 * but carbon does not like putting this in carbon either,
		 * so here we are.
		 */
		if (error == ENOTEMPTY) {
			int ad_error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
			if (ad_error == EBUSY) {
				error = ad_error;
				goto out;
			}


			/*
			 * Assuming everything went well, we will try the RMDIR again
			 */
			if (!ad_error) {
				error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
			}
		}
#endif /* CONFIG_APPLEDOUBLE */
		/*
		 * Call out to allow 3rd party notification of delete.
		 * Ignore result of kauth_authorize_fileop call.
		 */
		if (!error) {
			if (has_listeners) {
				kauth_authorize_fileop(vfs_context_ucred(ctx),
				    KAUTH_FILEOP_DELETE,
				    (uintptr_t)vp,
				    (uintptr_t)path);
			}

			if (vp->v_flag & VISHARDLINK) {
				// see the comment in unlink1() about why we update
				// the parent of a hard link when it is removed
				vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
			}

#if CONFIG_FSE
			if (need_event) {
				if (vap) {
					vnode_get_fse_info_from_vap(vp, &finfo, vap);
				}
				add_fsevent(FSE_DELETE, ctx,
				    FSE_ARG_STRING, len_no_firmlink_path, no_firmlink_path,
				    FSE_ARG_FINFO, &finfo,
				    FSE_ARG_DONE);
			}
#endif
		}

out:
		if (path != NULL) {
			RELEASE_PATH(path);
			path = NULL;
		}

		if (no_firmlink_path != NULL) {
			RELEASE_PATH(no_firmlink_path);
			no_firmlink_path = NULL;
		}

		/*
		 * nameidone has to happen before we vnode_put(dvp)
		 * since it may need to release the fs_nodelock on the dvp
		 */
		nameidone(&nd);
		vnode_put(dvp);

		if (vp) {
			vnode_put(vp);
		}

		if (restart_flag == 0) {
			/*
			 * NOTE(review): vp is used only as a wait-channel address
			 * here (paired with the tsleep below); the iocount has
			 * already been dropped — confirm this is intentional.
			 */
			wakeup_one((caddr_t)vp);
			return error;
		}
		/* Briefly yield before retrying the AppleDouble cleanup race. */
		tsleep(vp, PVFS, "rm AD", 1);
	} while (restart_flag != 0);

	return error;
}
9174
9175 /*
9176 * Remove a directory file.
9177 */
9178 /* ARGSUSED */
9179 int
9180 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
9181 {
9182 return rmdirat_internal(vfs_context_current(), AT_FDCWD,
9183 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE, 0);
9184 }
9185
/*
 * Get direntry length padded to 8 byte alignment.
 * (Presumably struct direntry declares d_name[MAXPATHLEN], so subtracting
 * MAXPATHLEN-1 leaves room for namlen name bytes plus the NUL — confirm
 * against sys/dirent.h.)
 */
#define DIRENT64_LEN(namlen) \
((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment.
 * (Same idea for the legacy struct dirent, whose name field is presumably
 * __DARWIN_MAXNAMLEN+1 bytes — confirm against sys/dirent.h.)
 */
#define DIRENT_LEN(namelen) \
((sizeof(struct dirent) + (namelen + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent: address of its last byte per d_reclen. */
#define DIRENT_END(dep) \
(((char *)(dep)) + (dep)->d_reclen - 1)
9197
9198 errno_t
9199 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
9200 int *numdirent, vfs_context_t ctxp)
9201 {
9202 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
9203 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
9204 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
9205 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
9206 } else {
9207 size_t bufsize;
9208 void * bufptr;
9209 uio_t auio;
9210 struct direntry *entry64;
9211 struct dirent *dep;
9212 int bytesread;
9213 int error;
9214
9215 /*
9216 * We're here because the underlying file system does not
9217 * support direnties or we mounted denying support so we must
9218 * fall back to dirents and convert them to direntries.
9219 *
9220 * Our kernel buffer needs to be smaller since re-packing will
9221 * expand each dirent. The worse case (when the name length
9222 * is 3 or less) corresponds to a struct direntry size of 32
9223 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
9224 * (4-byte aligned). So having a buffer that is 3/8 the size
9225 * will prevent us from reading more than we can pack.
9226 *
9227 * Since this buffer is wired memory, we will limit the
9228 * buffer size to a maximum of 32K. We would really like to
9229 * use 32K in the MIN(), but we use magic number 87371 to
9230 * prevent uio_resid() * 3 / 8 from overflowing.
9231 */
9232 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
9233 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
9234 if (bufptr == NULL) {
9235 return ENOMEM;
9236 }
9237
9238 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
9239 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
9240 auio->uio_offset = uio->uio_offset;
9241
9242 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
9243
9244 dep = (struct dirent *)bufptr;
9245 bytesread = bufsize - uio_resid(auio);
9246
9247 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
9248 M_TEMP, M_WAITOK);
9249 /*
9250 * Convert all the entries and copy them out to user's buffer.
9251 */
9252 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
9253 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
9254
9255 if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
9256 DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
9257 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
9258 vp->v_mount->mnt_vfsstat.f_mntonname,
9259 vp->v_name ? vp->v_name : "<unknown>");
9260 error = EIO;
9261 break;
9262 }
9263
9264 bzero(entry64, enbufsize);
9265 /* Convert a dirent to a dirent64. */
9266 entry64->d_ino = dep->d_ino;
9267 entry64->d_seekoff = 0;
9268 entry64->d_reclen = enbufsize;
9269 entry64->d_namlen = dep->d_namlen;
9270 entry64->d_type = dep->d_type;
9271 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
9272
9273 /* Move to next entry. */
9274 dep = (struct dirent *)((char *)dep + dep->d_reclen);
9275
9276 /* Copy entry64 to user's buffer. */
9277 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
9278 }
9279
9280 /* Update the real offset using the offset we got from VNOP_READDIR. */
9281 if (error == 0) {
9282 uio->uio_offset = auio->uio_offset;
9283 }
9284 uio_free(auio);
9285 FREE(bufptr, M_TEMP);
9286 FREE(entry64, M_TEMP);
9287 return error;
9288 }
9289 }
9290
9291 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
9292
9293 /*
9294 * Read a block of directory entries in a file system independent format.
9295 */
/*
 * getdirentries_common: shared back end for getdirentries(2) and
 * getdirentries64(2).
 *
 * Reads up to `bufsize' bytes of directory entries from the directory
 * open on `fd' into the user buffer `bufp', advancing the file offset.
 * On success *bytesread is the byte count delivered, *offset is the file
 * offset at which this read started, and *eofflag reflects whatever the
 * readdir VNOP reported.  If VNODE_READDIR_EXTENDED is set in `flags',
 * entries are delivered in struct direntry format via vnode_readdir64().
 *
 * Handles union mounts: when a read returns nothing, it may switch the
 * fileglob to the covered directory and retry (the unionread: loop).
 */
static int
getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
    off_t *offset, int *eofflag, int flags)
{
	vnode_t vp;
	struct vfs_context context = *vfs_context_current(); /* local copy */
	struct fileproc *fp;
	uio_t auio;
	int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	off_t loff;
	int error, numdirent;
	char uio_buf[UIO_SIZEOF(1)];

	error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
	if (error) {
		return error;
	}
	/* The descriptor must have been opened for reading. */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}

	/* Silently clamp oversized requests rather than failing them. */
	if (bufsize > GETDIRENTRIES_MAXBUFSIZE) {
		bufsize = GETDIRENTRIES_MAXBUFSIZE;
	}

#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
	if (error) {
		goto out;
	}
#endif
	if ((error = vnode_getwithref(vp))) {
		goto out;
	}
	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(&context, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* Remember where this read starts; reported back via *offset. */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, bufp, bufsize);

	if (flags & VNODE_READDIR_EXTENDED) {
		error = vnode_readdir64(vp, auio, flags, eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	} else {
		error = VNOP_READDIR(vp, auio, 0, eofflag, &numdirent, &context);
		fp->f_fglob->fg_offset = uio_offset(auio);
	}
	if (error) {
		(void)vnode_put(vp);
		goto out;
	}

	/* Nothing was read: possibly fall through to a covered union layer. */
	if ((user_ssize_t)bufsize == uio_resid(auio)) {
		if (union_dircheckp) {
			/* -1 means "switched layers, read again". */
			error = union_dircheckp(&vp, fp, &context);
			if (error == -1) {
				goto unionread;
			}
			if (error) {
				(void)vnode_put(vp);
				goto out;
			}
		}

		if ((vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			/* Descend to the vnode this union layer covers and retry. */
			if (lookup_traverse_union(tvp, &vp, &context) == 0) {
				vnode_ref(vp);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0;
				vnode_rele(tvp);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	vnode_put(vp);
	if (offset) {
		*offset = loff;
	}

	*bytesread = bufsize - uio_resid(auio);
out:
	file_drop(fd);
	return error;
}
9401
9402
9403 int
9404 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
9405 {
9406 off_t offset;
9407 ssize_t bytesread;
9408 int error, eofflag;
9409
9410 AUDIT_ARG(fd, uap->fd);
9411 error = getdirentries_common(uap->fd, uap->buf, uap->count,
9412 &bytesread, &offset, &eofflag, 0);
9413
9414 if (error == 0) {
9415 if (proc_is64bit(p)) {
9416 user64_long_t base = (user64_long_t)offset;
9417 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
9418 } else {
9419 user32_long_t base = (user32_long_t)offset;
9420 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
9421 }
9422 *retval = bytesread;
9423 }
9424 return error;
9425 }
9426
/*
 * getdirentries64: read directory entries in the extended (64-bit) format.
 *
 * Parameters:	p	(unused) calling process
 *		uap	fd, user buffer, buffer size, and position out-pointer
 *		retval	set to the number of bytes of entries produced
 *
 * Returns:	0	Success
 *		!0	errno from the common readdir path or from copyout
 */
int
getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
{
	off_t offset;
	ssize_t bytesread;
	int error, eofflag;
	user_size_t bufsize;

	AUDIT_ARG(fd, uap->fd);

	/*
	 * If the buffer is at least GETDIRENTRIES64_EXTENDED_BUFSIZE large,
	 * then the kernel carves out the last 4 bytes to return extended
	 * information to userspace (namely whether we reached EOF with this call).
	 */
	if (uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
		bufsize = uap->bufsize - sizeof(getdirentries64_flags_t);
	} else {
		bufsize = uap->bufsize;
	}

	error = getdirentries_common(uap->fd, uap->buf, bufsize,
	    &bytesread, &offset, &eofflag, VNODE_READDIR_EXTENDED);

	if (error == 0) {
		*retval = bytesread;
		error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));

		/* Report EOF via the carved-out trailing flags word, if present. */
		if (error == 0 && uap->bufsize >= GETDIRENTRIES64_EXTENDED_BUFSIZE) {
			getdirentries64_flags_t flags = 0;
			if (eofflag) {
				flags |= GETDIRENTRIES64_EOF;
			}
			error = copyout(&flags, (user_addr_t)uap->buf + bufsize,
			    sizeof(flags));
		}
	}
	return error;
}
9466
9467
9468 /*
9469 * Set the mode mask for creation of filesystem nodes.
9470 * XXX implement xsecurity
9471 */
9472 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
9473 static int
9474 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
9475 {
9476 struct filedesc *fdp;
9477
9478 AUDIT_ARG(mask, newmask);
9479 proc_fdlock(p);
9480 fdp = p->p_fd;
9481 *retval = fdp->fd_cmask;
9482 fdp->fd_cmask = newmask & ALLPERMS;
9483 proc_fdunlock(p);
9484 return 0;
9485 }
9486
9487 /*
9488 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
9489 *
9490 * Parameters: p Process requesting to set the umask
9491 * uap User argument descriptor (see below)
9492 * retval umask of the process (parameter p)
9493 *
9494 * Indirect: uap->newmask umask to set
9495 * uap->xsecurity ACL to set
9496 *
9497 * Returns: 0 Success
9498 * !0 Not success
9499 *
9500 */
9501 int
9502 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
9503 {
9504 int ciferror;
9505 kauth_filesec_t xsecdst;
9506
9507 xsecdst = KAUTH_FILESEC_NONE;
9508 if (uap->xsecurity != USER_ADDR_NULL) {
9509 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0) {
9510 return ciferror;
9511 }
9512 } else {
9513 xsecdst = KAUTH_FILESEC_NONE;
9514 }
9515
9516 ciferror = umask1(p, uap->newmask, xsecdst, retval);
9517
9518 if (xsecdst != KAUTH_FILESEC_NONE) {
9519 kauth_filesec_free(xsecdst);
9520 }
9521 return ciferror;
9522 }
9523
/*
 * umask: POSIX umask(2); sets the file-creation mode mask without
 * touching any extended security (ACL) state.
 */
int
umask(proc_t p, struct umask_args *uap, int32_t *retval)
{
	return umask1(p, uap->newmask, UMASK_NOXSECURITY, retval);
}
9529
9530 /*
9531 * Void all references to file by ripping underlying filesystem
9532 * away from vnode.
9533 */
9534 /* ARGSUSED */
/*
 * revoke: void all references to a character/block special file by
 * ripping the underlying filesystem away from the vnode.
 *
 * Returns:	0	Success
 *		ENOTSUP	vnode is not a character or block special file
 *		EBUSY	block device has a filesystem mounted on it
 *		EPERM/errno from MAC, getattr, or the ownership check
 */
int
revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
{
	vnode_t vp;
	struct vnode_attr va;
	vfs_context_t ctx = vfs_context_current();
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
	    uap->path, ctx);
	error = namei(&nd);
	if (error) {
		return error;
	}
	vp = nd.ni_vp;

	nameidone(&nd);

	/* revoke(2) is only supported on character or block special files */
	if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
		error = ENOTSUP;
		goto out;
	}

	/* refuse to revoke a block device that has a mounted filesystem */
	if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
		error = EBUSY;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_revoke(ctx, vp);
	if (error) {
		goto out;
	}
#endif

	/* only the device's owner or the superuser may revoke it */
	VATTR_INIT(&va);
	VATTR_WANTED(&va, va_uid);
	if ((error = vnode_getattr(vp, &va, ctx))) {
		goto out;
	}
	if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
	    (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
		goto out;
	}
	/* only bother if someone still holds the vnode (or an alias of it) */
	if (vp->v_usecount > 0 || (vnode_isaliased(vp))) {
		VNOP_REVOKE(vp, REVOKEALL, ctx);
	}
out:
	vnode_put(vp);
	return error;
}
9587
9588
9589 /*
 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
9591 * The following system calls are designed to support features
9592 * which are specific to the HFS & HFS Plus volume formats
9593 */
9594
9595
9596 /*
9597 * Obtain attribute information on objects in a directory while enumerating
9598 * the directory.
9599 */
9600 /* ARGSUSED */
/*
 * getdirentriesattr: enumerate a directory while returning attribute
 * information for each entry (HFS-era API; see getattrlistbulk for the
 * modern equivalent).  *retval is set to the eofflag, similar in spirit
 * to getdirentries(2).
 */
int
getdirentriesattr(proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
{
	vnode_t vp;
	struct fileproc *fp;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t count = 0, savecount = 0;
	uint32_t newstate = 0;
	int error, eofflag;
	uint32_t loff = 0;
	struct attrlist attributelist;
	vfs_context_t ctx = vfs_context_current();
	int fd = uap->fd;
	char uio_buf[UIO_SIZEOF(1)];
	kauth_action_t action;

	AUDIT_ARG(fd, fd);

	/* Get the attributes into kernel space */
	if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
		return error;
	}
	if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
		return error;
	}
	/* remember the caller's requested count for union-mount restarts */
	savecount = count;
	if ((error = fp_getfvp(p, fd, &fp, &vp))) {
		return error;
	}
	/* the fd must have been opened for reading */
	if ((fp->f_fglob->fg_flag & FREAD) == 0) {
		AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
		error = EBADF;
		goto out;
	}


#if CONFIG_MACF
	error = mac_file_check_change_offset(vfs_context_ucred(ctx),
	    fp->f_fglob);
	if (error) {
		goto out;
	}
#endif


	if ((error = vnode_getwithref(vp))) {
		goto out;
	}

	AUDIT_ARG(vnpath, vp, ARG_VNODE1);

unionread:
	if (vp->v_type != VDIR) {
		(void)vnode_put(vp);
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_readdir(ctx, vp);
	if (error != 0) {
		(void)vnode_put(vp);
		goto out;
	}
#endif /* MAC */

	/* set up the uio structure which will contain the users return buffer */
	loff = fp->f_fglob->fg_offset;
	auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->buffer, uap->buffersize);

	/*
	 * If the only item requested is file names, we can let that past with
	 * just LIST_DIRECTORY.  If they want any other attributes, that means
	 * they need SEARCH as well.
	 */
	action = KAUTH_VNODE_LIST_DIRECTORY;
	if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
	    attributelist.fileattr || attributelist.dirattr) {
		action |= KAUTH_VNODE_SEARCH;
	}

	if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
		/* Believe it or not, uap->options only has 32-bits of valid
		 * info, so truncate before extending again */

		error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
		    (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
	}

	if (error) {
		(void) vnode_put(vp);
		goto out;
	}

	/*
	 * If we've got the last entry of a directory in a union mount
	 * then reset the eofflag and pretend there's still more to come.
	 * The next call will again set eofflag and the buffer will be empty,
	 * so traverse to the underlying directory and do the directory
	 * read there.
	 */
	if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
		if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
			eofflag = 0;
		} else { // Empty buffer
			struct vnode *tvp = vp;
			if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
				vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
				fp->f_fglob->fg_data = (caddr_t) vp;
				fp->f_fglob->fg_offset = 0; // reset index for new dir
				count = savecount;
				vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
				vnode_put(tvp);
				goto unionread;
			}
			vp = tvp;
		}
	}

	(void)vnode_put(vp);

	/* NOTE(review): error appears to always be 0 here (checked above and
	 * not set by the union-mount block) — this check looks vestigial. */
	if (error) {
		goto out;
	}
	fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */

	if ((error = copyout((caddr_t) &count, uap->count, sizeof(count)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate)))) {
		goto out;
	}
	if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff)))) {
		goto out;
	}

	*retval = eofflag; /* similar to getdirentries */
	error = 0;
out:
	file_drop(fd);
	return error; /* return error earlier, a retval of 0 or 1 now */
} /* end of getdirentriesattr system call */
9745
9746 /*
9747 * Exchange data between two files
9748 */
9749
9750 /* ARGSUSED */
/*
 * exchangedata: atomically exchange the data of two regular files on the
 * same volume (delegated to the filesystem via VNOP_EXCHANGE), then swap
 * the cached names/parents and emit fsevents / fileop notifications.
 */
int
exchangedata(__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
{
	struct nameidata fnd, snd;
	vfs_context_t ctx = vfs_context_current();
	vnode_t fvp;
	vnode_t svp;
	int error;
	u_int32_t nameiflags;
	char *fpath = NULL;
	char *spath = NULL;
	int flen = 0, slen = 0;
	int from_truncated = 0, to_truncated = 0;
#if CONFIG_FSE
	fse_info f_finfo, s_finfo;
#endif

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}

	/* look up the first path */
	NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path1, ctx);

	error = namei(&fnd);
	if (error) {
		goto out2;
	}

	nameidone(&fnd);
	fvp = fnd.ni_vp;

	/* look up the second path */
	NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
	    UIO_USERSPACE, uap->path2, ctx);

	error = namei(&snd);
	if (error) {
		vnode_put(fvp);
		goto out2;
	}
	nameidone(&snd);
	svp = snd.ni_vp;

	/*
	 * if the files are the same, return an inval error
	 */
	if (svp == fvp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * if the files are on different volumes, return an error
	 */
	if (svp->v_mount != fvp->v_mount) {
		error = EXDEV;
		goto out;
	}

	/* If they're not files, return an error */
	if ((vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
		error = EINVAL;
		goto out;
	}

#if CONFIG_MACF
	error = mac_vnode_check_exchangedata(ctx,
	    fvp, svp);
	if (error) {
		goto out;
	}
#endif
	/* caller needs read AND write access to both files */
	if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
	    ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0)) {
		goto out;
	}

	/*
	 * Gather paths and file info up front only if somebody is listening
	 * (fsevents watcher or fileop listener) — path resolution is costly.
	 */
	if (
#if CONFIG_FSE
		need_fsevent(FSE_EXCHANGE, fvp) ||
#endif
		kauth_authorize_fileop_has_listeners()) {
		GET_PATH(fpath);
		GET_PATH(spath);
		if (fpath == NULL || spath == NULL) {
			error = ENOMEM;
			goto out;
		}

		flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
		slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);

#if CONFIG_FSE
		get_fse_info(fvp, &f_finfo, ctx);
		get_fse_info(svp, &s_finfo, ctx);
		if (from_truncated || to_truncated) {
			// set it here since only the f_finfo gets reported up to user space
			f_finfo.mode |= FSE_TRUNCATED_PATH;
		}
#endif
	}
	/* Ok, make the call */
	error = VNOP_EXCHANGE(fvp, svp, 0, ctx);

	if (error == 0) {
		const char *tmpname;

		if (fpath != NULL && spath != NULL) {
			/* call out to allow 3rd party notification of exchangedata.
			 * Ignore result of kauth_authorize_fileop call.
			 */
			kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
			    (uintptr_t)fpath, (uintptr_t)spath);
		}
		/* swap the cached names (and parents, if they differ) so the
		 * name cache matches the post-exchange on-disk identities */
		name_cache_lock();

		tmpname = fvp->v_name;
		fvp->v_name = svp->v_name;
		svp->v_name = tmpname;

		if (fvp->v_parent != svp->v_parent) {
			vnode_t tmp;

			tmp = fvp->v_parent;
			fvp->v_parent = svp->v_parent;
			svp->v_parent = tmp;
		}
		name_cache_unlock();

#if CONFIG_FSE
		if (fpath != NULL && spath != NULL) {
			add_fsevent(FSE_EXCHANGE, ctx,
			    FSE_ARG_STRING, flen, fpath,
			    FSE_ARG_FINFO, &f_finfo,
			    FSE_ARG_STRING, slen, spath,
			    FSE_ARG_FINFO, &s_finfo,
			    FSE_ARG_DONE);
		}
#endif
	}

out:
	if (fpath != NULL) {
		RELEASE_PATH(fpath);
	}
	if (spath != NULL) {
		RELEASE_PATH(spath);
	}
	vnode_put(svp);
	vnode_put(fvp);
out2:
	return error;
}
9905
9906 /*
9907 * Return (in MB) the amount of freespace on the given vnode's volume.
9908 */
9909 uint32_t freespace_mb(vnode_t vp);
9910
9911 uint32_t
9912 freespace_mb(vnode_t vp)
9913 {
9914 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
9915 return ((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9916 vp->v_mount->mnt_vfsstat.f_bsize) >> 20;
9917 }
9918
9919 #if CONFIG_SEARCHFS
9920
9921 /* ARGSUSED */
9922
/*
 * searchfs: fast catalog search, delegated to the filesystem via
 * VNOP_SEARCHFS.  Copies in the (32- or 64-bit) fssearchblock, validates
 * the user-supplied search parameters, descends union-mount layers as
 * tracked in the opaque searchstate, and copies results/state back out.
 * An EAGAIN from the filesystem means "call again to continue".
 */
int
searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
{
	vnode_t vp, tvp;
	int i, error = 0;
	int fserror = 0;
	struct nameidata nd;
	struct user64_fssearchblock searchblock;
	struct searchstate *state;
	struct attrlist *returnattrs;
	struct timeval timelimit;
	void *searchparams1, *searchparams2;
	uio_t auio = NULL;
	int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	uint32_t nummatches;
	int mallocsize;
	uint32_t nameiflags;
	vfs_context_t ctx = vfs_context_current();
	char uio_buf[UIO_SIZEOF(1)];

	/* Start by copying in fsearchblock parameter list */
	if (IS_64BIT_PROCESS(p)) {
		error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
		timelimit.tv_sec = searchblock.timelimit.tv_sec;
		timelimit.tv_usec = searchblock.timelimit.tv_usec;
	} else {
		struct user32_fssearchblock tmp_searchblock;

		error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
		// munge into 64-bit version
		searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
		searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
		searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
		searchblock.maxmatches = tmp_searchblock.maxmatches;
		/*
		 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
		 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
		 */
		timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
		timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
		searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
		searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
		searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
		searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
		searchblock.searchattrs = tmp_searchblock.searchattrs;
	}
	if (error) {
		return error;
	}

	/* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
	 */
	if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
	    searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS) {
		return EINVAL;
	}

	/* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
	/* It all has to go into local memory and it's not that big so we might as well put it all together. */
	/* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
	/* block. */
	/* */
	/* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
	/* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
	/* assumes the size is still 556 bytes it will continue to work */

	mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
	    sizeof(struct attrlist) + sizeof(struct searchstate) + (2 * sizeof(uint32_t));

	MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);

	/* Now set up the various pointers to the correct place in our newly allocated memory */

	searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
	returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
	state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof(struct attrlist));

	/* Now copy in the stuff given our local variables. */

	if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2))) {
		goto freeandexit;
	}

	if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist)))) {
		goto freeandexit;
	}

	if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate)))) {
		goto freeandexit;
	}

	/*
	 * When searching a union mount, need to set the
	 * start flag at the first call on each layer to
	 * reset state for the new volume.
	 */
	if (uap->options & SRCHFS_START) {
		state->ss_union_layer = 0;
	} else {
		uap->options |= state->ss_union_flags;
	}
	state->ss_union_flags = 0;

	/*
	 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
	 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
	 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
	 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
	 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
	 *
	 * NOTE(review): only searchparams1 is validated below even though the
	 * comment above mentions both buffers — confirm whether searchparams2
	 * can also carry an ATTR_CMN_NAME attrreference_t.
	 */

	if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
		attrreference_t* string_ref;
		u_int32_t* start_length;
		user64_size_t param_length;

		/* validate searchparams1 */
		param_length = searchblock.sizeofsearchparams1;
		/* skip the word that specifies length of the buffer */
		start_length = (u_int32_t*) searchparams1;
		start_length = start_length + 1;
		string_ref = (attrreference_t*) start_length;

		/* ensure no negative offsets or too big offsets */
		if (string_ref->attr_dataoffset < 0) {
			error = EINVAL;
			goto freeandexit;
		}
		if (string_ref->attr_length > MAXPATHLEN) {
			error = EINVAL;
			goto freeandexit;
		}

		/* Check for pointer overflow in the string ref */
		if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
			error = EINVAL;
			goto freeandexit;
		}

		if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
		if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
			error = EINVAL;
			goto freeandexit;
		}
	}

	/* set up the uio structure which will contain the users return buffer */
	auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);

	nameiflags = 0;
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);

	error = namei(&nd);
	if (error) {
		goto freeandexit;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	/*
	 * Switch to the root vnode for the volume
	 */
	error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
	vnode_put(vp);
	if (error) {
		goto freeandexit;
	}
	vp = tvp;

	/*
	 * If it's a union mount, the path lookup takes
	 * us to the top layer. But we may need to descend
	 * to a lower layer. For non-union mounts the layer
	 * is always zero.
	 */
	for (i = 0; i < (int) state->ss_union_layer; i++) {
		if ((vp->v_mount->mnt_flag & MNT_UNION) == 0) {
			break;
		}
		tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		if (vp == NULL) {
			vnode_put(tvp);
			error = ENOENT;
			goto freeandexit;
		}
		error = vnode_getwithref(vp);
		vnode_put(tvp);
		if (error) {
			goto freeandexit;
		}
	}

#if CONFIG_MACF
	error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
	if (error) {
		vnode_put(vp);
		goto freeandexit;
	}
#endif


	/*
	 * If searchblock.maxmatches == 0, then skip the search. This has happened
	 * before and sometimes the underlying code doesn't deal with it well.
	 */
	if (searchblock.maxmatches == 0) {
		nummatches = 0;
		goto saveandexit;
	}

	/*
	 * All right, we have everything we need, so lets make that call.
	 *
	 * We keep special track of the return value from the file system:
	 * EAGAIN is an acceptable error condition that shouldn't keep us
	 * from copying out any results...
	 */

	fserror = VNOP_SEARCHFS(vp,
	    searchparams1,
	    searchparams2,
	    &searchblock.searchattrs,
	    (u_long)searchblock.maxmatches,
	    &timelimit,
	    returnattrs,
	    &nummatches,
	    (u_long)uap->scriptcode,
	    (u_long)uap->options,
	    auio,
	    (struct searchstate *) &state->ss_fsstate,
	    ctx);

	/*
	 * If it's a union mount we need to be called again
	 * to search the mounted-on filesystem.
	 */
	if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
		state->ss_union_flags = SRCHFS_START;
		state->ss_union_layer++; // search next layer down
		fserror = EAGAIN;
	}

saveandexit:

	vnode_put(vp);

	/* Now copy out the stuff that needs copying out. That means the number of matches, the
	 * search state.  Everything was already put into the return buffer by the vop call. */

	if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) {
		goto freeandexit;
	}

	if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) {
		goto freeandexit;
	}

	error = fserror;

freeandexit:

	FREE(searchparams1, M_TEMP);

	return error;
} /* end of searchfs system call */
10201
10202 #else /* CONFIG_SEARCHFS */
10203
/* searchfs(2) stub for kernels built without CONFIG_SEARCHFS. */
int
searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
{
	return ENOTSUP;
}
10209
10210 #endif /* CONFIG_SEARCHFS */
10211
10212
10213 #if CONFIG_DATALESS_FILES
10214
10215 /*
10216 * === Namespace Resolver Up-call Mechanism ===
10217 *
10218 * When I/O is performed to a dataless file or directory (read, write,
10219 * lookup-in, etc.), the file system performs an upcall to the namespace
10220 * resolver (filecoordinationd) to materialize the object.
10221 *
10222 * We need multiple up-calls to be in flight at once, and we need these
10223 * up-calls to be interruptible, thus the following implementation:
10224 *
10225 * => The nspace_resolver_request represents the in-kernel request state.
10226 * It contains a request ID, storage space for the errno code returned
10227 * by filecoordinationd, and flags.
10228 *
10229 * => The request ID is simply a global monotonically incrementing 32-bit
10230 * number. Outstanding requests are stored in a hash table, and the
10231 * hash function is extremely simple.
10232 *
10233 * => When an upcall is to be made to filecoordinationd, a request structure
10234 * is allocated on the stack (it is small, and needs to live only during
10235 * the duration of the call to resolve_nspace_item_ext()). It is
10236 * initialized and inserted into the table. Some backpressure from
 *    filecoordinationd is applied by limiting the number of entries that
10238 * can be inserted into the table (and thus limiting the number of
10239 * outstanding requests issued to filecoordinationd); waiting for an
10240 * available slot is interruptible.
10241 *
10242 * => Once the request has been inserted into the table, the up-call is made
10243 * to filecoordinationd via a MiG-generated stub. The up-call returns
10244 * immediately and filecoordinationd processes the request asynchronously.
10245 *
 * => The caller now waits for the request to complete.  This is achieved by
10247 * sleeping on the address of the request structure and waiting for
10248 * filecoordinationd to mark the request structure as complete. This
10249 * is an interruptible sleep call; if interrupted, the request structure
10250 * is removed from the table and EINTR is returned to the caller. If
10251 * this occurs, an advisory up-call is made to filecoordinationd with
10252 * the request ID to indicate that the request can be aborted or
10253 * de-prioritized at the discretion of filecoordinationd.
10254 *
10255 * => When filecoordinationd has completed the request, it signals completion
10256 * by writing to the vfs.nspace.complete sysctl node. Only a process
10257 * decorated as a namespace resolver can write to this sysctl node. The
10258 * value is a request ID / errno tuple passed as an array of 2 uint32_t's.
10259 * The request ID is looked up in the table, and if the request is found,
10260 * the error code is stored in the request structure and a wakeup()
10261 * issued on the address of the request structure. If the request is not
10262 * found, we simply drop the completion notification, assuming that the
10263 * caller was interrupted.
10264 *
10265 * => When the waiting thread wakes up, it extracts the error code from the
10266 * request structure, removes the request from the table, and returns the
10267 * error code to the calling function. Fini!
10268 */
10269
/*
 * In-kernel state for one outstanding materialization request issued to
 * the namespace resolver (filecoordinationd); see the big comment above.
 * Entries live on the stack of the requesting thread and are linked into
 * the request hash table for the duration of the up-call.
 */
struct nspace_resolver_request {
	LIST_ENTRY(nspace_resolver_request) r_hashlink; /* hash-table linkage */
	uint32_t r_req_id;        /* ID echoed back by the resolver */
	int r_resolver_error;     /* errno reported by the resolver */
	int r_flags;              /* RRF_* flags */
};

#define RRF_COMPLETE 0x0001 /* resolver has completed this request */
10278
10279 static uint32_t
10280 next_nspace_req_id(void)
10281 {
10282 static uint32_t next_req_id;
10283
10284 return OSAddAtomic(1, &next_req_id);
10285 }
10286
#define NSPACE_RESOLVER_REQ_HASHSIZE 32 /* XXX tune */
#define NSPACE_RESOLVER_MAX_OUTSTANDING 256 /* XXX tune */

/* Hash table of outstanding resolver requests, keyed by request ID. */
static LIST_HEAD(nspace_resolver_requesthead,
    nspace_resolver_request) * nspace_resolver_request_hashtbl;
static u_long nspace_resolver_request_hashmask;
/* Number of requests currently in the table (backpressure accounting). */
static u_int nspace_resolver_request_count;
/* True when some thread is waiting for a free request slot. */
static bool nspace_resolver_request_wait_slot;
static lck_grp_t *nspace_resolver_request_lck_grp;
/* Serializes access to the request table and the counters above. */
static lck_mtx_t nspace_resolver_request_hash_mutex;

#define NSPACE_REQ_LOCK() \
	lck_mtx_lock(&nspace_resolver_request_hash_mutex)
#define NSPACE_REQ_UNLOCK() \
	lck_mtx_unlock(&nspace_resolver_request_hash_mutex)

/* Trivial hash: low bits of the monotonically-assigned request ID. */
#define NSPACE_RESOLVER_HASH(req_id) \
	(&nspace_resolver_request_hashtbl[(req_id) & \
	nspace_resolver_request_hashmask])
10306
10307 static struct nspace_resolver_request *
10308 nspace_resolver_req_lookup(uint32_t req_id)
10309 {
10310 struct nspace_resolver_requesthead *bucket;
10311 struct nspace_resolver_request *req;
10312
10313 bucket = NSPACE_RESOLVER_HASH(req_id);
10314 LIST_FOREACH(req, bucket, r_hashlink) {
10315 if (req->r_req_id == req_id) {
10316 return req;
10317 }
10318 }
10319
10320 return NULL;
10321 }
10322
10323 static int
10324 nspace_resolver_req_add(struct nspace_resolver_request *req)
10325 {
10326 struct nspace_resolver_requesthead *bucket;
10327 int error;
10328
10329 while (nspace_resolver_request_count >=
10330 NSPACE_RESOLVER_MAX_OUTSTANDING) {
10331 nspace_resolver_request_wait_slot = true;
10332 error = msleep(&nspace_resolver_request_count,
10333 &nspace_resolver_request_hash_mutex,
10334 PVFS | PCATCH, "nspacerq", NULL);
10335 if (error) {
10336 return error;
10337 }
10338 }
10339
10340 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10341 #if DIAGNOSTIC
10342 assert(nspace_resolver_req_lookup(req->r_req_id) == NULL);
10343 #endif /* DIAGNOSTIC */
10344 LIST_INSERT_HEAD(bucket, req, r_hashlink);
10345 nspace_resolver_request_count++;
10346
10347 return 0;
10348 }
10349
10350 static void
10351 nspace_resolver_req_remove(struct nspace_resolver_request *req)
10352 {
10353 struct nspace_resolver_requesthead *bucket;
10354
10355 bucket = NSPACE_RESOLVER_HASH(req->r_req_id);
10356 #if DIAGNOSTIC
10357 assert(nspace_resolver_req_lookup(req->r_req_id) != NULL);
10358 #endif /* DIAGNOSTIC */
10359 LIST_REMOVE(req, r_hashlink);
10360 nspace_resolver_request_count--;
10361
10362 if (nspace_resolver_request_wait_slot) {
10363 nspace_resolver_request_wait_slot = false;
10364 wakeup(&nspace_resolver_request_count);
10365 }
10366 }
10367
10368 static void
10369 nspace_resolver_req_cancel(uint32_t req_id)
10370 {
10371 kern_return_t kr;
10372 mach_port_t mp;
10373
10374 // Failures here aren't fatal -- the cancellation message
10375 // sent to the resolver is merely advisory.
10376
10377 kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
10378 if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
10379 return;
10380 }
10381
10382 kr = send_nspace_resolve_cancel(mp, req_id);
10383 if (kr != KERN_SUCCESS) {
10384 os_log_error(OS_LOG_DEFAULT,
10385 "NSPACE send_nspace_resolve_cancel failure: %d", kr);
10386 }
10387
10388 ipc_port_release_send(mp);
10389 }
10390
/*
 * Sleep until the resolver marks the request complete (or the wait is
 * interrupted), remove the request from the table, and return the
 * resolver's error code.
 */
static int
nspace_resolver_req_wait(struct nspace_resolver_request *req)
{
	bool send_cancel_message = false;
	int error;

	NSPACE_REQ_LOCK();

	while ((req->r_flags & RRF_COMPLETE) == 0) {
		error = msleep(req, &nspace_resolver_request_hash_mutex,
		    PVFS | PCATCH, "nspace", NULL);
		if (error && error != ERESTART) {
			/* Interrupted: synthesize an error result and plan to
			 * advise the resolver that it may abandon the work. */
			req->r_resolver_error = (error == EINTR) ? EINTR :
			    ETIMEDOUT;
			send_cancel_message = true;
			break;
		}
	}

	nspace_resolver_req_remove(req);

	NSPACE_REQ_UNLOCK();

	/* Advisory only; see nspace_resolver_req_cancel(). */
	if (send_cancel_message) {
		nspace_resolver_req_cancel(req->r_req_id);
	}

	return req->r_resolver_error;
}
10420
/*
 * Record the resolver's result and wake the thread sleeping on the
 * request in nspace_resolver_req_wait().  Callers in this file invoke
 * this with NSPACE_REQ_LOCK held.
 */
static void
nspace_resolver_req_mark_complete(
	struct nspace_resolver_request *req,
	int resolver_error)
{
	req->r_resolver_error = resolver_error;
	req->r_flags |= RRF_COMPLETE;
	wakeup(req);
}
10430
/*
 * Completion notification: the resolver reported a result for
 * req_id.  Look up the outstanding request and mark it complete.
 */
static void
nspace_resolver_req_completed(uint32_t req_id, int resolver_error)
{
	struct nspace_resolver_request *req;

	NSPACE_REQ_LOCK();

	// If we don't find the request corresponding to our req_id,
	// just drop the completion signal on the floor; it's likely
	// that the requester interrupted with a signal.

	req = nspace_resolver_req_lookup(req_id);
	if (req) {
		nspace_resolver_req_mark_complete(req, resolver_error);
	}

	NSPACE_REQ_UNLOCK();
}
10449
/* The process currently registered as the dataless-file resolver;
 * set and cleared under NSPACE_REQ_LOCK. */
static struct proc *nspace_resolver_proc;
10451
10452 static int
10453 nspace_resolver_get_proc_state(struct proc *p, int *is_resolver)
10454 {
10455 *is_resolver = ((p->p_lflag & P_LNSPACE_RESOLVER) &&
10456 p == nspace_resolver_proc) ? 1 : 0;
10457 return 0;
10458 }
10459
/*
 * Register (is_resolver != 0) or unregister the process as the
 * system's dataless-file resolver.  Requires superuser credentials
 * plus the PRIV_VFS_DATALESS_RESOLVER privilege.  Only one resolver
 * may be registered at a time; a second registration fails EBUSY.
 */
static int
nspace_resolver_set_proc_state(struct proc *p, int is_resolver)
{
	vfs_context_t ctx = vfs_context_current();
	int error = 0;

	//
	// The system filecoordinationd runs as uid == 0.  This also
	// has the nice side-effect of filtering out filecoordinationd
	// running in the simulator.
	//
	if (!vfs_context_issuser(ctx)) {
		return EPERM;
	}

	error = priv_check_cred(vfs_context_ucred(ctx),
	    PRIV_VFS_DATALESS_RESOLVER, 0);
	if (error) {
		return error;
	}

	if (is_resolver) {
		NSPACE_REQ_LOCK();

		if (nspace_resolver_proc == NULL) {
			proc_lock(p);
			p->p_lflag |= P_LNSPACE_RESOLVER;
			proc_unlock(p);
			nspace_resolver_proc = p;
		} else {
			// Another process already holds the role.
			error = EBUSY;
		}

		NSPACE_REQ_UNLOCK();
	} else {
		// This is basically just like the exit case.
		// nspace_resolver_exited() will verify that the
		// process is the resolver, and will clear the
		// global.
		nspace_resolver_exited(p);
	}

	return error;
}
10504
10505 static int
10506 nspace_materialization_get_proc_state(struct proc *p, int *is_prevented)
10507 {
10508 if ((p->p_lflag & P_LNSPACE_RESOLVER) != 0 ||
10509 (p->p_vfs_iopolicy &
10510 P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) == 0) {
10511 *is_prevented = 1;
10512 } else {
10513 *is_prevented = 0;
10514 }
10515 return 0;
10516 }
10517
10518 static int
10519 nspace_materialization_set_proc_state(struct proc *p, int is_prevented)
10520 {
10521 if (p->p_lflag & P_LNSPACE_RESOLVER) {
10522 return is_prevented ? 0 : EBUSY;
10523 }
10524
10525 if (is_prevented) {
10526 OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES), &p->p_vfs_iopolicy);
10527 } else {
10528 OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES, &p->p_vfs_iopolicy);
10529 }
10530 return 0;
10531 }
10532
10533 static int
10534 nspace_materialization_get_thread_state(int *is_prevented)
10535 {
10536 uthread_t ut = get_bsdthread_info(current_thread());
10537
10538 *is_prevented = (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) ? 1 : 0;
10539 return 0;
10540 }
10541
10542 static int
10543 nspace_materialization_set_thread_state(int is_prevented)
10544 {
10545 uthread_t ut = get_bsdthread_info(current_thread());
10546
10547 if (is_prevented) {
10548 ut->uu_flag |= UT_NSPACE_NODATALESSFAULTS;
10549 } else {
10550 ut->uu_flag &= ~UT_NSPACE_NODATALESSFAULTS;
10551 }
10552 return 0;
10553 }
10554
/*
 * Decide whether the current thread may materialize a dataless
 * object.  Returns:
 *   0           materialization may proceed
 *   EDEADLK     materialization is prevented
 *   EJUSTRETURN caller is a dataless manipulator; proceed as if the
 *               object were not dataless
 *
 * Precedence: kernel context, then the dataless-manipulation
 * entitlement, then per-thread decorations, then the process-wide
 * I/O policy, then prevented-by-default.
 */
static int
nspace_materialization_is_prevented(void)
{
	proc_t p = current_proc();
	uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
	vfs_context_t ctx = vfs_context_current();

	/*
	 * Kernel context ==> return EDEADLK, as we would with any random
	 * process decorated as no-materialize.
	 */
	if (ctx == vfs_context_kernel()) {
		return EDEADLK;
	}

	/*
	 * If the process has the dataless-manipulation entitlement,
	 * materialization is prevented, and depending on the kind
	 * of file system operation, things get to proceed as if the
	 * object is not dataless.
	 */
	if (vfs_context_is_dataless_manipulator(ctx)) {
		return EJUSTRETURN;
	}

	/*
	 * Per-thread decorations override any process-wide decorations.
	 * (Foundation uses this, and this overrides even the dataless-
	 * manipulation entitlement so as to make API contracts consistent.)
	 */
	if (ut != NULL) {
		if (ut->uu_flag & UT_NSPACE_NODATALESSFAULTS) {
			return EDEADLK;
		}
		if (ut->uu_flag & UT_NSPACE_FORCEDATALESSFAULTS) {
			return 0;
		}
	}

	/*
	 * If the process's iopolicy specifies that dataless files
	 * can be materialized, then we let it go ahead.
	 */
	if (p->p_vfs_iopolicy & P_VFS_IOPOLICY_MATERIALIZE_DATALESS_FILES) {
		return 0;
	}

	/*
	 * The default behavior is to not materialize dataless files;
	 * return to the caller that deadlock was detected.
	 */
	return EDEADLK;
}
10608
10609 /* the vfs.nspace branch */
10610 SYSCTL_NODE(_vfs, OID_AUTO, nspace, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs nspace hinge");
10611
10612 static int
10613 sysctl_nspace_resolver(__unused struct sysctl_oid *oidp,
10614 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10615 {
10616 struct proc *p = req->p;
10617 int new_value, old_value, changed = 0;
10618 int error;
10619
10620 error = nspace_resolver_get_proc_state(p, &old_value);
10621 if (error) {
10622 return error;
10623 }
10624
10625 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10626 &changed);
10627 if (error == 0 && changed) {
10628 error = nspace_resolver_set_proc_state(p, new_value);
10629 }
10630 return error;
10631 }
10632
10633 /* decorate this process as the dataless file resolver */
10634 SYSCTL_PROC(_vfs_nspace, OID_AUTO, resolver,
10635 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10636 0, 0, sysctl_nspace_resolver, "I", "");
10637
10638 static int
10639 sysctl_nspace_prevent_materialization(__unused struct sysctl_oid *oidp,
10640 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10641 {
10642 struct proc *p = req->p;
10643 int new_value, old_value, changed = 0;
10644 int error;
10645
10646 error = nspace_materialization_get_proc_state(p, &old_value);
10647 if (error) {
10648 return error;
10649 }
10650
10651 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10652 &changed);
10653 if (error == 0 && changed) {
10654 error = nspace_materialization_set_proc_state(p, new_value);
10655 }
10656 return error;
10657 }
10658
10659 /* decorate this process as not wanting to materialize dataless files */
10660 SYSCTL_PROC(_vfs_nspace, OID_AUTO, prevent_materialization,
10661 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10662 0, 0, sysctl_nspace_prevent_materialization, "I", "");
10663
10664 static int
10665 sysctl_nspace_thread_prevent_materialization(__unused struct sysctl_oid *oidp,
10666 __unused void *arg1, __unused int arg2, struct sysctl_req *req)
10667 {
10668 int new_value, old_value, changed = 0;
10669 int error;
10670
10671 error = nspace_materialization_get_thread_state(&old_value);
10672 if (error) {
10673 return error;
10674 }
10675
10676 error = sysctl_io_number(req, old_value, sizeof(int), &new_value,
10677 &changed);
10678 if (error == 0 && changed) {
10679 error = nspace_materialization_set_thread_state(new_value);
10680 }
10681 return error;
10682 }
10683
10684 /* decorate this thread as not wanting to materialize dataless files */
10685 SYSCTL_PROC(_vfs_nspace, OID_AUTO, thread_prevent_materialization,
10686 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10687 0, 0, sysctl_nspace_thread_prevent_materialization, "I", "");
10688
10689 static int
10690 sysctl_nspace_complete(__unused struct sysctl_oid *oidp, __unused void *arg1,
10691 __unused int arg2, struct sysctl_req *req)
10692 {
10693 struct proc *p = req->p;
10694 uint32_t req_status[2] = { 0, 0 };
10695 int error, is_resolver, changed = 0;
10696
10697 error = nspace_resolver_get_proc_state(p, &is_resolver);
10698 if (error) {
10699 return error;
10700 }
10701
10702 if (!is_resolver) {
10703 return EPERM;
10704 }
10705
10706 error = sysctl_io_opaque(req, req_status, sizeof(req_status),
10707 &changed);
10708 if (error) {
10709 return error;
10710 }
10711
10712 /*
10713 * req_status[0] is the req_id
10714 *
10715 * req_status[1] is the errno
10716 */
10717 if (error == 0 && changed) {
10718 nspace_resolver_req_completed(req_status[0],
10719 (int)req_status[1]);
10720 }
10721 return error;
10722 }
10723
10724 /* Resolver reports completed reqs here. */
10725 SYSCTL_PROC(_vfs_nspace, OID_AUTO, complete,
10726 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
10727 0, 0, sysctl_nspace_complete, "-", "");
10728
10729 #endif /* CONFIG_DATALESS_FILES */
10730
10731 #if CONFIG_DATALESS_FILES
10732 #define __no_dataless_unused /* nothing */
10733 #else
10734 #define __no_dataless_unused __unused
10735 #endif
10736
/*
 * One-time initialization of the resolver request machinery: the
 * lock group and mutex protecting the request table, and the
 * request hash table itself.  No-op unless CONFIG_DATALESS_FILES.
 */
void
nspace_resolver_init(void)
{
#if CONFIG_DATALESS_FILES
	/* Lock group must exist before the mutex is initialized. */
	nspace_resolver_request_lck_grp =
	    lck_grp_alloc_init("file namespace resolver", NULL);

	lck_mtx_init(&nspace_resolver_request_hash_mutex,
	    nspace_resolver_request_lck_grp, NULL);

	nspace_resolver_request_hashtbl =
	    hashinit(NSPACE_RESOLVER_REQ_HASHSIZE,
	    M_VNODE /* XXX */, &nspace_resolver_request_hashmask);
#endif /* CONFIG_DATALESS_FILES */
}
10752
/*
 * Called when process p exits (and from explicit unregistration via
 * nspace_resolver_set_proc_state()).  If p is the registered
 * resolver, complete all outstanding requests with ETIMEDOUT so
 * their waiters do not hang, and clear the resolver global.
 */
void
nspace_resolver_exited(struct proc *p __no_dataless_unused)
{
#if CONFIG_DATALESS_FILES
	struct nspace_resolver_requesthead *bucket;
	struct nspace_resolver_request *req;
	u_long idx;

	NSPACE_REQ_LOCK();

	if ((p->p_lflag & P_LNSPACE_RESOLVER) &&
	    p == nspace_resolver_proc) {
		/* Fail every pending request; the resolver is gone. */
		for (idx = 0; idx <= nspace_resolver_request_hashmask; idx++) {
			bucket = &nspace_resolver_request_hashtbl[idx];
			LIST_FOREACH(req, bucket, r_hashlink) {
				nspace_resolver_req_mark_complete(req,
				    ETIMEDOUT);
			}
		}
		nspace_resolver_proc = NULL;
	}

	NSPACE_REQ_UNLOCK();
#endif /* CONFIG_DATALESS_FILES */
}
10778
/*
 * Resolve (materialize) the dataless object at vp for operation op.
 * Convenience wrapper around resolve_nspace_item_ext() with no
 * extra argument.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
	return resolve_nspace_item_ext(vp, op, NULL);
}
10784
10785 #define DATALESS_RESOLVER_ENTITLEMENT \
10786 "com.apple.private.vfs.dataless-resolver"
10787 #define DATALESS_MANIPULATION_ENTITLEMENT \
10788 "com.apple.private.vfs.dataless-manipulation"
10789
10790 /*
10791 * Return TRUE if the vfs context is associated with a process entitled
10792 * for dataless manipulation.
10793 *
10794 * XXX Arguably belongs in vfs_subr.c, but is here because of the
10795 * complication around CONFIG_DATALESS_FILES.
10796 */
10797 boolean_t
10798 vfs_context_is_dataless_manipulator(vfs_context_t ctx __unused)
10799 {
10800 #if CONFIG_DATALESS_FILES
10801 assert(ctx->vc_thread == current_thread());
10802 task_t const task = current_task();
10803 return IOTaskHasEntitlement(task, DATALESS_MANIPULATION_ENTITLEMENT) ||
10804 IOTaskHasEntitlement(task, DATALESS_RESOLVER_ENTITLEMENT);
10805 #else
10806 return false;
10807 #endif /* CONFIG_DATALESS_FILES */
10808 }
10809
/*
 * Ask the user-space resolver to materialize the dataless object at
 * vp for operation op, blocking (interruptibly) until the resolver
 * reports completion.  Returns 0 on success; EFTYPE for unsupported
 * vnode types; ENOTSUP for snapshot events; the result of
 * nspace_materialization_is_prevented() when materialization is not
 * allowed; ETIMEDOUT when the resolver cannot be reached; otherwise
 * the resolver's reported error.
 */
int
resolve_nspace_item_ext(
	struct vnode *vp __no_dataless_unused,
	uint64_t op __no_dataless_unused,
	void *arg __unused)
{
#if CONFIG_DATALESS_FILES
	int error;
	mach_port_t mp;
	char *path = NULL;
	int path_len;
	kern_return_t kr;
	struct nspace_resolver_request req;

	// only allow namespace events on regular files, directories and symlinks.
	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
		return EFTYPE;
	}

	//
	// if this is a snapshot event and the vnode is on a
	// disk image just pretend nothing happened since any
	// change to the disk image will cause the disk image
	// itself to get backed up and this avoids multi-way
	// deadlocks between the snapshot handler and the ever
	// popular diskimages-helper process. the variable
	// nspace_allow_virtual_devs allows this behavior to
	// be overridden (for use by the Mobile TimeMachine
	// testing infrastructure which uses disk images)
	//
	if (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) {
		os_log_debug(OS_LOG_DEFAULT, "NSPACE SNAPSHOT not handled");
		return ENOTSUP;
	}

	error = nspace_materialization_is_prevented();
	if (error) {
		os_log_debug(OS_LOG_DEFAULT,
		    "NSPACE process/thread is decorated as no-materialization");
		return error;
	}

	kr = host_get_filecoordinationd_port(host_priv_self(), &mp);
	if (kr != KERN_SUCCESS || !IPC_PORT_VALID(mp)) {
		os_log_error(OS_LOG_DEFAULT, "NSPACE no port");
		// Treat this like being unable to access the backing
		// store server.
		return ETIMEDOUT;
	}

	MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
	if (path == NULL) {
		error = ENOMEM;
		goto out_release_port;
	}
	path_len = MAXPATHLEN;

	error = vn_getpath(vp, path, &path_len);
	if (error == 0) {
		int xxx_rdar44371223;   /* XXX Mig bug */
		req.r_req_id = next_nspace_req_id();
		req.r_resolver_error = 0;
		req.r_flags = 0;

		/* Register before sending, so a fast completion finds us. */
		NSPACE_REQ_LOCK();
		error = nspace_resolver_req_add(&req);
		NSPACE_REQ_UNLOCK();
		if (error) {
			goto out_release_port;
		}

		os_log_debug(OS_LOG_DEFAULT, "NSPACE resolve_path call");
		kr = send_nspace_resolve_path(mp, req.r_req_id,
		    current_proc()->p_pid, (uint32_t)(op & 0xffffffff),
		    path, &xxx_rdar44371223);
		if (kr != KERN_SUCCESS) {
			// Also treat this like being unable to access
			// the backing store server.
			os_log_error(OS_LOG_DEFAULT,
			    "NSPACE resolve_path failure: %d", kr);
			error = ETIMEDOUT;

			NSPACE_REQ_LOCK();
			nspace_resolver_req_remove(&req);
			NSPACE_REQ_UNLOCK();
			goto out_release_port;
		}

		// Give back the memory we allocated earlier while
		// we wait; we no longer need it.
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
		path = NULL;

		// Request has been submitted to the resolver.
		// Now (interruptibly) wait for completion.
		// Upon return, the request will have been removed
		// from the lookup table.
		error = nspace_resolver_req_wait(&req);
	}

out_release_port:
	if (path != NULL) {
		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
	}
	ipc_port_release_send(mp);

	return error;
#else
	return ENOTSUP;
#endif /* CONFIG_DATALESS_FILES */
}
10921
/*
 * Stub: snapshot events are not acted upon here; always returns 0.
 */
int
nspace_snapshot_event(__unused vnode_t vp, __unused time_t ctime,
    __unused uint64_t op_type, __unused void *arg)
{
	return 0;
}
10928
10929 #if 0
10930 static int
10931 build_volfs_path(struct vnode *vp, char *path, int *len)
10932 {
10933 struct vnode_attr va;
10934 int ret;
10935
10936 VATTR_INIT(&va);
10937 VATTR_WANTED(&va, va_fsid);
10938 VATTR_WANTED(&va, va_fileid);
10939
10940 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
10941 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
10942 ret = -1;
10943 } else {
10944 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
10945 ret = 0;
10946 }
10947
10948 return ret;
10949 }
10950 #endif
10951
/*
 * Compatibility shim: map known bare base command numbers (as
 * produced by IOCBASECMD, i.e. with the size/direction bits
 * stripped) back to their full ioctl values.  Any other command is
 * returned unchanged.
 */
static unsigned long
fsctl_bogus_command_compat(unsigned long cmd)
{
	switch (cmd) {
	case IOCBASECMD(FSIOC_SYNC_VOLUME):
		return FSIOC_SYNC_VOLUME;
	case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
		return FSIOC_ROUTEFS_SETROUTEID;
	case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
		return FSIOC_SET_PACKAGE_EXTS;
	case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
		return FSIOC_SET_FSTYPENAME_OVERRIDE;
	case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
		return DISK_CONDITIONER_IOC_GET;
	case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
		return DISK_CONDITIONER_IOC_SET;
	case IOCBASECMD(FSIOC_FIOSEEKHOLE):
		return FSIOC_FIOSEEKHOLE;
	case IOCBASECMD(FSIOC_FIOSEEKDATA):
		return FSIOC_FIOSEEKDATA;
	case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
		return SPOTLIGHT_IOC_GET_MOUNT_TIME;
	case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
		return SPOTLIGHT_IOC_GET_LAST_MTIME;
	}

	return cmd;
}
10980
/*
 * chflags0() setattr callback: perform the compare-and-swap of BSD
 * flags via the filesystem's FSIOC_CAS_BSDFLAGS ioctl.
 */
static int
cas_bsdflags_setattr(vnode_t vp, void *arg, vfs_context_t ctx)
{
	return VNOP_IOCTL(vp, FSIOC_CAS_BSDFLAGS, arg, FWRITE, ctx);
}
10986
10987 /*
10988 * Make a filesystem-specific control call:
10989 */
10990 /* ARGSUSED */
/*
 * Common implementation for fsctl(2)/ffsctl(2).  Marshals the ioctl
 * argument into a kernel buffer (stack for small, heap for large),
 * handles a set of generic FSIOC_* commands inline, and forwards
 * everything else to the filesystem via VNOP_IOCTL().
 *
 * NOTE: the FSIOC_SYNC_VOLUME case releases the caller's vnode
 * iocount and sets *arg_vp to NULL; callers must re-check vp.
 */
static int
fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
{
	int error = 0;
	boolean_t is64bit;
	u_int size;
#define STK_PARAMS 128
	char stkbuf[STK_PARAMS] = {0};
	caddr_t data, memp;
	vnode_t vp = *arg_vp;

	/* No fsctl on device nodes. */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		return ENOTTY;
	}

	cmd = fsctl_bogus_command_compat(cmd);

	size = IOCPARM_LEN(cmd);
	if (size > IOCPARM_MAX) {
		return EINVAL;
	}

	is64bit = proc_is64bit(p);

	memp = NULL;

	/* Large arguments go on the heap, small ones on the stack. */
	if (size > sizeof(stkbuf)) {
		if ((memp = (caddr_t)kalloc(size)) == 0) {
			return ENOMEM;
		}
		data = memp;
	} else {
		data = &stkbuf[0];
	};

	if (cmd & IOC_IN) {
		if (size) {
			error = copyin(udata, data, size);
			if (error) {
				if (memp) {
					kfree(memp, size);
				}
				return error;
			}
		} else {
			/* Sizeless IOC_IN: the "argument" is udata itself. */
			if (is64bit) {
				*(user_addr_t *)data = udata;
			} else {
				*(uint32_t *)data = (uint32_t)udata;
			}
		};
	} else if ((cmd & IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (cmd & IOC_VOID) {
		if (is64bit) {
			*(user_addr_t *)data = udata;
		} else {
			*(uint32_t *)data = (uint32_t)udata;
		}
	}

	/* Check to see if it's a generic command */
	switch (cmd) {
	case FSIOC_SYNC_VOLUME: {
		struct vfs_attr vfa;
		mount_t mp = vp->v_mount;
		unsigned arg;


		/* record vid of vp so we can drop it below. */
		uint32_t vvid = vp->v_id;

		/*
		 * Then grab mount_iterref so that we can release the vnode.
		 * Without this, a thread may call vnode_iterate_prepare then
		 * get into a deadlock because we've never released the root vp
		 */
		error = mount_iterref(mp, 0);
		if (error) {
			break;
		}
		vnode_put(vp);

		arg = MNT_NOWAIT;
		if (*(uint32_t*)data & FSCTL_SYNC_WAIT) {
			arg = MNT_WAIT;
		}

		/*
		 * If the filesystem supports multiple file systems in a
		 * partition (e.g. APFS volumes in a container), it knows
		 * that the waitfor argument to VFS_SYNC are flags.
		 */
		VFSATTR_INIT(&vfa);
		VFSATTR_WANTED(&vfa, f_capabilities);
		if ((vfs_getattr(mp, &vfa, vfs_context_current()) == 0) &&
		    VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) &&
		    ((vfa.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE)) &&
		    ((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_SHARED_SPACE))) {
			arg |= MNT_VOLUME;
		}

		/* issue the sync for this volume */
		(void)sync_callback(mp, &arg);

		/*
		 * Then release the mount_iterref once we're done syncing; it's not
		 * needed for the VNOP_IOCTL below
		 */
		mount_iterdrop(mp);

		// NOTE(review): 'arg' holds MNT_* wait flags at this point but
		// is tested against the userspace FSCTL_SYNC_FULLSYNC flag;
		// presumably the intent was to test *(uint32_t*)data -- confirm.
		if (arg & FSCTL_SYNC_FULLSYNC) {
			/* re-obtain vnode iocount on the root vp, if possible */
			error = vnode_getwithvid(vp, vvid);
			if (error == 0) {
				error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
				vnode_put(vp);
			}
		}
		/* mark the argument VP as having been released */
		*arg_vp = NULL;
	}
	break;

	case FSIOC_ROUTEFS_SETROUTEID: {
#if ROUTEFS
		char routepath[MAXPATHLEN];
		size_t len = 0;

		/* Superuser only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		bzero(routepath, MAXPATHLEN);
		error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
		if (error) {
			break;
		}
		error = routefs_kernel_mount(routepath);
		if (error) {
			break;
		}
#endif
	}
	break;

	case FSIOC_SET_PACKAGE_EXTS: {
		user_addr_t ext_strings;
		uint32_t num_entries;
		uint32_t max_width;

		if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0))) {
			break;
		}

		if ((is64bit && size != sizeof(user64_package_ext_info))
		    || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
			// either you're 64-bit and passed a 64-bit struct or
			// you're 32-bit and passed a 32-bit struct. otherwise
			// it's not ok.
			error = EINVAL;
			break;
		}

		if (is64bit) {
			ext_strings = ((user64_package_ext_info *)data)->strings;
			num_entries = ((user64_package_ext_info *)data)->num_entries;
			max_width = ((user64_package_ext_info *)data)->max_width;
		} else {
			ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
			num_entries = ((user32_package_ext_info *)data)->num_entries;
			max_width = ((user32_package_ext_info *)data)->max_width;
		}
		error = set_package_extensions_table(ext_strings, num_entries, max_width);
	}
	break;

	case FSIOC_SET_FSTYPENAME_OVERRIDE:
	{
		/* Superuser only. */
		if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
			break;
		}
		if (vp->v_mount) {
			mount_lock(vp->v_mount);
			if (data[0] != 0) {
				strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
				vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
				if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
					vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
				}
			} else {
				/* Empty name clears the override. */
				if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
					vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
				}
				vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
				vp->v_mount->fstypename_override[0] = '\0';
			}
			mount_unlock(vp->v_mount);
		}
	}
	break;

	case DISK_CONDITIONER_IOC_GET: {
		error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case DISK_CONDITIONER_IOC_SET: {
		error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
	}
	break;

	case FSIOC_CAS_BSDFLAGS: {
		struct fsioc_cas_bsdflags *cas = (struct fsioc_cas_bsdflags *)data;
		struct vnode_attr va;

		VATTR_INIT(&va);
		VATTR_SET(&va, va_flags, cas->new_flags);

		error = chflags0(vp, &va, cas_bsdflags_setattr, cas, ctx);
	}
	break;

	case FSIOC_FD_ONLY_OPEN_ONCE: {
		/* EBUSY unless the caller holds the only open. */
		if (vnode_usecount(vp) > 1) {
			error = EBUSY;
		} else {
			error = 0;
		}
	}
	break;

	default: {
		/* other, known commands shouldn't be passed down here */
		switch (cmd) {
		case F_PUNCHHOLE:
		case F_TRIM_ACTIVE_FILE:
		case F_RDADVISE:
		case F_TRANSCODEKEY:
		case F_GETPROTECTIONLEVEL:
		case F_GETDEFAULTPROTLEVEL:
		case F_MAKECOMPRESSED:
		case F_SET_GREEDY_MODE:
		case F_SETSTATICCONTENT:
		case F_SETIOTYPE:
		case F_SETBACKINGSTORE:
		case F_GETPATH_MTMINFO:
		case APFSIOC_REVERT_TO_SNAPSHOT:
		case FSIOC_FIOSEEKHOLE:
		case FSIOC_FIOSEEKDATA:
		case HFS_GET_BOOT_INFO:
		case HFS_SET_BOOT_INFO:
		case FIOPINSWAP:
		case F_CHKCLEAN:
		case F_FULLFSYNC:
		case F_BARRIERFSYNC:
		case F_FREEZE_FS:
		case F_THAW_FS:
			error = EINVAL;
			goto outdrop;
		}
		/* Invoke the filesystem-specific code */
		error = VNOP_IOCTL(vp, cmd, data, options, ctx);
	}
	} /* end switch stmt */

	/*
	 * if no errors, copy any data to user. Size was
	 * already set and checked above.
	 */
	if (error == 0 && (cmd & IOC_OUT) && size) {
		error = copyout(data, udata, size);
	}

outdrop:
	if (memp) {
		kfree(memp, size);
	}

	return error;
}
11276
11277 /* ARGSUSED */
/*
 * fsctl(2): filesystem-specific control call by path.  Looks up the
 * path (honoring FSOPT_NOFOLLOW and firmlink handling), performs
 * the MAC check, and dispatches to fsctl_internal(), which may drop
 * the vnode iocount and NULL out vp.
 */
int
fsctl(proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
{
	int error;
	struct nameidata nd;
	u_long nameiflags;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(cmd, uap->cmd);
	AUDIT_ARG(value32, uap->options);
	/* Get the vnode for the file we are getting info on: */
	nameiflags = 0;
	//
	// if we come through fsctl() then the file is by definition not open.
	// therefore for the FSIOC_FD_ONLY_OPEN_ONCE selector we return an error
	// lest the caller mistakenly thinks the only open is their own (but in
	// reality it's someone else's).
	//
	if (uap->cmd == FSIOC_FD_ONLY_OPEN_ONCE) {
		return EINVAL;
	}
	if ((uap->options & FSOPT_NOFOLLOW) == 0) {
		nameiflags |= FOLLOW;
	}
	if (uap->cmd == FSIOC_FIRMLINK_CTL) {
		nameiflags |= (CN_FIRMLINK_NOFOLLOW | NOCACHE);
	}
	NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
	    UIO_USERSPACE, uap->path, ctx);
	if ((error = namei(&nd))) {
		goto done;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

#if CONFIG_MACF
	error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
	if (error) {
		goto done;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

done:
	/* fsctl_internal() may have released the iocount and cleared vp. */
	if (vp) {
		vnode_put(vp);
	}
	return error;
}
11329 /* ARGSUSED */
/*
 * ffsctl(2): filesystem-specific control call by file descriptor.
 * Takes an iocount on the fd's vnode, performs the MAC check, and
 * dispatches to fsctl_internal(), which may drop the iocount and
 * NULL out vp.
 */
int
ffsctl(proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
{
	int error;
	vnode_t vp = NULL;
	vfs_context_t ctx = vfs_context_current();
	int fd = -1;

	AUDIT_ARG(fd, uap->fd);
	AUDIT_ARG(cmd, uap->cmd);
	AUDIT_ARG(value32, uap->options);

	/* Get the vnode for the file we are getting info on: */
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	fd = uap->fd;
	if ((error = vnode_getwithref(vp))) {
		file_drop(fd);
		return error;
	}

#if CONFIG_MACF
	if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
		file_drop(fd);
		vnode_put(vp);
		return error;
	}
#endif

	error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);

	file_drop(fd);

	/*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
	if (vp) {
		vnode_put(vp);
	}

	return error;
}
11371 /* end of fsctl system call */
11372
11373 /*
11374 * Retrieve the data of an extended attribute.
11375 */
int
getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These two options are not accepted from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	if (xattr_protected(attrname)) {
		/* Of the protected names, only the superuser may read
		 * com.apple.system.Security; everything else is EPERM. */
		if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
			error = EPERM;
			goto out;
		}
	}
	/*
	 * the specific check for 0xffffffff is a hack to preserve
	 * binary compatibility in K64 with applications that discovered
	 * that passing in a buf pointer and a size of -1 resulted in
	 * just the size of the indicated extended attribute being returned.
	 * this isn't part of the documented behavior, but because of the
	 * original implementation's check for "uap->size > 0", this behavior
	 * was allowed. In K32 that check turned into a signed comparison
	 * even though uap->size is unsigned... in K64, we blow by that
	 * check because uap->size is unsigned and doesn't get sign smeared
	 * in the munger for a 32 bit user app. we also need to add a
	 * check to limit the maximum size of the buffer being passed in...
	 * unfortunately, the underlying filesystems seem to just malloc
	 * the requested size even if the actual extended attribute is tiny.
	 * because that malloc is for kernel wired memory, we have to put a
	 * sane limit on it.
	 *
	 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
	 * U64 running on K64 will yield -1 (64 bits wide)
	 * U32/U64 running on K32 will yield -1 (32 bits wide)
	 */
	if (uap->size == 0xffffffff || uap->size == (size_t)-1) {
		goto no_uio;
	}

	if (uap->value) {
		/* Clamp the request to the kernel's wired-memory limit. */
		if (uap->size > (size_t)XATTR_MAXSIZE) {
			uap->size = XATTR_MAXSIZE;
		}

		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}
no_uio:
	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
out:
	vnode_put(vp);

	/* With a buffer: bytes read; without: the attribute's size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}

	return error;
}
11460
11461 /*
11462 * Retrieve the data of an extended attribute.
11463 */
int
fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t attrsize = 0;
	size_t namelen;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These options are rejected for the fd-based variant. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}
	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		goto out;
	}
	/* Unlike getxattr(), no superuser exception here. */
	if (xattr_protected(attrname)) {
		error = EPERM;
		goto out;
	}
	if (uap->value && uap->size > 0) {
		auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
		    &uio_buf[0], sizeof(uio_buf));
		uio_addiov(auio, uap->value, uap->size);
	}

	error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
out:
	(void)vnode_put(vp);
	file_drop(uap->fd);

	/* With a buffer: bytes read; without: the attribute's size. */
	if (auio) {
		*retval = uap->size - uio_resid(auio);
	} else {
		*retval = (user_ssize_t)attrsize;
	}
	return error;
}
11513
11514 /*
11515 * Set the data of an extended attribute.
11516 */
int
setxattr(proc_t p, struct setxattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];
	vfs_context_t ctx = vfs_context_current();
	uio_t auio = NULL;
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	size_t namelen;
	u_int32_t nameiflags;
	int error;
	char uio_buf[UIO_SIZEOF(1)];

	/* These two options are not accepted from user space. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		if (error == EPERM) {
			/* if the string won't fit in attrname, copyinstr emits EPERM */
			return ENAMETOOLONG;
		}
		/* Otherwise return the default error from copyinstr to detect ERANGE, etc */
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	/* A non-zero size requires a buffer to read the value from. */
	if (uap->size != 0 && uap->value == 0) {
		return EINVAL;
	}

	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;
	nameidone(&nd);

	auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
	    &uio_buf[0], sizeof(uio_buf));
	uio_addiov(auio, uap->value, uap->size);

	error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
#if CONFIG_FSE
	if (error == 0) {
		/* Publish an fsevent so watchers see the xattr change. */
		add_fsevent(FSE_XATTR_MODIFIED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
11575
11576 /*
11577 * Set the data of an extended attribute.
11578 */
11579 int
11580 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
11581 {
11582 vnode_t vp;
11583 char attrname[XATTR_MAXNAMELEN + 1];
11584 uio_t auio = NULL;
11585 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11586 size_t namelen;
11587 int error;
11588 char uio_buf[UIO_SIZEOF(1)];
11589 #if CONFIG_FSE
11590 vfs_context_t ctx = vfs_context_current();
11591 #endif
11592
11593 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11594 return EINVAL;
11595 }
11596
11597 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
11598 if (error != 0) {
11599 if (error == EPERM) {
11600 /* if the string won't fit in attrname, copyinstr emits EPERM */
11601 return ENAMETOOLONG;
11602 }
11603 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
11604 return error;
11605 }
11606 if (xattr_protected(attrname)) {
11607 return EPERM;
11608 }
11609 if (uap->size != 0 && uap->value == 0) {
11610 return EINVAL;
11611 }
11612 if ((error = file_vnode(uap->fd, &vp))) {
11613 return error;
11614 }
11615 if ((error = vnode_getwithref(vp))) {
11616 file_drop(uap->fd);
11617 return error;
11618 }
11619 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
11620 &uio_buf[0], sizeof(uio_buf));
11621 uio_addiov(auio, uap->value, uap->size);
11622
11623 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
11624 #if CONFIG_FSE
11625 if (error == 0) {
11626 add_fsevent(FSE_XATTR_MODIFIED, ctx,
11627 FSE_ARG_VNODE, vp,
11628 FSE_ARG_DONE);
11629 }
11630 #endif
11631 vnode_put(vp);
11632 file_drop(uap->fd);
11633 *retval = 0;
11634 return error;
11635 }
11636
11637 /*
11638 * Remove an extended attribute.
11639 * XXX Code duplication here.
11640 */
int
removexattr(proc_t p, struct removexattr_args *uap, int *retval)
{
	vnode_t vp;
	struct nameidata nd;
	char attrname[XATTR_MAXNAMELEN + 1];	/* NUL-terminated kernel copy of the name */
	int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
	vfs_context_t ctx = vfs_context_current();
	size_t namelen;
	u_int32_t nameiflags;
	int error;

	/* These two options are rejected for this call. */
	if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		/*
		 * NOTE(review): unlike setxattr/fsetxattr above, a too-long
		 * name (copyinstr's EPERM) is NOT remapped to ENAMETOOLONG
		 * here — confirm whether that asymmetry is intentional.
		 */
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
	NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
	if ((error = namei(&nd))) {
		return error;
	}
	vp = nd.ni_vp;	/* vp carries the iocount from namei; released below */
	nameidone(&nd);

	error = vn_removexattr(vp, attrname, uap->options, ctx);
#if CONFIG_FSE
	/* Notify fseventsd only on a successful removal. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	*retval = 0;
	return error;
}
11684
11685 /*
11686 * Remove an extended attribute.
11687 * XXX Code duplication here.
11688 */
int
fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
{
	vnode_t vp;
	char attrname[XATTR_MAXNAMELEN + 1];	/* NUL-terminated kernel copy of the name */
	size_t namelen;
	int error;
#if CONFIG_FSE
	/*
	 * NOTE(review): ctx is only captured for the fsevent; the
	 * vn_removexattr call below makes its own vfs_context_current()
	 * call — both resolve the current thread's context.
	 */
	vfs_context_t ctx = vfs_context_current();
#endif

	/* NOFOLLOW is meaningless on an fd; the other two are rejected too. */
	if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
		return EINVAL;
	}

	error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
	if (error != 0) {
		/* No EPERM->ENAMETOOLONG remap here (matches removexattr). */
		return error;
	}
	if (xattr_protected(attrname)) {
		return EPERM;
	}
	if ((error = file_vnode(uap->fd, &vp))) {
		return error;
	}
	if ((error = vnode_getwithref(vp))) {
		file_drop(uap->fd);
		return error;
	}

	error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
#if CONFIG_FSE
	/* Notify fseventsd only on a successful removal. */
	if (error == 0) {
		add_fsevent(FSE_XATTR_REMOVED, ctx,
		    FSE_ARG_VNODE, vp,
		    FSE_ARG_DONE);
	}
#endif
	vnode_put(vp);
	file_drop(uap->fd);
	*retval = 0;
	return error;
}
11732
11733 /*
11734 * Retrieve the list of extended attribute names.
11735 * XXX Code duplication here.
11736 */
11737 int
11738 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
11739 {
11740 vnode_t vp;
11741 struct nameidata nd;
11742 vfs_context_t ctx = vfs_context_current();
11743 uio_t auio = NULL;
11744 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11745 size_t attrsize = 0;
11746 u_int32_t nameiflags;
11747 int error;
11748 char uio_buf[UIO_SIZEOF(1)];
11749
11750 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11751 return EINVAL;
11752 }
11753
11754 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11755 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
11756 if ((error = namei(&nd))) {
11757 return error;
11758 }
11759 vp = nd.ni_vp;
11760 nameidone(&nd);
11761 if (uap->namebuf != 0 && uap->bufsize > 0) {
11762 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
11763 &uio_buf[0], sizeof(uio_buf));
11764 uio_addiov(auio, uap->namebuf, uap->bufsize);
11765 }
11766
11767 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
11768
11769 vnode_put(vp);
11770 if (auio) {
11771 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11772 } else {
11773 *retval = (user_ssize_t)attrsize;
11774 }
11775 return error;
11776 }
11777
11778 /*
11779 * Retrieve the list of extended attribute names.
11780 * XXX Code duplication here.
11781 */
11782 int
11783 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11784 {
11785 vnode_t vp;
11786 uio_t auio = NULL;
11787 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11788 size_t attrsize = 0;
11789 int error;
11790 char uio_buf[UIO_SIZEOF(1)];
11791
11792 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) {
11793 return EINVAL;
11794 }
11795
11796 if ((error = file_vnode(uap->fd, &vp))) {
11797 return error;
11798 }
11799 if ((error = vnode_getwithref(vp))) {
11800 file_drop(uap->fd);
11801 return error;
11802 }
11803 if (uap->namebuf != 0 && uap->bufsize > 0) {
11804 auio = uio_createwithbuffer(1, 0, spacetype,
11805 UIO_READ, &uio_buf[0], sizeof(uio_buf));
11806 uio_addiov(auio, uap->namebuf, uap->bufsize);
11807 }
11808
11809 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11810
11811 vnode_put(vp);
11812 file_drop(uap->fd);
11813 if (auio) {
11814 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11815 } else {
11816 *retval = (user_ssize_t)attrsize;
11817 }
11818 return error;
11819 }
11820
11821 static int
11822 fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid,
11823 vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen)
11824 {
11825 int error;
11826 struct mount *mp = NULL;
11827 vnode_t vp;
11828 int length;
11829 int bpflags;
11830 /* maximum number of times to retry build_path */
11831 unsigned int retries = 0x10;
11832
11833 if (bufsize > PAGE_SIZE) {
11834 return EINVAL;
11835 }
11836
11837 if (buf == NULL) {
11838 return ENOMEM;
11839 }
11840
11841 retry:
11842 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11843 error = ENOTSUP; /* unexpected failure */
11844 return ENOTSUP;
11845 }
11846
11847 unionget:
11848 if (objid == 2) {
11849 struct vfs_attr vfsattr;
11850 int use_vfs_root = TRUE;
11851
11852 VFSATTR_INIT(&vfsattr);
11853 VFSATTR_WANTED(&vfsattr, f_capabilities);
11854 if (!(options & FSOPT_ISREALFSID) &&
11855 vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 &&
11856 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
11857 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) &&
11858 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) {
11859 use_vfs_root = FALSE;
11860 }
11861 }
11862
11863 if (use_vfs_root) {
11864 error = VFS_ROOT(mp, &vp, ctx);
11865 } else {
11866 error = VFS_VGET(mp, objid, &vp, ctx);
11867 }
11868 } else {
11869 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11870 }
11871
11872 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11873 /*
11874 * If the fileid isn't found and we're in a union
11875 * mount volume, then see if the fileid is in the
11876 * mounted-on volume.
11877 */
11878 struct mount *tmp = mp;
11879 mp = vnode_mount(tmp->mnt_vnodecovered);
11880 vfs_unbusy(tmp);
11881 if (vfs_busy(mp, LK_NOWAIT) == 0) {
11882 goto unionget;
11883 }
11884 } else {
11885 vfs_unbusy(mp);
11886 }
11887
11888 if (error) {
11889 return error;
11890 }
11891
11892 #if CONFIG_MACF
11893 error = mac_vnode_check_fsgetpath(ctx, vp);
11894 if (error) {
11895 vnode_put(vp);
11896 return error;
11897 }
11898 #endif
11899
11900 /* Obtain the absolute path to this vnode. */
11901 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11902 if (options & FSOPT_NOFIRMLINKPATH) {
11903 bpflags |= BUILDPATH_NO_FIRMLINK;
11904 }
11905 bpflags |= BUILDPATH_CHECK_MOVED;
11906 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11907 vnode_put(vp);
11908
11909 if (error) {
11910 /* there was a race building the path, try a few more times */
11911 if (error == EAGAIN) {
11912 --retries;
11913 if (retries > 0) {
11914 goto retry;
11915 }
11916
11917 error = ENOENT;
11918 }
11919 goto out;
11920 }
11921
11922 AUDIT_ARG(text, buf);
11923
11924 if (kdebug_enable) {
11925 long dbg_parms[NUMPARMS];
11926 int dbg_namelen;
11927
11928 dbg_namelen = (int)sizeof(dbg_parms);
11929
11930 if (length < dbg_namelen) {
11931 memcpy((char *)dbg_parms, buf, length);
11932 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11933
11934 dbg_namelen = length;
11935 } else {
11936 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11937 }
11938
11939 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11940 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11941 }
11942
11943 *pathlen = (user_ssize_t)length; /* may be superseded by error */
11944
11945 out:
11946 return error;
11947 }
11948
11949 /*
11950 * Obtain the full pathname of a file system object by id.
11951 */
/*
 * Common implementation for fsgetpath() and fsgetpath_ext(): copy in the
 * fsid, resolve (fsid, objid) to a path via fsgetpath_internal(), and copy
 * the result out to the user buffer.  Returns the path length via retval.
 */
static int
fsgetpath_extended(user_addr_t buf, int bufsize, user_addr_t user_fsid, uint64_t objid,
    uint32_t options, user_ssize_t *retval)
{
	vfs_context_t ctx = vfs_context_current();
	fsid_t fsid;
	char *realpath;
	int length;
	int error;

	/* Only these two options are recognized. */
	if (options & ~(FSOPT_NOFIRMLINKPATH | FSOPT_ISREALFSID)) {
		return EINVAL;
	}

	if ((error = copyin(user_fsid, (caddr_t)&fsid, sizeof(fsid)))) {
		return error;
	}
	AUDIT_ARG(value32, fsid.val[0]);
	AUDIT_ARG(value64, objid);
	/* Restrict output buffer size for now. */

	if (bufsize > PAGE_SIZE || bufsize <= 0) {
		return EINVAL;
	}
	MALLOC(realpath, char *, bufsize, M_TEMP, M_WAITOK | M_ZERO);
	if (realpath == NULL) {
		return ENOMEM;
	}

	/* Only the first fsid word (the volfs id) is needed for the lookup. */
	error = fsgetpath_internal(ctx, fsid.val[0], objid, bufsize, realpath,
	    options, &length);

	if (error) {
		goto out;
	}

	error = copyout((caddr_t)realpath, buf, length);

	*retval = (user_ssize_t)length; /* may be superseded by error */
out:
	if (realpath) {
		FREE(realpath, M_TEMP);
	}
	return error;
}
11997
/* fsgetpath(2): legacy entry point, no option flags. */
int
fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           0, retval);
}
12004
/* fsgetpath_ext(2): same as fsgetpath(2) but passes caller options through. */
int
fsgetpath_ext(__unused proc_t p, struct fsgetpath_ext_args *uap, user_ssize_t *retval)
{
	return fsgetpath_extended(uap->buf, uap->bufsize, uap->fsid, uap->objid,
	           uap->options, retval);
}
12011
12012 /*
12013 * Common routine to handle various flavors of statfs data heading out
12014 * to user space.
12015 *
12016 * Returns: 0 Success
12017 * EFAULT
12018 */
static int
munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
    user_addr_t bufp, int *sizep, boolean_t is_64_bit,
    boolean_t partial_copy)
{
	int error;
	int my_size, copy_size;

	if (is_64_bit) {
		struct user64_statfs sfs;
		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);
		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;
		sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
		sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
		sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
		sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
		sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
		sfs.f_files = (user64_long_t)sfsp->f_files;
		sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* MNTK_TYPENAME_OVERRIDE lets the mount report a substitute fs name. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* A partial copy omits the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	} else {
		struct user32_statfs sfs;

		my_size = copy_size = sizeof(sfs);
		bzero(&sfs, my_size);

		sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
		sfs.f_type = mp->mnt_vtable->vfc_typenum;
		sfs.f_reserved1 = (short)sfsp->f_fssubtype;

		/*
		 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
		 * have to fudge the numbers here in that case. We inflate the blocksize in order
		 * to reflect the filesystem size as best we can.
		 */
		if ((sfsp->f_blocks > INT_MAX)
		    /* Hack for 4061702 . I think the real fix is for Carbon to
		     * look for some volume capability and not depend on hidden
		     * semantics agreed between a FS and carbon.
		     * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
		     * for Carbon to set bNoVolumeSizes volume attribute.
		     * Without this the webdavfs files cannot be copied onto
		     * disk as they look huge. This change should not affect
		     * XSAN as they should not setting these to -1..
		     */
		    && (sfsp->f_blocks != 0xffffffffffffffffULL)
		    && (sfsp->f_bfree != 0xffffffffffffffffULL)
		    && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
			int shift;

			/*
			 * Work out how far we have to shift the block count down to make it fit.
			 * Note that it's possible to have to shift so far that the resulting
			 * blocksize would be unreportably large. At that point, we will clip
			 * any values that don't fit.
			 *
			 * For safety's sake, we also ensure that f_iosize is never reported as
			 * being smaller than f_bsize.
			 */
			for (shift = 0; shift < 32; shift++) {
				if ((sfsp->f_blocks >> shift) <= INT_MAX) {
					break;
				}
				if ((sfsp->f_bsize << (shift + 1)) > INT_MAX) {
					break;
				}
			}
#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
			sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
			sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
			sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
#undef __SHIFT_OR_CLIP
			sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
			sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
		} else {
			/* filesystem is small enough to be reported honestly */
			sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
			sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
			sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
			sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
			sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
		}
		sfs.f_files = (user32_long_t)sfsp->f_files;
		sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
		sfs.f_fsid = sfsp->f_fsid;
		sfs.f_owner = sfsp->f_owner;
		/* MNTK_TYPENAME_OVERRIDE lets the mount report a substitute fs name. */
		if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
			strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
		} else {
			strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
		}
		strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
		strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);

		/* A partial copy omits the trailing reserved fields. */
		if (partial_copy) {
			copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
		}
		error = copyout((caddr_t)&sfs, bufp, copy_size);
	}

	/* Report the full (non-partial) structure size to the caller. */
	if (sizep != NULL) {
		*sizep = my_size;
	}
	return error;
}
12140
12141 /*
12142 * copy stat structure into user_stat structure.
12143 */
/* Field-by-field copy of a kernel struct stat into the 64-bit user layout. */
void
munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE: timespec structs
	 * vs. split sec/nsec members; the values copied are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12180
/* Field-by-field copy of a kernel struct stat into the 32-bit user layout. */
void
munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE: timespec structs
	 * vs. split sec/nsec members; the values copied are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12217
12218 /*
12219 * copy stat64 structure into user_stat64 structure.
12220 */
/* Field-by-field copy of a kernel struct stat64 into the 64-bit user layout
 * (adds birthtime relative to the plain stat variants above). */
void
munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE: timespec structs
	 * vs. split sec/nsec members; the values copied are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12261
/* Field-by-field copy of a kernel struct stat64 into the 32-bit user layout
 * (adds birthtime relative to the plain stat variants above). */
void
munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
{
	bzero(usbp, sizeof(*usbp));

	usbp->st_dev = sbp->st_dev;
	usbp->st_ino = sbp->st_ino;
	usbp->st_mode = sbp->st_mode;
	usbp->st_nlink = sbp->st_nlink;
	usbp->st_uid = sbp->st_uid;
	usbp->st_gid = sbp->st_gid;
	usbp->st_rdev = sbp->st_rdev;
	/* Timestamp field names differ with _POSIX_C_SOURCE: timespec structs
	 * vs. split sec/nsec members; the values copied are the same. */
#ifndef _POSIX_C_SOURCE
	usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
	usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
	usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
	usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
	usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
	usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
	usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
	usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
#else
	usbp->st_atime = sbp->st_atime;
	usbp->st_atimensec = sbp->st_atimensec;
	usbp->st_mtime = sbp->st_mtime;
	usbp->st_mtimensec = sbp->st_mtimensec;
	usbp->st_ctime = sbp->st_ctime;
	usbp->st_ctimensec = sbp->st_ctimensec;
	usbp->st_birthtime = sbp->st_birthtime;
	usbp->st_birthtimensec = sbp->st_birthtimensec;
#endif
	usbp->st_size = sbp->st_size;
	usbp->st_blocks = sbp->st_blocks;
	usbp->st_blksize = sbp->st_blksize;
	usbp->st_flags = sbp->st_flags;
	usbp->st_gen = sbp->st_gen;
	usbp->st_lspare = sbp->st_lspare;
	usbp->st_qspare[0] = sbp->st_qspare[0];
	usbp->st_qspare[1] = sbp->st_qspare[1];
}
12302
12303 /*
12304 * Purge buffer cache for simulating cold starts
12305 */
/*
 * Per-vnode iterator callback for vfs_purge(): push and invalidate all of
 * the vnode's cached UBC pages.
 */
static int
vnode_purge_callback(struct vnode *vp, __unused void *cargs)
{
	ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);

	return VNODE_RETURNED;
}
12313
/*
 * Per-mount iterator callback for vfs_purge(): run the vnode purge
 * callback over every vnode on the mount.
 */
static int
vfs_purge_callback(mount_t mp, __unused void * arg)
{
	vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);

	return VFS_RETURNED;
}
12321
12322 int
12323 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
12324 {
12325 if (!kauth_cred_issuser(kauth_cred_get())) {
12326 return EPERM;
12327 }
12328
12329 vfs_iterate(0 /* flags */, vfs_purge_callback, NULL);
12330
12331 return 0;
12332 }
12333
12334 /*
12335 * gets the vnode associated with the (unnamed) snapshot directory
12336 * for a Filesystem. The snapshot directory vnode is returned with
12337 * an iocount on it.
12338 */
int
vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
{
	/* Delegate to the filesystem; *sdvpp comes back with an iocount. */
	return VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx);
}
12344
12345 /*
12346 * Get the snapshot vnode.
12347 *
12348 * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
12349 * needs nameidone() on ndp.
12350 *
12351 * If the snapshot vnode exists it is returned in ndp->ni_vp.
12352 *
12353 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
12354 * not needed.
12355 */
static int
vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
    user_addr_t name, struct nameidata *ndp, int32_t op,
#if !CONFIG_TRIGGERS
    __unused
#endif
    enum path_operation pathop,
    vfs_context_t ctx)
{
	int error, i;
	caddr_t name_buf;
	size_t name_len;
	struct vfs_attr vfa;

	/* Pre-clear outputs so the error path below can unwind uniformly. */
	*sdvpp = NULLVP;
	*rvpp = NULLVP;

	error = vnode_getfromfd(ctx, dirfd, rvpp);
	if (error) {
		return error;
	}

	/* dirfd must refer to the root of a mounted volume. */
	if (!vnode_isvroot(*rvpp)) {
		error = EINVAL;
		goto out;
	}

	/* Make sure the filesystem supports snapshots */
	VFSATTR_INIT(&vfa);
	VFSATTR_WANTED(&vfa, f_capabilities);
	if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
	    !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
	    !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT)) ||
	    !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
	    VOL_CAP_INT_SNAPSHOT))) {
		error = ENOTSUP;
		goto out;
	}

	error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
	if (error) {
		goto out;
	}

	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- name can't be empty, "." or ".." or have slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 */
	if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
	    (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/': the snapshot name must be a single component. */
	for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	if (op == CREATE) {
		error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
		    name_buf);
	} else if (op == DELETE) {
		error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
		    name_buf);
	}
	if (error) {
		goto out1;
	}
#endif

	/* Check if the snapshot already exists ... */
	NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
	ndp->ni_dvp = *sdvpp;

	error = namei(ndp);
out1:
	FREE(name_buf, M_TEMP);
out:
	/* On failure, release any iocounts taken above and NULL the outputs. */
	if (error) {
		if (*sdvpp) {
			vnode_put(*sdvpp);
			*sdvpp = NULLVP;
		}
		if (*rvpp) {
			vnode_put(*rvpp);
			*rvpp = NULLVP;
		}
	}
	return error;
}
12458
12459 /*
12460 * create a filesystem snapshot (for supporting filesystems)
12461 *
12462 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
12463 * We get to the (unnamed) snapshot directory vnode and create the vnode
12464 * for the snapshot in it.
12465 *
12466 * Restrictions:
12467 *
12468 * a) Passed in name for snapshot cannot have slashes.
12469 * b) name can't be "." or ".."
12470 *
12471 * Since this requires superuser privileges, vnode_authorize calls are not
12472 * made.
12473 */
static int
snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error;
	struct nameidata namend;

	/* Resolve the volume root and snapshot directory; look up the name. */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
	    OP_LINK, ctx);
	if (error) {
		return error;
	}

	if (namend.ni_vp) {
		/* The lookup found an existing snapshot with that name. */
		vnode_put(namend.ni_vp);
		error = EEXIST;
	} else {
		struct vnode_attr va;
		vnode_t vp = NULLVP;

		/* Create the snapshot as a regular file with mode 0. */
		VATTR_INIT(&va);
		VATTR_SET(&va, va_type, VREG);
		VATTR_SET(&va, va_mode, 0);

		/* Superuser-only path: skip authorization and inheritance. */
		error = vn_create(snapdvp, &vp, &namend, &va,
		    VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
		if (!error && vp) {
			vnode_put(vp);
		}
	}

	nameidone(&namend);
	vnode_put(snapdvp);
	vnode_put(rvp);
	return error;
}
12511
12512 /*
12513 * Delete a Filesystem snapshot
12514 *
12515 * get the vnode for the unnamed snapshot directory and the snapshot and
12516 * delete the snapshot.
12517 */
12518 static int
12519 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
12520 vfs_context_t ctx)
12521 {
12522 vnode_t rvp, snapdvp;
12523 int error;
12524 struct nameidata namend;
12525
12526 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
12527 OP_UNLINK, ctx);
12528 if (error) {
12529 goto out;
12530 }
12531
12532 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
12533 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
12534
12535 vnode_put(namend.ni_vp);
12536 nameidone(&namend);
12537 vnode_put(snapdvp);
12538 vnode_put(rvp);
12539 out:
12540 return error;
12541 }
12542
12543 /*
12544 * Revert a filesystem to a snapshot
12545 *
12546 * Marks the filesystem to revert to the given snapshot on next mount.
12547 */
static int
snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
    vfs_context_t ctx)
{
	int error;
	vnode_t rvp;
	mount_t mp;
	struct fs_snapshot_revert_args revert_data;
	struct componentname cnp;
	caddr_t name_buf;
	size_t name_len;

	error = vnode_getfromfd(ctx, dirfd, &rvp);
	if (error) {
		return error;
	}
	mp = vnode_mount(rvp);

	MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
	if (error) {
		FREE(name_buf, M_TEMP);
		vnode_put(rvp);
		return error;
	}
#endif

	/*
	 * Grab mount_iterref so that we can release the vnode,
	 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
	 */
	error = mount_iterref(mp, 0);
	vnode_put(rvp);
	if (error) {
		FREE(name_buf, M_TEMP);
		return error;
	}

	/* Hand the snapshot name to the FS as a componentname. */
	memset(&cnp, 0, sizeof(cnp));
	cnp.cn_pnbuf = (char *)name_buf;
	cnp.cn_nameiop = LOOKUP;
	cnp.cn_flags = ISLASTCN | HASBUF;
	cnp.cn_pnlen = MAXPATHLEN;
	cnp.cn_nameptr = cnp.cn_pnbuf;
	cnp.cn_namelen = (int)name_len;
	revert_data.sr_cnp = &cnp;

	error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
	mount_iterdrop(mp);
	FREE(name_buf, M_TEMP);

	if (error) {
		/* If there was any error, try again using VNOP_IOCTL */

		vnode_t snapdvp;
		struct nameidata namend;

		/* Re-resolve everything: the snapshot vnode itself is needed now. */
		error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
		    OP_LOOKUP, ctx);
		if (error) {
			return error;
		}


		error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
		    0, ctx);

		vnode_put(namend.ni_vp);
		nameidone(&namend);
		vnode_put(snapdvp);
		vnode_put(rvp);
	}

	return error;
}
12631
12632 /*
12633 * rename a Filesystem snapshot
12634 *
12635 * get the vnode for the unnamed snapshot directory and the snapshot and
12636 * rename the snapshot. This is a very specialised (and simple) case of
12637 * rename(2) (which has to deal with a lot more complications). It differs
12638 * slightly from rename(2) in that EEXIST is returned if the new name exists.
12639 */
static int
snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
    __unused uint32_t flags, vfs_context_t ctx)
{
	vnode_t rvp, snapdvp;
	int error, i;
	caddr_t newname_buf;
	size_t name_len;
	vnode_t fvp;
	struct nameidata *fromnd, *tond;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata from_node;
		struct nameidata to_node;
	} * __rename_data;

	MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
	fromnd = &__rename_data->from_node;
	tond = &__rename_data->to_node;

	/*
	 * Resolve the source snapshot (DELETE semantics, as for rename(2)'s
	 * "from" side); on success we hold iocounts on rvp, snapdvp and
	 * fromnd->ni_vp.
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
	    OP_UNLINK, ctx);
	if (error) {
		goto out;
	}
	fvp = fromnd->ni_vp;

	/* Copy in the destination name for validation and the "to" lookup. */
	MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
	error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
	if (error) {
		goto out1;
	}

	/*
	 * Some sanity checks- new name can't be empty, "." or ".." or have
	 * slashes.
	 * (the length returned by copyinstr includes the terminating NUL)
	 *
	 * The FS rename VNOP is supposed to handle this but we'll pick it
	 * off here itself.
	 */
	if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
	    (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
		error = EINVAL;
		goto out1;
	}
	/* Scan for a '/': if the loop stops early, the name contained one. */
	for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++) {
		;
	}
	if (i < (int)name_len) {
		error = EINVAL;
		goto out1;
	}

#if CONFIG_MACF
	/* Renaming to a new name is treated as creating that name. */
	error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
	    newname_buf);
	if (error) {
		goto out1;
	}
#endif

	/*
	 * Look up the destination name relative to the snapshot directory
	 * (USEDVP: namei starts at tond->ni_dvp rather than a path root).
	 */
	NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
	    UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
	tond->ni_dvp = snapdvp;

	error = namei(tond);
	if (error) {
		goto out2;
	} else if (tond->ni_vp) {
		/*
		 * snapshot rename behaves differently than rename(2) - if the
		 * new name exists, EEXIST is returned.
		 */
		vnode_put(tond->ni_vp);
		error = EEXIST;
		goto out2;
	}

	/* Both source and destination live in snapdvp, so it is both dvps. */
	error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
	    &tond->ni_cnd, ctx);

	/* Unwind in the reverse order resources were acquired. */
out2:
	nameidone(tond);
out1:
	FREE(newname_buf, M_TEMP);
	vnode_put(fvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(fromnd);
out:
	FREE(__rename_data, M_TEMP);
	return error;
}
12734
12735 /*
12736 * Mount a Filesystem snapshot
12737 *
12738 * get the vnode for the unnamed snapshot directory and the snapshot and
12739 * mount the snapshot.
12740 */
static int
snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
    __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
{
	mount_t mp;
	vnode_t rvp, snapdvp, snapvp, vp, pvp;
	struct fs_snapshot_mount_args smnt_data;
	int error;
	struct nameidata *snapndp, *dirndp;
	/* carving out a chunk for structs that are too big to be on stack. */
	struct {
		struct nameidata snapnd;
		struct nameidata dirnd;
	} * __snapshot_mount_data;

	MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
	    M_TEMP, M_WAITOK);
	snapndp = &__snapshot_mount_data->snapnd;
	dirndp = &__snapshot_mount_data->dirnd;

	/*
	 * Resolve the snapshot to be mounted; on success we hold iocounts on
	 * rvp, snapdvp and the snapshot vnode itself (snapndp->ni_vp).
	 */
	error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
	    OP_LOOKUP, ctx);
	if (error) {
		goto out;
	}

	snapvp = snapndp->ni_vp;
	/* Refuse if the source filesystem is gone or being torn down. */
	if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
		error = EIO;
		goto out1;
	}

	/* Get the vnode to be covered */
	NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
	    UIO_USERSPACE, directory, ctx);
	error = namei(dirndp);
	if (error) {
		goto out1;
	}

	vp = dirndp->ni_vp;
	pvp = dirndp->ni_dvp;
	mp = vnode_mount(rvp);

	/* Disallow covering the root of the root filesystem. */
	if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
		error = EINVAL;
		goto out2;
	}

#if CONFIG_MACF
	error = mac_mount_check_snapshot_mount(ctx, rvp, vp, &dirndp->ni_cnd, snapndp->ni_cnd.cn_nameptr,
	    mp->mnt_vfsstat.f_fstypename);
	if (error) {
		goto out2;
	}
#endif

	/*
	 * Hand the source mount and snapshot name to mount_common(), which
	 * performs the actual (read-only, KERNEL_MOUNT_SNAPSHOT) mount.
	 * Only MNT_DONTBROWSE is honored from the caller's flags.
	 */
	smnt_data.sm_mp = mp;
	smnt_data.sm_cnp = &snapndp->ni_cnd;
	error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
	    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
	    KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);

	/* Unwind in the reverse order resources were acquired. */
out2:
	vnode_put(vp);
	vnode_put(pvp);
	nameidone(dirndp);
out1:
	vnode_put(snapvp);
	vnode_put(snapdvp);
	vnode_put(rvp);
	nameidone(snapndp);
out:
	FREE(__snapshot_mount_data, M_TEMP);
	return error;
}
12817
12818 /*
12819 * Root from a snapshot of the filesystem
12820 *
12821 * Marks the filesystem to root from the given snapshot on next boot.
12822 */
12823 static int
12824 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
12825 vfs_context_t ctx)
12826 {
12827 int error;
12828 vnode_t rvp;
12829 mount_t mp;
12830 struct fs_snapshot_root_args root_data;
12831 struct componentname cnp;
12832 caddr_t name_buf;
12833 size_t name_len;
12834
12835 error = vnode_getfromfd(ctx, dirfd, &rvp);
12836 if (error) {
12837 return error;
12838 }
12839 mp = vnode_mount(rvp);
12840
12841 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12842 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12843 if (error) {
12844 FREE(name_buf, M_TEMP);
12845 vnode_put(rvp);
12846 return error;
12847 }
12848
12849 // XXX MAC checks ?
12850
12851 /*
12852 * Grab mount_iterref so that we can release the vnode,
12853 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
12854 */
12855 error = mount_iterref(mp, 0);
12856 vnode_put(rvp);
12857 if (error) {
12858 FREE(name_buf, M_TEMP);
12859 return error;
12860 }
12861
12862 memset(&cnp, 0, sizeof(cnp));
12863 cnp.cn_pnbuf = (char *)name_buf;
12864 cnp.cn_nameiop = LOOKUP;
12865 cnp.cn_flags = ISLASTCN | HASBUF;
12866 cnp.cn_pnlen = MAXPATHLEN;
12867 cnp.cn_nameptr = cnp.cn_pnbuf;
12868 cnp.cn_namelen = (int)name_len;
12869 root_data.sr_cnp = &cnp;
12870
12871 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
12872
12873 mount_iterdrop(mp);
12874 FREE(name_buf, M_TEMP);
12875
12876 return error;
12877 }
12878
12879 /*
12880 * FS snapshot operations dispatcher
12881 */
int
fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
    __unused int32_t *retval)
{
	int error;
	vfs_context_t ctx = vfs_context_current();

	AUDIT_ARG(fd, uap->dirfd);
	AUDIT_ARG(value32, uap->op);

	/* All snapshot ops require the VFS snapshot privilege. */
	error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
	if (error) {
		return error;
	}

	/*
	 * Enforce user authorization for snapshot modification operations
	 */
	if ((uap->op != SNAPSHOT_OP_MOUNT) &&
	    (uap->op != SNAPSHOT_OP_ROOT)) {
		vnode_t dvp = NULLVP;
		vnode_t devvp = NULLVP;
		mount_t mp;

		error = vnode_getfromfd(ctx, uap->dirfd, &dvp);
		if (error) {
			return error;
		}
		mp = vnode_mount(dvp);
		devvp = mp->mnt_devvp;

		/* get an iocount on devvp */
		if (devvp == NULLVP) {
			/* No device vnode cached; resolve it from the mount-from name. */
			error = vnode_lookup(mp->mnt_vfsstat.f_mntfromname, 0, &devvp, ctx);
			/* for mounts which arent block devices */
			if (error == ENOENT) {
				error = ENXIO;
			}
		} else {
			error = vnode_getwithref(devvp);
		}

		if (error) {
			vnode_put(dvp);
			return error;
		}

		/*
		 * Non-superuser callers must also be authorized to write the
		 * underlying device — modifying snapshots modifies the volume.
		 */
		if ((vfs_context_issuser(ctx) == 0) &&
		    (vnode_authorize(devvp, NULL, KAUTH_VNODE_WRITE_DATA, ctx) != 0)) {
			error = EPERM;
		}
		vnode_put(dvp);
		vnode_put(devvp);

		if (error) {
			return error;
		}
	}

	/* Dispatch to the per-operation handler. */
	switch (uap->op) {
	case SNAPSHOT_OP_CREATE:
		error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_DELETE:
		error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_RENAME:
		error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
		    uap->flags, ctx);
		break;
	case SNAPSHOT_OP_MOUNT:
		error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
		    uap->data, uap->flags, ctx);
		break;
	case SNAPSHOT_OP_REVERT:
		error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#if CONFIG_MNT_ROOTSNAP
	case SNAPSHOT_OP_ROOT:
		error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
		break;
#endif /* CONFIG_MNT_ROOTSNAP */
	default:
		/* Unknown op, or SNAPSHOT_OP_ROOT on a non-root-snapshot config. */
		error = ENOSYS;
	}

	return error;
}