bsd/vfs/vfs_syscalls.c

   1 /*
   2  * Copyright (c) 1995-2017 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1989, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  * (c) UNIX System Laboratories, Inc.
  32  * All or some portions of this file are derived from material licensed
  33  * to the University of California by American Telephone and Telegraph
  34  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  35  * the permission of UNIX System Laboratories, Inc.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. All advertising materials mentioning features or use of this software
  46  *    must display the following acknowledgement:
  47  *      This product includes software developed by the University of
  48  *      California, Berkeley and its contributors.
  49  * 4. Neither the name of the University nor the names of its contributors
  50  *    may be used to endorse or promote products derived from this software
  51  *    without specific prior written permission.
  52  *
  53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  63  * SUCH DAMAGE.
  64  *
  65  *      @(#)vfs_syscalls.c      8.41 (Berkeley) 6/15/95
  66  */
  67 /*
  68  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  69  * support for mandatory and extensible security protections.  This notice
  70  * is included in support of clause 2.2 (b) of the Apple Public License,
  71  * Version 2.0.
  72  */
  73
  74 #include <sys/param.h>
  75 #include <sys/systm.h>
  76 #include <sys/namei.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/kernel.h>
  79 #include <sys/file_internal.h>
  80 #include <sys/stat.h>
  81 #include <sys/vnode_internal.h>
  82 #include <sys/mount_internal.h>
  83 #include <sys/proc_internal.h>
  84 #include <sys/kauth.h>
  85 #include <sys/uio_internal.h>
  86 #include <sys/malloc.h>
  87 #include <sys/mman.h>
  88 #include <sys/dirent.h>
  89 #include <sys/attr.h>
  90 #include <sys/sysctl.h>
  91 #include <sys/ubc.h>
  92 #include <sys/quota.h>
  93 #include <sys/kdebug.h>
  94 #include <sys/fsevents.h>
  95 #include <sys/imgsrc.h>
  96 #include <sys/sysproto.h>
  97 #include <sys/xattr.h>
  98 #include <sys/fcntl.h>
  99 #include <sys/fsctl.h>
 100 #include <sys/ubc_internal.h>
 101 #include <sys/disk.h>
 102 #include <sys/content_protection.h>
 103 #include <sys/clonefile.h>
 104 #include <sys/snapshot.h>
 105 #include <sys/priv.h>
 106 #include <machine/cons.h>
 107 #include <machine/limits.h>
 108 #include <miscfs/specfs/specdev.h>
 109
 110 #include <vfs/vfs_disk_conditioner.h>
 111
 112 #include <security/audit/audit.h>
 113 #include <bsm/audit_kevents.h>
 114
 115 #include <mach/mach_types.h>
 116 #include <kern/kern_types.h>
 117 #include <kern/kalloc.h>
 118 #include <kern/task.h>
 119
 120 #include <vm/vm_pageout.h>
 121 #include <vm/vm_protos.h>
 122
 123 #include <libkern/OSAtomic.h>
 124 #include <pexpert/pexpert.h>
 125 #include <IOKit/IOBSD.h>
 126
 127 #if ROUTEFS
 128 #include <miscfs/routefs/routefs.h>
 129 #endif /* ROUTEFS */
 130
 131 #if CONFIG_MACF
 132 #include <security/mac.h>
 133 #include <security/mac_framework.h>
 134 #endif
 135
/*
 * GET_PATH / RELEASE_PATH: obtain and give back a path buffer.
 *
 * With CONFIG_FSE the buffer comes from get_pathbuff() and is returned with
 * release_pathbuff(); otherwise a MAXPATHLEN-byte buffer is taken from (and
 * freed to) the M_NAMEI zone with M_WAITOK.
 *
 * NOTE(review): every expansion already ends in ';', so "GET_PATH(x);"
 * yields an extra empty statement and the macros are not safe as the
 * unbraced body of an if/else.  Call sites outside this view would need to
 * be checked before the trailing semicolons could be dropped.
 */
#if CONFIG_FSE
#define GET_PATH(x) \
        (x) = get_pathbuff();
#define RELEASE_PATH(x) \
        release_pathbuff(x);
#else
#define GET_PATH(x)     \
        MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
#define RELEASE_PATH(x) \
        FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
#endif /* CONFIG_FSE */
 147
 148 #ifndef HFS_GET_BOOT_INFO
 149 #define HFS_GET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00004)
 150 #endif
 151
 152 #ifndef HFS_SET_BOOT_INFO
 153 #define HFS_SET_BOOT_INFO   (FCNTL_FS_SPECIFIC_BASE + 0x00005)
 154 #endif
 155
 156 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
 157 #define APFSIOC_REVERT_TO_SNAPSHOT  _IOW('J', 1, u_int64_t)
 158 #endif
 159
 160 extern void disk_conditioner_unmount(mount_t mp);
 161
/*
 * struct for checkdirs iteration
 *
 * Argument bundle for checkdirs_callback(), which receives it through its
 * "void *arg" parameter.
 */
struct cdirargs {
        vnode_t olddp;  /* presumably the directory vnode being replaced -- TODO confirm in checkdirs() */
        vnode_t newdp;  /* presumably the vnode that takes its place -- TODO confirm in checkdirs() */
};
 167 /* callback  for checkdirs iteration */
 168 static int checkdirs_callback(proc_t p, void * arg);
 169
 170 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
 171 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
 172 void enablequotas(struct mount *mp, vfs_context_t ctx);
 173 static int getfsstat_callback(mount_t mp, void * arg);
 174 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
 175 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
 176 static int sync_callback(mount_t, void *);
 177 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
 178                         user_addr_t bufp, int *sizep, boolean_t is_64_bit,
 179                                                 boolean_t partial_copy);
 180 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
 181                         user_addr_t bufp);
 182 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
 183 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
 184                         struct componentname *cnp, user_addr_t fsmountargs,
 185                         int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
 186                         vfs_context_t ctx);
 187 void vfs_notify_mount(vnode_t pdvp);
 188
 189 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
 190
 191 struct fd_vn_data * fg_vn_data_alloc(void);
 192
 193 /*
  194  * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}.
  195  * Concurrent lookups (or lookups by id) on hard links can cause
  196  * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
  197  * does) to return ENOENT, as the path cannot be returned from the name cache
  198  * alone. We have no option but to retry and hope to get one namei->reverse path
  199  * generation done without an intervening lookup or lookup by id on the hard
  200  * link item. This is only an issue for MAC hooks which cannot re-enter the
  201  * filesystem; currently those are the MAC hooks for rename, unlink and rmdir.
 202  */
 203 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
 204
 205 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
 206
 207 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
 208
 209 #ifdef CONFIG_IMGSRC_ACCESS
 210 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
 211 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
 212 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
 213 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
 214 static void mount_end_update(mount_t mp);
 215 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
 216 #endif /* CONFIG_IMGSRC_ACCESS */
 217
 218 //snapshot functions
 219 #if CONFIG_MNT_ROOTSNAP
 220 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
 221 #else
 222 static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
 223 #endif
 224
 225 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
 226
 227 __private_extern__
 228 int sync_internal(void);
 229
 230 __private_extern__
 231 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
 232
 233 extern lck_grp_t *fd_vn_lck_grp;
 234 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
 235 extern lck_attr_t *fd_vn_lck_attr;
 236
 237 /*
 238  * incremented each time a mount or unmount operation occurs
 239  * used to invalidate the cached value of the rootvp in the
 240  * mount structure utilized by cache_lookup_path
 241  */
 242 uint32_t mount_generation = 0;
 243
 244 /* counts number of mount and unmount operations */
 245 unsigned int vfs_nummntops=0;
 246
 247 extern const struct fileops vnops;
 248 #if CONFIG_APPLEDOUBLE
 249 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
 250 #endif /* CONFIG_APPLEDOUBLE */
 251
 252 /*
 253  * Virtual File System System Calls
 254  */
 255
 256 #if NFSCLIENT || DEVFS || ROUTEFS
 257 /*
 258  * Private in-kernel mounting spi (NFS only, not exported)
 259  */
 260  __private_extern__
 261 boolean_t
 262 vfs_iskernelmount(mount_t mp)
 263 {
 264         return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
 265 }
 266
 267  __private_extern__
 268 int
 269 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
 270              void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
 271 {
 272         struct nameidata nd;
 273         boolean_t did_namei;
 274         int error;
 275
 276         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
 277                UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
 278
 279         /*
 280          * Get the vnode to be covered if it's not supplied
 281          */
 282         if (vp == NULLVP) {
 283                 error = namei(&nd);
 284                 if (error)
 285                         return (error);
 286                 vp = nd.ni_vp;
 287                 pvp = nd.ni_dvp;
 288                 did_namei = TRUE;
 289         } else {
 290                 char *pnbuf = CAST_DOWN(char *, path);
 291
 292                 nd.ni_cnd.cn_pnbuf = pnbuf;
 293                 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
 294                 did_namei = FALSE;
 295         }
 296
 297         error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
 298                              syscall_flags, kern_flags, NULL, TRUE, ctx);
 299
 300         if (did_namei) {
 301                 vnode_put(vp);
 302                 vnode_put(pvp);
 303                 nameidone(&nd);
 304         }
 305
 306         return (error);
 307 }
  308 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
 309
 310 /*
 311  * Mount a file system.
 312  */
 313 /* ARGSUSED */
 314 int
 315 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
 316 {
 317         struct __mac_mount_args muap;
 318
 319         muap.type = uap->type;
 320         muap.path = uap->path;
 321         muap.flags = uap->flags;
 322         muap.data = uap->data;
 323         muap.mac_p = USER_ADDR_NULL;
 324         return (__mac_mount(p, &muap, retval));
 325 }
 326
 327 int
 328 fmount(__unused proc_t p, struct fmount_args *uap, __unused int32_t *retval)
 329 {
 330         struct componentname    cn;
 331         vfs_context_t           ctx = vfs_context_current();
 332         size_t                  dummy = 0;
 333         int                     error;
 334         int                     flags = uap->flags;
 335         char                    fstypename[MFSNAMELEN];
 336         char                    *labelstr = NULL; /* regular mount call always sets it to NULL for __mac_mount() */
 337         vnode_t                 pvp;
 338         vnode_t                 vp;
 339
 340         AUDIT_ARG(fd, uap->fd);
 341         AUDIT_ARG(fflags, flags);
 342         /* fstypename will get audited by mount_common */
 343
 344         /* Sanity check the flags */
 345         if (flags & (MNT_IMGSRC_BY_INDEX|MNT_ROOTFS)) {
 346                 return (ENOTSUP);
 347         }
 348
 349         if (flags & MNT_UNION) {
 350                 return (EPERM);
 351         }
 352
 353         error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
 354         if (error) {
 355                 return (error);
 356         }
 357
 358         if ((error = file_vnode(uap->fd, &vp)) != 0) {
 359                 return (error);
 360         }
 361
 362         if ((error = vnode_getwithref(vp)) != 0) {
 363                 file_drop(uap->fd);
 364                 return (error);
 365         }
 366
 367         pvp = vnode_getparent(vp);
 368         if (pvp == NULL) {
 369                 vnode_put(vp);
 370                 file_drop(uap->fd);
 371                 return (EINVAL);
 372         }
 373
 374         memset(&cn, 0, sizeof(struct componentname));
 375         MALLOC(cn.cn_pnbuf, char *, MAXPATHLEN, M_TEMP, M_WAITOK);
 376         cn.cn_pnlen = MAXPATHLEN;
 377
 378         if((error = vn_getpath(vp, cn.cn_pnbuf, &cn.cn_pnlen)) != 0) {
 379                 FREE(cn.cn_pnbuf, M_TEMP);
 380                 vnode_put(pvp);
 381                 vnode_put(vp);
 382                 file_drop(uap->fd);
 383                 return (error);
 384         }
 385
 386         error = mount_common(fstypename, pvp, vp, &cn, uap->data, flags, 0, labelstr, FALSE, ctx);
 387
 388         FREE(cn.cn_pnbuf, M_TEMP);
 389         vnode_put(pvp);
 390         vnode_put(vp);
 391         file_drop(uap->fd);
 392
 393         return (error);
 394 }
 395
 396 void
 397 vfs_notify_mount(vnode_t pdvp)
 398 {
 399         vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
 400         lock_vnode_and_post(pdvp, NOTE_WRITE);
 401 }
 402
 403 /*
 404  * __mac_mount:
 405  *      Mount a file system taking into account MAC label behavior.
 406  *      See mount(2) man page for more information
 407  *
 408  * Parameters:    p                        Process requesting the mount
 409  *                uap                      User argument descriptor (see below)
 410  *                retval                   (ignored)
 411  *
 412  * Indirect:      uap->type                Filesystem type
 413  *                uap->path                Path to mount
 414  *                uap->data                Mount arguments
 415  *                uap->mac_p               MAC info
 416  *                uap->flags               Mount flags
 417  *
 418  *
 419  * Returns:        0                       Success
 420  *                !0                       Not success
 421  */
 422 boolean_t root_fs_upgrade_try = FALSE;
 423
 424 int
 425 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
 426 {
 427         vnode_t pvp = NULL;
 428         vnode_t vp = NULL;
 429         int need_nameidone = 0;
 430         vfs_context_t ctx = vfs_context_current();
 431         char fstypename[MFSNAMELEN];
 432         struct nameidata nd;
 433         size_t dummy=0;
 434         char *labelstr = NULL;
 435         int flags = uap->flags;
 436         int error;
 437 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
 438         boolean_t is_64bit = IS_64BIT_PROCESS(p);
 439 #else
 440 #pragma unused(p)
 441 #endif
 442         /*
 443          * Get the fs type name from user space
 444          */
 445         error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
 446         if (error)
 447                 return (error);
 448
 449         /*
 450          * Get the vnode to be covered
 451          */
 452         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
 453                UIO_USERSPACE, uap->path, ctx);
 454         error = namei(&nd);
 455         if (error) {
 456                 goto out;
 457         }
 458         need_nameidone = 1;
 459         vp = nd.ni_vp;
 460         pvp = nd.ni_dvp;
 461
 462 #ifdef CONFIG_IMGSRC_ACCESS
 463         /* Mounting image source cannot be batched with other operations */
 464         if (flags == MNT_IMGSRC_BY_INDEX) {
 465                 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
 466                                                   ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
 467                 goto out;
 468         }
 469 #endif /* CONFIG_IMGSRC_ACCESS */
 470
 471 #if CONFIG_MACF
 472         /*
 473          * Get the label string (if any) from user space
 474          */
 475         if (uap->mac_p != USER_ADDR_NULL) {
 476                 struct user_mac mac;
 477                 size_t ulen = 0;
 478
 479                 if (is_64bit) {
 480                         struct user64_mac mac64;
 481                         error = copyin(uap->mac_p, &mac64, sizeof(mac64));
 482                         mac.m_buflen = mac64.m_buflen;
 483                         mac.m_string = mac64.m_string;
 484                 } else {
 485                         struct user32_mac mac32;
 486                         error = copyin(uap->mac_p, &mac32, sizeof(mac32));
 487                         mac.m_buflen = mac32.m_buflen;
 488                         mac.m_string = mac32.m_string;
 489                 }
 490                 if (error)
 491                         goto out;
 492                 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
 493                     (mac.m_buflen < 2)) {
 494                         error = EINVAL;
 495                         goto out;
 496                 }
 497                 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
 498                 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
 499                 if (error) {
 500                         goto out;
 501                 }
 502                 AUDIT_ARG(mac_string, labelstr);
 503         }
 504 #endif /* CONFIG_MACF */
 505
 506         AUDIT_ARG(fflags, flags);
 507
 508 #if SECURE_KERNEL
 509         if (flags & MNT_UNION) {
 510                 /* No union mounts on release kernels */
 511                 error = EPERM;
 512                 goto out;
 513         }
 514 #endif
 515
 516         if ((vp->v_flag & VROOT) &&
 517                         (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
 518                 if (!(flags & MNT_UNION)) {
 519                         flags |= MNT_UPDATE;
 520                 }
 521                 else {
 522                         /*
 523                          * For a union mount on '/', treat it as fresh
 524                          * mount instead of update.
 525                          * Otherwise, union mouting on '/' used to panic the
 526                          * system before, since mnt_vnodecovered was found to
 527                          * be NULL for '/' which is required for unionlookup
 528                          * after it gets ENOENT on union mount.
 529                          */
 530                         flags = (flags & ~(MNT_UPDATE));
 531                 }
 532
 533 #if SECURE_KERNEL
 534                 if ((flags & MNT_RDONLY) == 0) {
 535                         /* Release kernels are not allowed to mount "/" as rw */
 536                         error = EPERM;
 537                         goto out;
 538                 }
 539 #endif
 540                 /*
 541                  * See 7392553 for more details on why this check exists.
 542                  * Suffice to say: If this check is ON and something tries
 543                  * to mount the rootFS RW, we'll turn off the codesign
 544                  * bitmap optimization.
 545                  */
 546 #if CHECK_CS_VALIDATION_BITMAP
 547                 if ((flags & MNT_RDONLY) == 0 ) {
 548                         root_fs_upgrade_try = TRUE;
 549                 }
 550 #endif
 551         }
 552
 553         error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
 554                              labelstr, FALSE, ctx);
 555
 556 out:
 557
 558 #if CONFIG_MACF
 559         if (labelstr)
 560                 FREE(labelstr, M_MACTEMP);
 561 #endif /* CONFIG_MACF */
 562
 563         if (vp) {
 564                 vnode_put(vp);
 565         }
 566         if (pvp) {
 567                 vnode_put(pvp);
 568         }
 569         if (need_nameidone) {
 570                 nameidone(&nd);
 571         }
 572
 573         return (error);
 574 }
 575
 576 /*
 577  * common mount implementation (final stage of mounting)
  578  *
  579  * Arguments:
  580  *  fstypename  file system type (i.e. its vfs name)
  581  *  pvp         parent of covered vnode
  582  *  vp          covered vnode
  583  *  cnp         component name (i.e. path) of covered vnode
 584  *  flags       generic mount flags
 585  *  fsmountargs file system specific data
 586  *  labelstr    optional MAC label
 587  *  kernelmount TRUE for mounts initiated from inside the kernel
 588  *  ctx         caller's context
 589  */
 590 static int
 591 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
 592              struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
 593              char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
 594 {
 595 #if !CONFIG_MACF
 596 #pragma unused(labelstr)
 597 #endif
 598         struct vnode *devvp = NULLVP;
 599         struct vnode *device_vnode = NULLVP;
 600 #if CONFIG_MACF
 601         struct vnode *rvp;
 602 #endif
 603         struct mount *mp;
 604         struct vfstable *vfsp = (struct vfstable *)0;
 605         struct proc *p = vfs_context_proc(ctx);
 606         int error, flag = 0;
 607         user_addr_t devpath = USER_ADDR_NULL;
 608         int ronly = 0;
 609         int mntalloc = 0;
 610         boolean_t vfsp_ref = FALSE;
 611         boolean_t is_rwlock_locked = FALSE;
 612         boolean_t did_rele = FALSE;
 613         boolean_t have_usecount = FALSE;
 614
 615         /*
 616          * Process an update for an existing mount
 617          */
 618         if (flags & MNT_UPDATE) {
 619                 if ((vp->v_flag & VROOT) == 0) {
 620                         error = EINVAL;
 621                         goto out1;
 622                 }
 623                 mp = vp->v_mount;
 624
 625                 /* unmount in progress return error */
 626                 mount_lock_spin(mp);
 627                 if (mp->mnt_lflag & MNT_LUNMOUNT) {
 628                         mount_unlock(mp);
 629                         error = EBUSY;
 630                         goto out1;
 631                 }
 632                 mount_unlock(mp);
 633                 lck_rw_lock_exclusive(&mp->mnt_rwlock);
 634                 is_rwlock_locked = TRUE;
 635                 /*
 636                  * We only allow the filesystem to be reloaded if it
 637                  * is currently mounted read-only.
 638                  */
 639                 if ((flags & MNT_RELOAD) &&
 640                     ((mp->mnt_flag & MNT_RDONLY) == 0)) {
 641                         error = ENOTSUP;
 642                         goto out1;
 643                 }
 644
 645                 /*
 646                  * If content protection is enabled, update mounts are not
 647                  * allowed to turn it off.
 648                  */
 649                 if ((mp->mnt_flag & MNT_CPROTECT) &&
 650                            ((flags & MNT_CPROTECT) == 0)) {
 651                         error = EINVAL;
 652                         goto out1;
 653                 }
 654
 655 #ifdef CONFIG_IMGSRC_ACCESS
 656                 /* Can't downgrade the backer of the root FS */
 657                 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
 658                         (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
 659                         error = ENOTSUP;
 660                         goto out1;
 661                 }
 662 #endif /* CONFIG_IMGSRC_ACCESS */
 663
 664                 /*
 665                  * Only root, or the user that did the original mount is
 666                  * permitted to update it.
 667                  */
 668                 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
 669                     (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
 670                         goto out1;
 671                 }
 672 #if CONFIG_MACF
 673                 error = mac_mount_check_remount(ctx, mp);
 674                 if (error != 0) {
 675                         goto out1;
 676                 }
 677 #endif
 678                 /*
 679                  * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
 680                  * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
 681                  */
 682                 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
 683                         flags |= MNT_NOSUID | MNT_NODEV;
 684                         if (mp->mnt_flag & MNT_NOEXEC)
 685                                 flags |= MNT_NOEXEC;
 686                 }
 687                 flag = mp->mnt_flag;
 688
 689
 690
 691                 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
 692
 693                 vfsp = mp->mnt_vtable;
 694                 goto update;
 695         }
 696
 697         /*
 698          * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
 699          * MNT_NOEXEC if mount point is already MNT_NOEXEC.
 700          */
 701         if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
 702                 flags |= MNT_NOSUID | MNT_NODEV;
 703                 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
 704                         flags |= MNT_NOEXEC;
 705         }
 706
 707         /* XXXAUDIT: Should we capture the type on the error path as well? */
 708         AUDIT_ARG(text, fstypename);
 709         mount_list_lock();
 710         for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
 711                 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
 712                         vfsp->vfc_refcount++;
 713                         vfsp_ref = TRUE;
 714                         break;
 715                 }
 716         mount_list_unlock();
 717         if (vfsp == NULL) {
 718                 error = ENODEV;
 719                 goto out1;
 720         }
 721
 722         /*
 723          * VFC_VFSLOCALARGS is not currently supported for kernel mounts
 724          */
 725         if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
 726                 error = EINVAL;  /* unsupported request */
 727                 goto out1;
 728         }
 729
 730         error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
 731         if (error != 0) {
 732                 goto out1;
 733         }
 734
 735         /*
 736          * Allocate and initialize the filesystem (mount_t)
 737          */
 738         MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
 739                 M_MOUNT, M_WAITOK);
 740         bzero((char *)mp, (u_int32_t)sizeof(struct mount));
 741         mntalloc = 1;
 742
 743         /* Initialize the default IO constraints */
 744         mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
 745         mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
 746         mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
 747         mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
 748         mp->mnt_devblocksize = DEV_BSIZE;
 749         mp->mnt_alignmentmask = PAGE_MASK;
 750         mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
 751         mp->mnt_ioscale = 1;
 752         mp->mnt_ioflags = 0;
 753         mp->mnt_realrootvp = NULLVP;
 754         mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
 755
 756         TAILQ_INIT(&mp->mnt_vnodelist);
 757         TAILQ_INIT(&mp->mnt_workerqueue);
 758         TAILQ_INIT(&mp->mnt_newvnodes);
 759         mount_lock_init(mp);
 760         lck_rw_lock_exclusive(&mp->mnt_rwlock);
 761         is_rwlock_locked = TRUE;
 762         mp->mnt_op = vfsp->vfc_vfsops;
 763         mp->mnt_vtable = vfsp;
 764         //mp->mnt_stat.f_type = vfsp->vfc_typenum;
 765         mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 766         strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
 767         strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
 768         mp->mnt_vnodecovered = vp;
 769         mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
 770         mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
 771         mp->mnt_devbsdunit = 0;
 772
 773         /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
 774         vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
 775
 776 #if NFSCLIENT || DEVFS || ROUTEFS
 777         if (kernelmount)
 778                 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
 779         if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
 780                 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
 781 #endif /* NFSCLIENT || DEVFS */
 782
 783 update:
 784
 785         /*
 786          * Set the mount level flags.
 787          */
 788         if (flags & MNT_RDONLY)
 789                 mp->mnt_flag |= MNT_RDONLY;
 790         else if (mp->mnt_flag & MNT_RDONLY) {
 791                 // disallow read/write upgrades of file systems that
 792                 // had the TYPENAME_OVERRIDE feature set.
 793                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
 794                         error = EPERM;
 795                         goto out1;
 796                 }
 797                 mp->mnt_kern_flag |= MNTK_WANTRDWR;
 798         }
 799         mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 800                           MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
 801                           MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
 802                           MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
 803                           MNT_QUARANTINE | MNT_CPROTECT);
 804
 805 #if SECURE_KERNEL
 806 #if !CONFIG_MNT_SUID
 807         /*
 808          * On release builds of iOS based platforms, always enforce NOSUID on
 809          * all mounts. We do this here because we can catch update mounts as well as
 810          * non-update mounts in this case.
 811          */
 812         mp->mnt_flag |= (MNT_NOSUID);
 813 #endif
 814 #endif
 815
 816         mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 817                                  MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
 818                                  MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
 819                                  MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
 820                                  MNT_QUARANTINE | MNT_CPROTECT);
 821
 822 #if CONFIG_MACF
 823         if (flags & MNT_MULTILABEL) {
 824                 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
 825                         error = EINVAL;
 826                         goto out1;
 827                 }
 828                 mp->mnt_flag |= MNT_MULTILABEL;
 829         }
 830 #endif
 831         /*
 832          * Process device path for local file systems if requested
 833          */
 834         if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
 835             !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
 836                 if (vfs_context_is64bit(ctx)) {
 837                         if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
 838                                 goto out1;
 839                         fsmountargs += sizeof(devpath);
 840                 } else {
 841                         user32_addr_t tmp;
 842                         if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
 843                                 goto out1;
 844                         /* munge into LP64 addr */
 845                         devpath = CAST_USER_ADDR_T(tmp);
 846                         fsmountargs += sizeof(tmp);
 847                 }
 848
 849                 /* Lookup device and authorize access to it */
 850                 if ((devpath)) {
 851                         struct nameidata nd;
 852
 853                         NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
 854                         if ( (error = namei(&nd)) )
 855                                 goto out1;
 856
 857                         strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
 858                         devvp = nd.ni_vp;
 859
 860                         nameidone(&nd);
 861
 862                         if (devvp->v_type != VBLK) {
 863                                 error = ENOTBLK;
 864                                 goto out2;
 865                         }
 866                         if (major(devvp->v_rdev) >= nblkdev) {
 867                                 error = ENXIO;
 868                                 goto out2;
 869                         }
 870                         /*
 871                         * If mount by non-root, then verify that user has necessary
 872                         * permissions on the device.
 873                         */
 874                         if (suser(vfs_context_ucred(ctx), NULL) != 0) {
 875                                 mode_t accessmode = KAUTH_VNODE_READ_DATA;
 876
 877                                 if ((mp->mnt_flag & MNT_RDONLY) == 0)
 878                                         accessmode |= KAUTH_VNODE_WRITE_DATA;
 879                                 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
 880                                         goto out2;
 881                         }
 882                 }
 883                 /* On first mount, preflight and open device */
 884                 if (devpath && ((flags & MNT_UPDATE) == 0)) {
 885                         if ( (error = vnode_ref(devvp)) )
 886                                 goto out2;
 887                         /*
 888                         * Disallow multiple mounts of the same device.
 889                         * Disallow mounting of a device that is currently in use
 890                         * (except for root, which might share swap device for miniroot).
 891                         * Flush out any old buffers remaining from a previous use.
 892                         */
 893                         if ( (error = vfs_mountedon(devvp)) )
 894                                 goto out3;
 895
 896                         if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
 897                                 error = EBUSY;
 898                                 goto out3;
 899                         }
 900                         if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
 901                                 error = ENOTBLK;
 902                                 goto out3;
 903                         }
 904                         if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
 905                                 goto out3;
 906
 907                         ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 908 #if CONFIG_MACF
 909                         error = mac_vnode_check_open(ctx,
 910                             devvp,
 911                             ronly ? FREAD : FREAD|FWRITE);
 912                         if (error)
 913                                 goto out3;
 914 #endif /* MAC */
 915                         if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
 916                                 goto out3;
 917
 918                         mp->mnt_devvp = devvp;
 919                         device_vnode = devvp;
 920
 921                 } else if ((mp->mnt_flag & MNT_RDONLY) &&
 922                            (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
 923                            (device_vnode = mp->mnt_devvp)) {
 924                         dev_t dev;
 925                         int maj;
 926                         /*
 927                          * If upgrade to read-write by non-root, then verify
 928                          * that user has necessary permissions on the device.
 929                          */
 930                         vnode_getalways(device_vnode);
 931
 932                         if (suser(vfs_context_ucred(ctx), NULL) &&
 933                             (error = vnode_authorize(device_vnode, NULL,
 934                              KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
 935                              ctx)) != 0) {
 936                                 vnode_put(device_vnode);
 937                                 goto out2;
 938                         }
 939
 940                         /* Tell the device that we're upgrading */
 941                         dev = (dev_t)device_vnode->v_rdev;
 942                         maj = major(dev);
 943
 944                         if ((u_int)maj >= (u_int)nblkdev)
 945                                 panic("Volume mounted on a device with invalid major number.");
 946
 947                         error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
 948                         vnode_put(device_vnode);
 949                         device_vnode = NULLVP;
 950                         if (error != 0) {
 951                                 goto out2;
 952                         }
 953                 }
 954         }
 955 #if CONFIG_MACF
 956         if ((flags & MNT_UPDATE) == 0) {
 957                 mac_mount_label_init(mp);
 958                 mac_mount_label_associate(ctx, mp);
 959         }
 960         if (labelstr) {
 961                 if ((flags & MNT_UPDATE) != 0) {
 962                         error = mac_mount_check_label_update(ctx, mp);
 963                         if (error != 0)
 964                                 goto out3;
 965                 }
 966         }
 967 #endif
 968         /*
 969          * Mount the filesystem.
 970          */
 971         if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
 972                 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
 973                     (caddr_t)fsmountargs, 0, ctx);
 974         } else {
 975                 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
 976         }
 977
 978         if (flags & MNT_UPDATE) {
 979                 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
 980                         mp->mnt_flag &= ~MNT_RDONLY;
 981                 mp->mnt_flag &=~
 982                     (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
 983                 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
 984                 if (error)
 985                         mp->mnt_flag = flag;  /* restore flag value */
 986                 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
 987                 lck_rw_done(&mp->mnt_rwlock);
 988                 is_rwlock_locked = FALSE;
 989                 if (!error)
 990                         enablequotas(mp, ctx);
 991                 goto exit;
 992         }
 993
 994         /*
 995          * Put the new filesystem on the mount list after root.
 996          */
 997         if (error == 0) {
 998                 struct vfs_attr vfsattr;
 999 #if CONFIG_MACF
1000                 if (vfs_flags(mp) & MNT_MULTILABEL) {
1001                         error = VFS_ROOT(mp, &rvp, ctx);
1002                         if (error) {
1003                                 printf("%s() VFS_ROOT returned %d\n", __func__, error);
1004                                 goto out3;
1005                         }
1006                         error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
1007                         /*
1008                          * drop reference provided by VFS_ROOT
1009                          */
1010                         vnode_put(rvp);
1011
1012                         if (error)
1013                                 goto out3;
1014                 }
1015 #endif  /* MAC */
1016
1017                 vnode_lock_spin(vp);
1018                 CLR(vp->v_flag, VMOUNT);
1019                 vp->v_mountedhere = mp;
1020                 vnode_unlock(vp);
1021
1022                 /*
1023                  * taking the name_cache_lock exclusively will
1024                  * insure that everyone is out of the fast path who
1025                  * might be trying to use a now stale copy of
1026                  * vp->v_mountedhere->mnt_realrootvp
1027                  * bumping mount_generation causes the cached values
1028                  * to be invalidated
1029                  */
1030                 name_cache_lock();
1031                 mount_generation++;
1032                 name_cache_unlock();
1033
1034                 error = vnode_ref(vp);
1035                 if (error != 0) {
1036                         goto out4;
1037                 }
1038
1039                 have_usecount = TRUE;
1040
1041                 error = checkdirs(vp, ctx);
1042                 if (error != 0)  {
1043                         /* Unmount the filesystem as cdir/rdirs cannot be updated */
1044                         goto out4;
1045                 }
1046                 /*
1047                  * there is no cleanup code here so I have made it void
1048                  * we need to revisit this
1049                  */
1050                 (void)VFS_START(mp, 0, ctx);
1051
1052                 if (mount_list_add(mp) != 0) {
1053                         /*
1054                          * The system is shutting down trying to umount
1055                          * everything, so fail with a plausible errno.
1056                          */
1057                         error = EBUSY;
1058                         goto out4;
1059                 }
1060                 lck_rw_done(&mp->mnt_rwlock);
1061                 is_rwlock_locked = FALSE;
1062
1063                 /* Check if this mounted file system supports EAs or named streams. */
1064                 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
1065                 VFSATTR_INIT(&vfsattr);
1066                 VFSATTR_WANTED(&vfsattr, f_capabilities);
1067                 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
1068                     vfs_getattr(mp, &vfsattr, ctx) == 0 &&
1069                     VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1070                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1071                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1072                                 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1073                         }
1074 #if NAMEDSTREAMS
1075                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1076                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1077                                 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1078                         }
1079 #endif
1080                         /* Check if this file system supports path from id lookups. */
1081                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1082                             (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1083                                 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1084                         } else if (mp->mnt_flag & MNT_DOVOLFS) {
1085                                 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
1086                                 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
1087                         }
1088
1089                         if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1090                                 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1091                                 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1092                         }
1093                 }
1094                 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1095                         mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1096                 }
1097                 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1098                         mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1099                 }
1100                 /* increment the operations count */
1101                 OSAddAtomic(1, &vfs_nummntops);
1102                 enablequotas(mp, ctx);
1103
1104                 if (device_vnode) {
1105                         device_vnode->v_specflags |= SI_MOUNTEDON;
1106
1107                         /*
1108                          *   cache the IO attributes for the underlying physical media...
1109                          *   an error return indicates the underlying driver doesn't
1110                          *   support all the queries necessary... however, reasonable
1111                          *   defaults will have been set, so no reason to bail or care
1112                          */
1113                         vfs_init_io_attributes(device_vnode, mp);
1114                 }
1115
1116                 /* Now that mount is setup, notify the listeners */
1117                 vfs_notify_mount(pvp);
1118                 IOBSDMountChange(mp, kIOMountChangeMount);
1119
1120         } else {
1121                 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1122                 if (mp->mnt_vnodelist.tqh_first != NULL) {
1123                         panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1124                                         mp->mnt_vtable->vfc_name, error);
1125                 }
1126
1127                 vnode_lock_spin(vp);
1128                 CLR(vp->v_flag, VMOUNT);
1129                 vnode_unlock(vp);
1130                 mount_list_lock();
1131                 mp->mnt_vtable->vfc_refcount--;
1132                 mount_list_unlock();
1133
1134                 if (device_vnode ) {
1135                         vnode_rele(device_vnode);
1136                         VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1137                 }
1138                 lck_rw_done(&mp->mnt_rwlock);
1139                 is_rwlock_locked = FALSE;
1140
1141                 /*
1142                  * if we get here, we have a mount structure that needs to be freed,
1143                  * but since the coveredvp hasn't yet been updated to point at it,
1144                  * no need to worry about other threads holding a crossref on this mp
1145                  * so it's ok to just free it
1146                  */
1147                 mount_lock_destroy(mp);
1148 #if CONFIG_MACF
1149                 mac_mount_label_destroy(mp);
1150 #endif
1151                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1152         }
1153 exit:
1154         /*
1155          * drop I/O count on the device vp if there was one
1156          */
1157         if (devpath && devvp)
1158                 vnode_put(devvp);
1159
1160         return(error);
1161
1162 /* Error condition exits */
1163 out4:
1164         (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1165
1166         /*
1167          * If the mount has been placed on the covered vp,
1168          * it may have been discovered by now, so we have
1169          * to treat this just like an unmount
1170          */
1171         mount_lock_spin(mp);
1172         mp->mnt_lflag |= MNT_LDEAD;
1173         mount_unlock(mp);
1174
1175         if (device_vnode != NULLVP) {
1176                 vnode_rele(device_vnode);
1177                 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1178                        ctx);
1179                 did_rele = TRUE;
1180         }
1181
1182         vnode_lock_spin(vp);
1183
1184         mp->mnt_crossref++;
1185         vp->v_mountedhere = (mount_t) 0;
1186
1187         vnode_unlock(vp);
1188
1189         if (have_usecount) {
1190                 vnode_rele(vp);
1191         }
1192 out3:
1193         if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1194                 vnode_rele(devvp);
1195 out2:
1196         if (devpath && devvp)
1197                 vnode_put(devvp);
1198 out1:
1199         /* Release mnt_rwlock only when it was taken */
1200         if (is_rwlock_locked == TRUE) {
1201                 lck_rw_done(&mp->mnt_rwlock);
1202         }
1203
1204         if (mntalloc) {
1205                 if (mp->mnt_crossref)
1206                         mount_dropcrossref(mp, vp, 0);
1207                 else {
1208                         mount_lock_destroy(mp);
1209 #if CONFIG_MACF
1210                         mac_mount_label_destroy(mp);
1211 #endif
1212                         FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1213                 }
1214         }
1215         if (vfsp_ref) {
1216                 mount_list_lock();
1217                 vfsp->vfc_refcount--;
1218                 mount_list_unlock();
1219         }
1220
1221         return(error);
1222 }
1223
1224 /*
1225  * Flush in-core data, check for competing mount attempts,
1226  * and set VMOUNT
1227  */
1228 int
1229 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1230 {
1231 #if !CONFIG_MACF
1232 #pragma unused(cnp,fsname)
1233 #endif
1234         struct vnode_attr va;
1235         int error;
1236
1237         if (!skip_auth) {
1238                 /*
1239                  * If the user is not root, ensure that they own the directory
1240                  * onto which we are attempting to mount.
1241                  */
1242                 VATTR_INIT(&va);
1243                 VATTR_WANTED(&va, va_uid);
1244                 if ((error = vnode_getattr(vp, &va, ctx)) ||
1245                                 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1246                                  (!vfs_context_issuser(ctx)))) {
1247                         error = EPERM;
1248                         goto out;
1249                 }
1250         }
1251
1252         if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1253                 goto out;
1254
1255         if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1256                 goto out;
1257
1258         if (vp->v_type != VDIR) {
1259                 error = ENOTDIR;
1260                 goto out;
1261         }
1262
1263         if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1264                 error = EBUSY;
1265                 goto out;
1266         }
1267
1268 #if CONFIG_MACF
1269         error = mac_mount_check_mount(ctx, vp,
1270             cnp, fsname);
1271         if (error != 0)
1272                 goto out;
1273 #endif
1274
1275         vnode_lock_spin(vp);
1276         SET(vp->v_flag, VMOUNT);
1277         vnode_unlock(vp);
1278
1279 out:
1280         return error;
1281 }
1282
1283 #if CONFIG_IMGSRC_ACCESS
1284
/*
 * Tracing for the imageboot-source relocation path: forwards its
 * arguments to printf() when DEBUG is set, otherwise expands to an
 * empty statement.
 */
#if DEBUG
#define IMGSRC_DEBUG(...) printf(__VA_ARGS__)
#else
#define IMGSRC_DEBUG(...) do { } while(0)
#endif
1290
1291 static int
1292 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1293 {
1294         struct nameidata nd;
1295         vnode_t vp, realdevvp;
1296         mode_t accessmode;
1297         int error;
1298
1299         NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1300         if ( (error = namei(&nd)) ) {
1301                 IMGSRC_DEBUG("namei() failed with %d\n", error);
1302                 return error;
1303         }
1304
1305         vp = nd.ni_vp;
1306
1307         if (!vnode_isblk(vp)) {
1308                 IMGSRC_DEBUG("Not block device.\n");
1309                 error = ENOTBLK;
1310                 goto out;
1311         }
1312
1313         realdevvp = mp->mnt_devvp;
1314         if (realdevvp == NULLVP) {
1315                 IMGSRC_DEBUG("No device backs the mount.\n");
1316                 error = ENXIO;
1317                 goto out;
1318         }
1319
1320         error = vnode_getwithref(realdevvp);
1321         if (error != 0) {
1322                 IMGSRC_DEBUG("Coudn't get iocount on device.\n");
1323                 goto out;
1324         }
1325
1326         if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1327                 IMGSRC_DEBUG("Wrong dev_t.\n");
1328                 error = ENXIO;
1329                 goto out1;
1330         }
1331
1332         strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1333
1334         /*
1335          * If mount by non-root, then verify that user has necessary
1336          * permissions on the device.
1337          */
1338         if (!vfs_context_issuser(ctx)) {
1339                 accessmode = KAUTH_VNODE_READ_DATA;
1340                 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1341                         accessmode |= KAUTH_VNODE_WRITE_DATA;
1342                 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1343                         IMGSRC_DEBUG("Access denied.\n");
1344                         goto out1;
1345                 }
1346         }
1347
1348         *devvpp = vp;
1349
1350 out1:
1351         vnode_put(realdevvp);
1352 out:
1353         nameidone(&nd);
1354         if (error) {
1355                 vnode_put(vp);
1356         }
1357
1358         return error;
1359 }
1360
1361 /*
1362  * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1363  * and call checkdirs()
1364  */
1365 static int
1366 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1367 {
1368         int error;
1369
1370         mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1371
1372         vnode_lock_spin(vp);
1373         CLR(vp->v_flag, VMOUNT);
1374         vp->v_mountedhere = mp;
1375         vnode_unlock(vp);
1376
1377         /*
1378          * taking the name_cache_lock exclusively will
1379          * insure that everyone is out of the fast path who
1380          * might be trying to use a now stale copy of
1381          * vp->v_mountedhere->mnt_realrootvp
1382          * bumping mount_generation causes the cached values
1383          * to be invalidated
1384          */
1385         name_cache_lock();
1386         mount_generation++;
1387         name_cache_unlock();
1388
1389         error = vnode_ref(vp);
1390         if (error != 0) {
1391                 goto out;
1392         }
1393
1394         error = checkdirs(vp, ctx);
1395         if (error != 0)  {
1396                 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1397                 vnode_rele(vp);
1398                 goto out;
1399         }
1400
1401 out:
1402         if (error != 0) {
1403                 mp->mnt_vnodecovered = NULLVP;
1404         }
1405         return error;
1406 }
1407
/*
 * Detach mp from the covered vnode vp: release one usecount on vp,
 * clear vp->v_mountedhere under the vnode lock, and reset
 * mp->mnt_vnodecovered to NULLVP.
 *
 * NOTE(review): the released usecount is presumably the one taken by
 * place_mount_and_checkdirs() on success -- confirm against callers.
 */
static void
undo_place_on_covered_vp(mount_t mp, vnode_t vp)
{
        /* drop the usecount held on the covered vnode */
        vnode_rele(vp);
        vnode_lock_spin(vp);
        vp->v_mountedhere = (mount_t)NULL;
        vnode_unlock(vp);

        mp->mnt_vnodecovered = NULLVP;
}
1418
1419 static int
1420 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1421 {
1422         int error;
1423
1424         /* unmount in progress return error */
1425         mount_lock_spin(mp);
1426         if (mp->mnt_lflag & MNT_LUNMOUNT) {
1427                 mount_unlock(mp);
1428                 return EBUSY;
1429         }
1430         mount_unlock(mp);
1431         lck_rw_lock_exclusive(&mp->mnt_rwlock);
1432
1433         /*
1434          * We only allow the filesystem to be reloaded if it
1435          * is currently mounted read-only.
1436          */
1437         if ((flags & MNT_RELOAD) &&
1438                         ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1439                 error = ENOTSUP;
1440                 goto out;
1441         }
1442
1443         /*
1444          * Only root, or the user that did the original mount is
1445          * permitted to update it.
1446          */
1447         if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1448                         (!vfs_context_issuser(ctx))) {
1449                 error = EPERM;
1450                 goto out;
1451         }
1452 #if CONFIG_MACF
1453         error = mac_mount_check_remount(ctx, mp);
1454         if (error != 0) {
1455                 goto out;
1456         }
1457 #endif
1458
1459 out:
1460         if (error) {
1461                 lck_rw_done(&mp->mnt_rwlock);
1462         }
1463
1464         return error;
1465 }
1466
/*
 * Finish a mount update by releasing mp->mnt_rwlock, the lock that a
 * successful mount_begin_update() leaves held.
 */
static void
mount_end_update(mount_t mp)
{
        lck_rw_done(&mp->mnt_rwlock);
}
1472
1473 static int
1474 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1475 {
1476         vnode_t vp;
1477
1478         if (height >= MAX_IMAGEBOOT_NESTING) {
1479                 return EINVAL;
1480         }
1481
1482         vp = imgsrc_rootvnodes[height];
1483         if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1484                 *rvpp = vp;
1485                 return 0;
1486         } else {
1487                 return ENOENT;
1488         }
1489 }
1490
1491 static int
1492 relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1493                 const char *fsname, vfs_context_t ctx,
1494                 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1495 {
1496         int error;
1497         mount_t mp;
1498         boolean_t placed = FALSE;
1499         vnode_t devvp = NULLVP;
1500         struct vfstable *vfsp;
1501         user_addr_t devpath;
1502         char *old_mntonname;
1503         vnode_t rvp;
1504         uint32_t height;
1505         uint32_t flags;
1506
1507         /* If we didn't imageboot, nothing to move */
1508         if (imgsrc_rootvnodes[0] == NULLVP) {
1509                 return EINVAL;
1510         }
1511
1512         /* Only root can do this */
1513         if (!vfs_context_issuser(ctx)) {
1514                 return EPERM;
1515         }
1516
1517         IMGSRC_DEBUG("looking for root vnode.\n");
1518
1519         /*
1520          * Get root vnode of filesystem we're moving.
1521          */
1522         if (by_index) {
1523                 if (is64bit) {
1524                         struct user64_mnt_imgsrc_args mia64;
1525                         error = copyin(fsmountargs, &mia64, sizeof(mia64));
1526                         if (error != 0) {
1527                                 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1528                                 return error;
1529                         }
1530
1531                         height = mia64.mi_height;
1532                         flags = mia64.mi_flags;
1533                         devpath = mia64.mi_devpath;
1534                 } else {
1535                         struct user32_mnt_imgsrc_args mia32;
1536                         error = copyin(fsmountargs, &mia32, sizeof(mia32));
1537                         if (error != 0) {
1538                                 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1539                                 return error;
1540                         }
1541
1542                         height = mia32.mi_height;
1543                         flags = mia32.mi_flags;
1544                         devpath = mia32.mi_devpath;
1545                 }
1546         } else {
1547                 /*
1548                  * For binary compatibility--assumes one level of nesting.
1549                  */
1550                 if (is64bit) {
1551                         if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1552                                 return error;
1553                 } else {
1554                         user32_addr_t tmp;
1555                         if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1556                                 return error;
1557
1558                         /* munge into LP64 addr */
1559                         devpath = CAST_USER_ADDR_T(tmp);
1560                 }
1561
1562                 height = 0;
1563                 flags = 0;
1564         }
1565
1566         if (flags != 0) {
1567                 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1568                 return EINVAL;
1569         }
1570
1571         error = get_imgsrc_rootvnode(height, &rvp);
1572         if (error != 0) {
1573                 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1574                 return error;
1575         }
1576
1577         IMGSRC_DEBUG("got root vnode.\n");
1578
1579         MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1580
1581         /* Can only move once */
1582         mp = vnode_mount(rvp);
1583         if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1584                 IMGSRC_DEBUG("Already moved.\n");
1585                 error = EBUSY;
1586                 goto out0;
1587         }
1588
1589         IMGSRC_DEBUG("Starting updated.\n");
1590
1591         /* Get exclusive rwlock on mount, authorize update on mp */
1592         error = mount_begin_update(mp , ctx, 0);
1593         if (error != 0) {
1594                 IMGSRC_DEBUG("Starting updated failed with %d\n", error);
1595                 goto out0;
1596         }
1597
1598         /*
1599          * It can only be moved once.  Flag is set under the rwlock,
1600          * so we're now safe to proceed.
1601          */
1602         if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1603                 IMGSRC_DEBUG("Already moved [2]\n");
1604                 goto out1;
1605         }
1606
1607
1608         IMGSRC_DEBUG("Preparing coveredvp.\n");
1609
1610         /* Mark covered vnode as mount in progress, authorize placing mount on top */
1611         error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1612         if (error != 0) {
1613                 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1614                 goto out1;
1615         }
1616
1617         IMGSRC_DEBUG("Covered vp OK.\n");
1618
1619         /* Sanity check the name caller has provided */
1620         vfsp = mp->mnt_vtable;
1621         if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1622                 IMGSRC_DEBUG("Wrong fs name.\n");
1623                 error = EINVAL;
1624                 goto out2;
1625         }
1626
1627         /* Check the device vnode and update mount-from name, for local filesystems */
1628         if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1629                 IMGSRC_DEBUG("Local, doing device validation.\n");
1630
1631                 if (devpath != USER_ADDR_NULL) {
1632                         error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1633                         if (error) {
1634                                 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1635                                 goto out2;
1636                         }
1637
1638                         vnode_put(devvp);
1639                 }
1640         }
1641
1642         /*
1643          * Place mp on top of vnode, ref the vnode,  call checkdirs(),
1644          * and increment the name cache's mount generation
1645          */
1646
1647         IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1648         error = place_mount_and_checkdirs(mp, vp, ctx);
1649         if (error != 0) {
1650                 goto out2;
1651         }
1652
1653         placed = TRUE;
1654
1655         strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1656         strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1657
1658         /* Forbid future moves */
1659         mount_lock(mp);
1660         mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1661         mount_unlock(mp);
1662
1663         /* Finally, add to mount list, completely ready to go */
1664         if (mount_list_add(mp) != 0) {
1665                 /*
1666                  * The system is shutting down trying to umount
1667                  * everything, so fail with a plausible errno.
1668                  */
1669                 error = EBUSY;
1670                 goto out3;
1671         }
1672
1673         mount_end_update(mp);
1674         vnode_put(rvp);
1675         FREE(old_mntonname, M_TEMP);
1676
1677         vfs_notify_mount(pvp);
1678
1679         return 0;
1680 out3:
1681         strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1682
1683         mount_lock(mp);
1684         mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1685         mount_unlock(mp);
1686
1687 out2:
1688         /*
1689          * Placing the mp on the vnode clears VMOUNT,
1690          * so cleanup is different after that point
1691          */
1692         if (placed) {
1693                 /* Rele the vp, clear VMOUNT and v_mountedhere */
1694                 undo_place_on_covered_vp(mp, vp);
1695         } else {
1696                 vnode_lock_spin(vp);
1697                 CLR(vp->v_flag, VMOUNT);
1698                 vnode_unlock(vp);
1699         }
1700 out1:
1701         mount_end_update(mp);
1702
1703 out0:
1704         vnode_put(rvp);
1705         FREE(old_mntonname, M_TEMP);
1706         return error;
1707 }
1708
1709 #endif /* CONFIG_IMGSRC_ACCESS */
1710
1711 void
1712 enablequotas(struct mount *mp, vfs_context_t ctx)
1713 {
1714         struct nameidata qnd;
1715         int type;
1716         char qfpath[MAXPATHLEN];
1717         const char *qfname = QUOTAFILENAME;
1718         const char *qfopsname = QUOTAOPSNAME;
1719         const char *qfextension[] = INITQFNAMES;
1720
1721         /* XXX Shoulkd be an MNTK_ flag, instead of strncmp()'s */
1722         if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
1723                 return;
1724         }
1725         /*
1726          * Enable filesystem disk quotas if necessary.
1727          * We ignore errors as this should not interfere with final mount
1728          */
1729         for (type=0; type < MAXQUOTAS; type++) {
1730                 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1731                 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1732                        CAST_USER_ADDR_T(qfpath), ctx);
1733                 if (namei(&qnd) != 0)
1734                         continue;           /* option file to trigger quotas is not present */
1735                 vnode_put(qnd.ni_vp);
1736                 nameidone(&qnd);
1737                 snprintf(qfpath, sizeof(qfpath),  "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1738
1739                 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1740         }
1741         return;
1742 }
1743
1744
1745 static int
1746 checkdirs_callback(proc_t p, void * arg)
1747 {
1748         struct cdirargs * cdrp = (struct cdirargs * )arg;
1749         vnode_t olddp = cdrp->olddp;
1750         vnode_t newdp = cdrp->newdp;
1751         struct filedesc *fdp;
1752         vnode_t tvp;
1753         vnode_t fdp_cvp;
1754         vnode_t fdp_rvp;
1755         int cdir_changed = 0;
1756         int rdir_changed = 0;
1757
1758         /*
1759          * XXX Also needs to iterate each thread in the process to see if it
1760          * XXX is using a per-thread current working directory, and, if so,
1761          * XXX update that as well.
1762          */
1763
1764         proc_fdlock(p);
1765         fdp = p->p_fd;
1766         if (fdp == (struct filedesc *)0) {
1767                 proc_fdunlock(p);
1768                 return(PROC_RETURNED);
1769         }
1770         fdp_cvp = fdp->fd_cdir;
1771         fdp_rvp = fdp->fd_rdir;
1772         proc_fdunlock(p);
1773
1774         if (fdp_cvp == olddp) {
1775                 vnode_ref(newdp);
1776                 tvp = fdp->fd_cdir;
1777                 fdp_cvp = newdp;
1778                 cdir_changed = 1;
1779                 vnode_rele(tvp);
1780         }
1781         if (fdp_rvp == olddp) {
1782                 vnode_ref(newdp);
1783                 tvp = fdp->fd_rdir;
1784                 fdp_rvp = newdp;
1785                 rdir_changed = 1;
1786                 vnode_rele(tvp);
1787         }
1788         if (cdir_changed || rdir_changed) {
1789                 proc_fdlock(p);
1790                 fdp->fd_cdir = fdp_cvp;
1791                 fdp->fd_rdir = fdp_rvp;
1792                 proc_fdunlock(p);
1793         }
1794         return(PROC_RETURNED);
1795 }
1796
1797
1798
1799 /*
1800  * Scan all active processes to see if any of them have a current
1801  * or root directory onto which the new filesystem has just been
1802  * mounted. If so, replace them with the new mount point.
1803  */
1804 static int
1805 checkdirs(vnode_t olddp, vfs_context_t ctx)
1806 {
1807         vnode_t newdp;
1808         vnode_t tvp;
1809         int err;
1810         struct cdirargs cdr;
1811
1812         if (olddp->v_usecount == 1)
1813                 return(0);
1814         err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1815
1816         if (err != 0) {
1817 #if DIAGNOSTIC
1818                 panic("mount: lost mount: error %d", err);
1819 #endif
1820                 return(err);
1821         }
1822
1823         cdr.olddp = olddp;
1824         cdr.newdp = newdp;
1825         /* do not block for exec/fork trans as the vp in cwd & rootdir are not changing */
1826         proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1827
1828         if (rootvnode == olddp) {
1829                 vnode_ref(newdp);
1830                 tvp = rootvnode;
1831                 rootvnode = newdp;
1832                 vnode_rele(tvp);
1833         }
1834
1835         vnode_put(newdp);
1836         return(0);
1837 }
1838
1839 /*
1840  * Unmount a file system.
1841  *
1842  * Note: unmount takes a path to the vnode mounted on as argument,
1843  * not special file (as before).
1844  */
1845 /* ARGSUSED */
1846 int
1847 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1848 {
1849         vnode_t vp;
1850         struct mount *mp;
1851         int error;
1852         struct nameidata nd;
1853         vfs_context_t ctx = vfs_context_current();
1854
1855         NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1856                 UIO_USERSPACE, uap->path, ctx);
1857         error = namei(&nd);
1858         if (error)
1859                 return (error);
1860         vp = nd.ni_vp;
1861         mp = vp->v_mount;
1862         nameidone(&nd);
1863
1864 #if CONFIG_MACF
1865         error = mac_mount_check_umount(ctx, mp);
1866         if (error != 0) {
1867                 vnode_put(vp);
1868                 return (error);
1869         }
1870 #endif
1871         /*
1872          * Must be the root of the filesystem
1873          */
1874         if ((vp->v_flag & VROOT) == 0) {
1875                 vnode_put(vp);
1876                 return (EINVAL);
1877         }
1878         mount_ref(mp, 0);
1879         vnode_put(vp);
1880         /* safedounmount consumes the mount ref */
1881         return (safedounmount(mp, uap->flags, ctx));
1882 }
1883
1884 int
1885 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
1886 {
1887         mount_t mp;
1888
1889         mp = mount_list_lookupby_fsid(fsid, 0, 1);
1890         if (mp == (mount_t)0) {
1891                 return(ENOENT);
1892         }
1893         mount_ref(mp, 0);
1894         mount_iterdrop(mp);
1895         /* safedounmount consumes the mount ref */
1896         return(safedounmount(mp, flags, ctx));
1897 }
1898
1899
1900 /*
1901  * The mount struct comes with a mount ref which will be consumed.
1902  * Do the actual file system unmount, prevent some common foot shooting.
1903  */
1904 int
1905 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1906 {
1907         int error;
1908         proc_t p = vfs_context_proc(ctx);
1909
1910         /*
1911          * If the file system is not responding and MNT_NOBLOCK
1912          * is set and not a forced unmount then return EBUSY.
1913          */
1914         if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1915                 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1916                 error = EBUSY;
1917                 goto out;
1918         }
1919
1920         /*
1921          * Skip authorization if the mount is tagged as permissive and
1922          * this is not a forced-unmount attempt.
1923          */
1924         if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1925                 /*
1926                  * Only root, or the user that did the original mount is
1927                  * permitted to unmount this filesystem.
1928                  */
1929                 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1930                                 (error = suser(kauth_cred_get(), &p->p_acflag)))
1931                         goto out;
1932         }
1933         /*
1934          * Don't allow unmounting the root file system.
1935          */
1936         if (mp->mnt_flag & MNT_ROOTFS) {
1937                 error = EBUSY; /* the root is always busy */
1938                 goto out;
1939         }
1940
1941 #ifdef CONFIG_IMGSRC_ACCESS
1942         if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1943                 error = EBUSY;
1944                 goto out;
1945         }
1946 #endif /* CONFIG_IMGSRC_ACCESS */
1947
1948         return (dounmount(mp, flags, 1, ctx));
1949
1950 out:
1951         mount_drop(mp, 0);
1952         return(error);
1953 }
1954
1955 /*
1956  * Do the actual file system unmount.
      *
      * Parameters:
      *   mp       mount to unmount.
      *   flags    unmount flags.  This function itself looks at MNT_FORCE,
      *            MNT_NOBLOCK and MNT_LNOSUB; the whole word is also passed
      *            to VFS_UNMOUNT() and to the submount/trigger helpers.
      *   withref  non-zero when the caller passes in a mount ref; it is
      *            dropped here, either on the early EBUSY return or right
      *            after mnt_rwlock has been taken.
      *   ctx      VFS context of the caller.
      *
      * Returns 0 on success, EBUSY when an unmount of mp is already in
      * progress (MNT_LUNMOUNT set), or the error from VFS_SYNC(), vflush()
      * (non-forced unmounts only) or VFS_UNMOUNT().  On those later failures
      * MNTK_UNMOUNT, MNT_LUNMOUNT and MNT_LFORCE are cleared again.
      *
      * On success the mount structure may be freed before this function
      * returns (through mount_dropcrossref(), or directly for a MNT_ROOTFS
      * mount that has no covered vnode).
1957  */
1958 int
1959 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1960 {
1961         vnode_t coveredvp = (vnode_t)0;
1962         int error;
1963         int needwakeup = 0;
1964         int forcedunmount = 0;
1965         int lflags = 0;
1966         struct vnode *devvp = NULLVP;
1967 #if CONFIG_TRIGGERS
1968         proc_t p = vfs_context_proc(ctx);
1969         int did_vflush = 0;
1970         int pflags_save = 0;
1971 #endif /* CONFIG_TRIGGERS */
1972
1973 #if CONFIG_FSE
1974         if (!(flags & MNT_FORCE)) {
1975                 fsevent_unmount(mp, ctx);  /* has to come first! */
1976         }
1977 #endif
1978
1979         mount_lock(mp);
1980
1981         /*
1982          * If already an unmount in progress just return EBUSY.
1983          * Even a forced unmount cannot override.
1984          */
1985         if (mp->mnt_lflag & MNT_LUNMOUNT) {
                     /*
                      * mount_drop() is called with 1 here, where the mount lock is
                      * held, and with 0 further down, where it is not -- presumably
                      * a "lock already held" flag; confirm against mount_drop().
                      */
1986                 if (withref != 0)
1987                         mount_drop(mp, 1);
1988                 mount_unlock(mp);
1989                 return (EBUSY);
1990         }
1991
1992         if (flags & MNT_FORCE) {
1993                 forcedunmount = 1;
1994                 mp->mnt_lflag |= MNT_LFORCE;
1995         }
1996
1997 #if CONFIG_TRIGGERS
             /*
              * The value returned by OSBitOrAtomic() is saved so that the out:
              * path only clears P_NOREMOTEHANG when the saved word did not
              * already have it set.
              */
1998         if (flags & MNT_NOBLOCK && p != kernproc)
1999                 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
2000 #endif
2001
             /* Mark the unmount as in progress; undone on every failure path below. */
2002         mp->mnt_kern_flag |= MNTK_UNMOUNT;
2003         mp->mnt_lflag |= MNT_LUNMOUNT;
2004         mp->mnt_flag &=~ MNT_ASYNC;
2005         /*
2006          * anyone currently in the fast path that
2007          * trips over the cached rootvp will be
2008          * dumped out and forced into the slow path
2009          * to regenerate a new cached value
2010          */
2011         mp->mnt_realrootvp = NULLVP;
2012         mount_unlock(mp);
2013
2014         if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
2015                 /*
2016                  * Force unmount any mounts in this filesystem.
2017                  * If any unmounts fail - just leave them dangling.
2018                  * Avoids recursion.
2019                  */
2020                 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
2021         }
2022
2023         /*
2024          * taking the name_cache_lock exclusively will
2025          * ensure that everyone is out of the fast path who
2026          * might be trying to use a now stale copy of
2027          * vp->v_mountedhere->mnt_realrootvp
2028          * bumping mount_generation causes the cached values
2029          * to be invalidated
2030          */
2031         name_cache_lock();
2032         mount_generation++;
2033         name_cache_unlock();
2034
2035
             /* Held exclusive from here until the lck_rw_done() after out:. */
2036         lck_rw_lock_exclusive(&mp->mnt_rwlock);
2037         if (withref != 0)
2038                 mount_drop(mp, 0);
2039         error = 0;
2040         if (forcedunmount == 0) {
2041                 ubc_umount(mp); /* release cached vnodes */
2042                 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2043                         error = VFS_SYNC(mp, MNT_WAIT, ctx);
2044                         if (error) {
                                     /* Abort: out: is entered with the mount lock held. */
2045                                 mount_lock(mp);
2046                                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2047                                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2048                                 mp->mnt_lflag &= ~MNT_LFORCE;
2049                                 goto out;
2050                         }
2051                 }
2052         }
2053
2054         /* free disk_conditioner_info structure for this mount */
2055         disk_conditioner_unmount(mp);
2056
2057         IOBSDMountChange(mp, kIOMountChangeUnmount);
2058
2059 #if CONFIG_TRIGGERS
2060         vfs_nested_trigger_unmounts(mp, flags, ctx);
             /*
              * Set before vflush() runs: any failure from here on makes the
              * out: path issue the VTC_REPLACE trigger callback.
              */
2061         did_vflush = 1;
2062 #endif
2063         if (forcedunmount)
2064                 lflags |= FORCECLOSE;
2065         error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM  | SKIPROOT | lflags);
             /*
              * A vflush() failure only aborts a non-forced unmount; for a forced
              * one the value is overwritten by VFS_UNMOUNT() below.
              */
2066         if ((forcedunmount == 0) && error) {
2067                 mount_lock(mp);
2068                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2069                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2070                 mp->mnt_lflag &= ~MNT_LFORCE;
2071                 goto out;
2072         }
2073
2074         /* make sure there is no one in the mount iterations or lookup */
2075         mount_iterdrain(mp);
2076
2077         error = VFS_UNMOUNT(mp, flags, ctx);
2078         if (error) {
                     /* Undo the drain above and the in-progress marks, then bail out. */
2079                 mount_iterreset(mp);
2080                 mount_lock(mp);
2081                 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
2082                 mp->mnt_lflag &= ~MNT_LUNMOUNT;
2083                 mp->mnt_lflag &= ~MNT_LFORCE;
2084                 goto out;
2085         }
2086
2087         /* increment the operations count */
             /* (error is always 0 here: the failure case jumped to out above) */
2088         if (!error)
2089                 OSAddAtomic(1, &vfs_nummntops);
2090
2091         if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
2092                 /* hold an io reference and drop the usecount before close */
2093                 devvp = mp->mnt_devvp;
2094                 vnode_getalways(devvp);
2095                 vnode_rele(devvp);
2096                 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
2097                        ctx);
2098                 vnode_clearmountedon(devvp);
2099                 vnode_put(devvp);
2100         }
             /*
              * mnt_rwlock is released around mount_list_remove() and taken
              * again afterwards.
              * NOTE(review): presumably for lock ordering -- confirm.
              */
2101         lck_rw_done(&mp->mnt_rwlock);
2102         mount_list_remove(mp);
2103         lck_rw_lock_exclusive(&mp->mnt_rwlock);
2104
2105         /* mark the mount point hook in the vp but not drop the ref yet */
2106         if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2107                 /*
2108                  * The covered vnode needs special handling. Trying to get an
2109                  * iocount must not block here as this may lead to deadlocks
2110                  * if the Filesystem to which the covered vnode belongs is
2111                  * undergoing forced unmounts. Since we hold a usecount, the
2112                  * vnode cannot be reused (it can, however, still be terminated)
2113                  */
2114                 vnode_getalways(coveredvp);
2115                 vnode_lock_spin(coveredvp);
2116
                     /*
                      * This cross reference is dropped again by the
                      * mount_dropcrossref() call near the end of this function.
                      */
2117                 mp->mnt_crossref++;
2118                 coveredvp->v_mountedhere = (struct mount *)0;
2119                 CLR(coveredvp->v_flag, VMOUNT);
2120
2121                 vnode_unlock(coveredvp);
2122                 vnode_put(coveredvp);
2123         }
2124
2125         mount_list_lock();
2126         mp->mnt_vtable->vfc_refcount--;
2127         mount_list_unlock();
2128
2129         cache_purgevfs(mp);     /* remove cache entries for this file sys */
2130         vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2131         mount_lock(mp);
2132         mp->mnt_lflag |= MNT_LDEAD;
2133
2134         if (mp->mnt_lflag & MNT_LWAIT) {
2135                 /*
2136                  * do the wakeup here
2137                  * in case we block in mount_refdrain
2138                  * which will drop the mount lock
2139                  * and allow anyone blocked in vfs_busy
2140                  * to wakeup and see the LDEAD state
2141                  */
2142                 mp->mnt_lflag &= ~MNT_LWAIT;
2143                 wakeup((caddr_t)mp);
2144         }
2145         mount_refdrain(mp);
             /*
              * Common exit.  Every path that reaches this label has taken the
              * mount lock and still holds mnt_rwlock exclusive; 'error' tells
              * success (0) from an aborted unmount.
              */
2146 out:
2147         if (mp->mnt_lflag & MNT_LWAIT) {
2148                 mp->mnt_lflag &= ~MNT_LWAIT;
                     /* the wakeup itself is deferred until the locks are released */
2149                 needwakeup = 1;
2150         }
2151
2152 #if CONFIG_TRIGGERS
2153         if (flags & MNT_NOBLOCK && p != kernproc) {
2154                 // Restore P_NOREMOTEHANG bit to its previous value
2155                 if ((pflags_save & P_NOREMOTEHANG) == 0)
2156                         OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2157         }
2158
2159         /*
2160          * Callback and context are set together under the mount lock, and
2161          * never cleared, so we're safe to examine them here, drop the lock,
2162          * and call out.
2163          */
2164         if (mp->mnt_triggercallback != NULL) {
2165                 mount_unlock(mp);
                     /*
                      * VTC_RELEASE on success; VTC_REPLACE when the unmount failed
                      * after the vflush stage had been reached; no callback when it
                      * failed earlier.
                      */
2166                 if (error == 0) {
2167                         mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2168                 } else if (did_vflush) {
2169                         mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2170                 }
2171         } else {
2172                 mount_unlock(mp);
2173         }
2174 #else
2175         mount_unlock(mp);
2176 #endif /* CONFIG_TRIGGERS */
2177
2178         lck_rw_done(&mp->mnt_rwlock);
2179
2180         if (needwakeup)
2181                 wakeup((caddr_t)mp);
2182
2183         if (!error) {
2184                 if ((coveredvp != NULLVP)) {
2185                         vnode_t pvp = NULLVP;
2186
2187                         /*
2188                          * The covered vnode needs special handling. Trying to
2189                          * get an iocount must not block here as this may lead
2190                          * to deadlocks if the Filesystem to which the covered
2191                          * vnode belongs is undergoing forced unmounts. Since we
2192                          * hold a usecount, the  vnode cannot be reused
2193                          * (it can, however, still be terminated).
2194                          */
2195                         vnode_getalways(coveredvp);
2196
                             /* may free mp (see mount_dropcrossref()); mp is not used below */
2197                         mount_dropcrossref(mp, coveredvp, 0);
2198                         /*
2199                          * We'll _try_ to detect if this really needs to be
2200                          * done. The coveredvp can only be in termination (or
2201                          * terminated) if the coveredvp's mount point is in a
2202                          * forced unmount (or has been) since we still hold the
2203                          * ref.
2204                          */
2205                         if (!vnode_isrecycled(coveredvp)) {
2206                                 pvp = vnode_getparent(coveredvp);
2207 #if CONFIG_TRIGGERS
2208                                 if (coveredvp->v_resolve) {
2209                                         vnode_trigger_rearm(coveredvp, ctx);
2210                                 }
2211 #endif
2212                         }
2213
                             /* release the usecount mentioned above, then our iocount */
2214                         vnode_rele(coveredvp);
2215                         vnode_put(coveredvp);
2216                         coveredvp = NULLVP;
2217
2218                         if (pvp) {
                                     /* post NOTE_WRITE on the parent of the former mount point */
2219                                 lock_vnode_and_post(pvp, NOTE_WRITE);
2220                                 vnode_put(pvp);
2221                         }
2222                 } else if (mp->mnt_flag & MNT_ROOTFS) {
                                     /* no covered vnode: destroy and free the mount structure here */
2223                                 mount_lock_destroy(mp);
2224 #if CONFIG_MACF
2225                                 mac_mount_label_destroy(mp);
2226 #endif
2227                                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2228                 } else
2229                         panic("dounmount: no coveredvp");
2230         }
2231         return (error);
2232 }
2233
2234 /*
2235  * Unmount any mounts in this filesystem.
2236  */
2237 void
2238 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2239 {
2240         mount_t smp;
2241         fsid_t *fsids, fsid;
2242         int fsids_sz;
2243         int count = 0, i, m = 0;
2244         vnode_t vp;
2245
2246         mount_list_lock();
2247
2248         // Get an array to hold the submounts fsids.
2249         TAILQ_FOREACH(smp, &mountlist, mnt_list)
2250                 count++;
2251         fsids_sz = count * sizeof(fsid_t);
2252         MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2253         if (fsids == NULL) {
2254                 mount_list_unlock();
2255                 goto out;
2256         }
2257         fsids[0] = mp->mnt_vfsstat.f_fsid;      // Prime the pump
2258
2259         /*
2260          * Fill the array with submount fsids.
2261          * Since mounts are always added to the tail of the mount list, the
2262          * list is always in mount order.
2263          * For each mount check if the mounted-on vnode belongs to a
2264          * mount that's already added to our array of mounts to be unmounted.
2265          */
2266         for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2267                 vp = smp->mnt_vnodecovered;
2268                 if (vp == NULL)
2269                         continue;
2270                 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid;     // Underlying fsid
2271                 for (i = 0; i <= m; i++) {
2272                         if (fsids[i].val[0] == fsid.val[0] &&
2273                             fsids[i].val[1] == fsid.val[1]) {
2274                                 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2275                                 break;
2276                         }
2277                 }
2278         }
2279         mount_list_unlock();
2280
2281         // Unmount the submounts in reverse order. Ignore errors.
2282         for (i = m; i > 0; i--) {
2283                 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2284                 if (smp) {
2285                         mount_ref(smp, 0);
2286                         mount_iterdrop(smp);
2287                         (void) dounmount(smp, flags, 1, ctx);
2288                 }
2289         }
2290 out:
2291         if (fsids)
2292                 FREE(fsids, M_TEMP);
2293 }
2294
2295 void
2296 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2297 {
2298         vnode_lock(dp);
2299         mp->mnt_crossref--;
2300
2301         if (mp->mnt_crossref < 0)
2302                 panic("mount cross refs -ve");
2303
2304         if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2305
2306                 if (need_put)
2307                         vnode_put_locked(dp);
2308                 vnode_unlock(dp);
2309
2310                 mount_lock_destroy(mp);
2311 #if CONFIG_MACF
2312                 mac_mount_label_destroy(mp);
2313 #endif
2314                 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2315                 return;
2316         }
2317         if (need_put)
2318                 vnode_put_locked(dp);
2319         vnode_unlock(dp);
2320 }
2321
2322
2323 /*
2324  * Sync each mounted filesystem.
2325  */
2326 #if DIAGNOSTIC
2327 int syncprt = 0;        /* when non-zero, sync() and sync_thread() call vfs_bufstats() */
2328 #endif
2329
2330 int print_vmpage_stat=0;        /* when non-zero, sync() and sync_thread() call vm_countdirtypages() */
2331
2332 static int
2333 sync_callback(mount_t mp, __unused void *arg)
2334 {
2335         if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2336                 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2337
2338                 mp->mnt_flag &= ~MNT_ASYNC;
2339                 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2340                 if (asyncflag)
2341                         mp->mnt_flag |= MNT_ASYNC;
2342         }
2343
2344         return (VFS_RETURNED);
2345 }
2346
2347 /* ARGSUSED */
2348 int
2349 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2350 {
2351         vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2352
2353         if (print_vmpage_stat) {
2354                 vm_countdirtypages();
2355         }
2356
2357 #if DIAGNOSTIC
2358         if (syncprt)
2359                 vfs_bufstats();
2360 #endif /* DIAGNOSTIC */
2361         return 0;
2362 }
2363
/*
 * Selects which mounts sync_internal_callback() acts on.
 * The numeric values are 0, 1 and 2, in declaration order.
 */
typedef enum {
        SYNC_ALL,                       /* 0 */
        SYNC_ONLY_RELIABLE_MEDIA,       /* 1: skip mounts not classed as reliable */
        SYNC_ONLY_UNRELIABLE_MEDIA      /* 2: skip mounts classed as reliable */
} sync_type_t;
2369
2370 static int
2371 sync_internal_callback(mount_t mp, void *arg)
2372 {
2373         if (arg) {
2374                 int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
2375                                    (mp->mnt_flag & MNT_LOCAL);
2376                 sync_type_t sync_type = *((sync_type_t *)arg);
2377
2378                 if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable)
2379                         return (VFS_RETURNED);
2380                 else if ((sync_type = SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable)
2381                         return (VFS_RETURNED);
2382         }
2383
2384         (void)sync_callback(mp, NULL);
2385
2386         return (VFS_RETURNED);
2387 }
2388
2389 int sync_thread_state = 0;      /* SYNC_THREAD_* bits; sync_thread() and sync_internal() modify it with sync_mtx_lck held */
2390 int sync_timeout_seconds = 5;   /* seconds; used as the msleep() timeout in sync_internal() */
2391
2392 #define SYNC_THREAD_RUN       0x0001    /* set by sync_internal() to request a pass; cleared by sync_thread() when it starts one */
2393 #define SYNC_THREAD_RUNNING   0x0002    /* set by sync_internal() before it starts sync_thread(); cleared by the thread before it exits */
2394
2395 static void
2396 sync_thread(__unused void *arg, __unused wait_result_t wr)
2397 {
2398         sync_type_t sync_type;
2399
2400         lck_mtx_lock(sync_mtx_lck);
2401         while (sync_thread_state & SYNC_THREAD_RUN) {
2402                 sync_thread_state &= ~SYNC_THREAD_RUN;
2403                 lck_mtx_unlock(sync_mtx_lck);
2404
2405                 sync_type = SYNC_ONLY_RELIABLE_MEDIA;
2406                 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2407                 sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
2408                 vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
2409
2410                 lck_mtx_lock(sync_mtx_lck);
2411         }
2412         /*
2413          * This wakeup _has_ to be issued before the lock is released otherwise
2414          * we may end up waking up a thread in sync_internal which is
2415          * expecting a wakeup from a thread it just created and not from this
2416          * thread which is about to exit.
2417          */
2418         wakeup(&sync_thread_state);
2419         sync_thread_state &= ~SYNC_THREAD_RUNNING;
2420         lck_mtx_unlock(sync_mtx_lck);
2421
2422         if (print_vmpage_stat) {
2423                 vm_countdirtypages();
2424         }
2425
2426 #if DIAGNOSTIC
2427         if (syncprt)
2428                 vfs_bufstats();
2429 #endif /* DIAGNOSTIC */
2430 }
2431
2432 struct timeval sync_timeout_last_print = {0, 0};        /* when sync_internal() last printed "sync timed out"; it prints again only after more than 120 seconds */
2433
2434 /*
2435  * An in-kernel sync for power management to call.
2436  * This function always returns within sync_timeout seconds.
2437  */
2438 __private_extern__ int
2439 sync_internal(void)
2440 {
2441         thread_t thd;
2442         int error;
2443         int thread_created = FALSE;
2444         struct timespec ts = {sync_timeout_seconds, 0};
2445
2446         lck_mtx_lock(sync_mtx_lck);
2447         sync_thread_state |= SYNC_THREAD_RUN;
2448         if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
2449                 int kr;
2450
2451                 sync_thread_state |= SYNC_THREAD_RUNNING;
2452                 kr = kernel_thread_start(sync_thread, NULL, &thd);
2453                 if (kr != KERN_SUCCESS) {
2454                         sync_thread_state &= ~SYNC_THREAD_RUNNING;
2455                         lck_mtx_unlock(sync_mtx_lck);
2456                         printf("sync_thread failed\n");
2457                         return (0);
2458                 }
2459                 thread_created = TRUE;
2460         }
2461
2462         error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
2463             (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2464         if (error) {
2465                 struct timeval now;
2466
2467                 microtime(&now);
2468                 if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
2469                         printf("sync timed out: %d sec\n", sync_timeout_seconds);
2470                         sync_timeout_last_print.tv_sec = now.tv_sec;
2471                 }
2472         }
2473
2474         if (thread_created)
2475                 thread_deallocate(thd);
2476
2477         return (0);
2478 } /* end of sync_internal call */
2479
2480 /*
2481  * Change filesystem quotas.
2482  */
2483 #if QUOTA
2484 int
2485 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2486 {
2487         struct mount *mp;
2488         int error, quota_cmd, quota_status = 0;
2489         caddr_t datap;
2490         size_t fnamelen;
2491         struct nameidata nd;
2492         vfs_context_t ctx = vfs_context_current();
2493         struct dqblk my_dqblk = {};
2494
2495         AUDIT_ARG(uid, uap->uid);
2496         AUDIT_ARG(cmd, uap->cmd);
2497         NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2498                uap->path, ctx);
2499         error = namei(&nd);
2500         if (error)
2501                 return (error);
2502         mp = nd.ni_vp->v_mount;
2503         vnode_put(nd.ni_vp);
2504         nameidone(&nd);
2505
2506         /* copyin any data we will need for downstream code */
2507         quota_cmd = uap->cmd >> SUBCMDSHIFT;
2508
2509         switch (quota_cmd) {
2510         case Q_QUOTAON:
2511                 /* uap->arg specifies a file from which to take the quotas */
2512                 fnamelen = MAXPATHLEN;
2513                 datap = kalloc(MAXPATHLEN);
2514                 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2515                 break;
2516         case Q_GETQUOTA:
2517                 /* uap->arg is a pointer to a dqblk structure. */
2518                 datap = (caddr_t) &my_dqblk;
2519                 break;
2520         case Q_SETQUOTA:
2521         case Q_SETUSE:
2522                 /* uap->arg is a pointer to a dqblk structure. */
2523                 datap = (caddr_t) &my_dqblk;
2524                 if (proc_is64bit(p)) {
2525                         struct user_dqblk       my_dqblk64;
2526                         error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
2527                         if (error == 0) {
2528                                 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2529                         }
2530                 }
2531                 else {
2532                         error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
2533                 }
2534                 break;
2535         case Q_QUOTASTAT:
2536                 /* uap->arg is a pointer to an integer */
2537                 datap = (caddr_t) &quota_status;
2538                 break;
2539         default:
2540                 datap = NULL;
2541                 break;
2542         } /* switch */
2543
2544         if (error == 0) {
2545                 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2546         }
2547
2548         switch (quota_cmd) {
2549         case Q_QUOTAON:
2550                 if (datap != NULL)
2551                         kfree(datap, MAXPATHLEN);
2552                 break;
2553         case Q_GETQUOTA:
2554                 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2555                 if (error == 0) {
2556                         if (proc_is64bit(p)) {
2557                                 struct user_dqblk       my_dqblk64;
2558
2559                                 memset(&my_dqblk64, 0, sizeof(my_dqblk64));
2560                                 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2561                                 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
2562                         }
2563                         else {
2564                                 error = copyout(datap, uap->arg, sizeof (struct dqblk));
2565                         }
2566                 }
2567                 break;
2568         case Q_QUOTASTAT:
2569                 /* uap->arg is a pointer to an integer */
2570                 if (error == 0) {
2571                         error = copyout(datap, uap->arg, sizeof(quota_status));
2572                 }
2573                 break;
2574         default:
2575                 break;
2576         } /* switch */
2577
2578         return (error);
2579 }
2580 #else
2581 int
2582 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2583 {
2584         return (EOPNOTSUPP);
2585 }
2586 #endif /* QUOTA */
2587
2588 /*
2589  * Get filesystem statistics.
2590  *
2591  * Returns:     0                       Success
2592  *      namei:???
2593  *      vfs_update_vfsstat:???
2594  *      munge_statfs:EFAULT
2595  */
2596 /* ARGSUSED */
2597 int
2598 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2599 {
2600         struct mount *mp;
2601         struct vfsstatfs *sp;
2602         int error;
2603         struct nameidata nd;
2604         vfs_context_t ctx = vfs_context_current();
2605         vnode_t vp;
2606
2607         NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2608                 UIO_USERSPACE, uap->path, ctx);
2609         error = namei(&nd);
2610         if (error != 0)
2611                 return (error);
2612         vp = nd.ni_vp;
2613         mp = vp->v_mount;
2614         sp = &mp->mnt_vfsstat;
2615         nameidone(&nd);
2616
2617 #if CONFIG_MACF
2618         error = mac_mount_check_stat(ctx, mp);
2619         if (error != 0)
2620                 return (error);
2621 #endif
2622
2623         error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2624         if (error != 0) {
2625                 vnode_put(vp);
2626                 return (error);
2627         }
2628
2629         error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2630         vnode_put(vp);
2631         return (error);
2632 }
2633
2634 /*
2635  * Get filesystem statistics.
2636  */
2637 /* ARGSUSED */
2638 int
2639 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2640 {
2641         vnode_t vp;
2642         struct mount *mp;
2643         struct vfsstatfs *sp;
2644         int error;
2645
2646         AUDIT_ARG(fd, uap->fd);
2647
2648         if ( (error = file_vnode(uap->fd, &vp)) )
2649                 return (error);
2650
2651         error = vnode_getwithref(vp);
2652         if (error) {
2653                 file_drop(uap->fd);
2654                 return (error);
2655         }
2656
2657         AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2658
2659         mp = vp->v_mount;
2660         if (!mp) {
2661                 error = EBADF;
2662                 goto out;
2663         }
2664
2665 #if CONFIG_MACF
2666         error = mac_mount_check_stat(vfs_context_current(), mp);
2667         if (error != 0)
2668                 goto out;
2669 #endif
2670
2671         sp = &mp->mnt_vfsstat;
2672         if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2673                 goto out;
2674         }
2675
2676         error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2677
2678 out:
2679         file_drop(uap->fd);
2680         vnode_put(vp);
2681
2682         return (error);
2683 }
2684
2685 /*
2686  * Common routine to handle copying of statfs64 data to user space
2687  */
2688 static int
2689 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2690 {
2691         int error;
2692         struct statfs64 sfs;
2693
2694         bzero(&sfs, sizeof(sfs));
2695
2696         sfs.f_bsize = sfsp->f_bsize;
2697         sfs.f_iosize = (int32_t)sfsp->f_iosize;
2698         sfs.f_blocks = sfsp->f_blocks;
2699         sfs.f_bfree = sfsp->f_bfree;
2700         sfs.f_bavail = sfsp->f_bavail;
2701         sfs.f_files = sfsp->f_files;
2702         sfs.f_ffree = sfsp->f_ffree;
2703         sfs.f_fsid = sfsp->f_fsid;
2704         sfs.f_owner = sfsp->f_owner;
2705         sfs.f_type = mp->mnt_vtable->vfc_typenum;
2706         sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2707         sfs.f_fssubtype = sfsp->f_fssubtype;
2708         if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2709                 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2710         } else {
2711                 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2712         }
2713         strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2714         strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2715
2716         error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2717
2718         return(error);
2719 }
2720
2721 /*
2722  * Get file system statistics in 64-bit mode
2723  */
2724 int
2725 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2726 {
2727         struct mount *mp;
2728         struct vfsstatfs *sp;
2729         int error;
2730         struct nameidata nd;
2731         vfs_context_t ctxp = vfs_context_current();
2732         vnode_t vp;
2733
2734         NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2735                 UIO_USERSPACE, uap->path, ctxp);
2736         error = namei(&nd);
2737         if (error != 0)
2738                 return (error);
2739         vp = nd.ni_vp;
2740         mp = vp->v_mount;
2741         sp = &mp->mnt_vfsstat;
2742         nameidone(&nd);
2743
2744 #if CONFIG_MACF
2745         error = mac_mount_check_stat(ctxp, mp);
2746         if (error != 0)
2747                 return (error);
2748 #endif
2749
2750         error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2751         if (error != 0) {
2752                 vnode_put(vp);
2753                 return (error);
2754         }
2755
2756         error = statfs64_common(mp, sp, uap->buf);
2757         vnode_put(vp);
2758
2759         return (error);
2760 }
2761
2762 /*
2763  * Get file system statistics in 64-bit mode
2764  */
2765 int
2766 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2767 {
2768         struct vnode *vp;
2769         struct mount *mp;
2770         struct vfsstatfs *sp;
2771         int error;
2772
2773         AUDIT_ARG(fd, uap->fd);
2774
2775         if ( (error = file_vnode(uap->fd, &vp)) )
2776                 return (error);
2777
2778         error = vnode_getwithref(vp);
2779         if (error) {
2780                 file_drop(uap->fd);
2781                 return (error);
2782         }
2783
2784         AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2785
2786         mp = vp->v_mount;
2787         if (!mp) {
2788                 error = EBADF;
2789                 goto out;
2790         }
2791
2792 #if CONFIG_MACF
2793         error = mac_mount_check_stat(vfs_context_current(), mp);
2794         if (error != 0)
2795                 goto out;
2796 #endif
2797
2798         sp = &mp->mnt_vfsstat;
2799         if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2800                 goto out;
2801         }
2802
2803         error = statfs64_common(mp, sp, uap->buf);
2804
2805 out:
2806         file_drop(uap->fd);
2807         vnode_put(vp);
2808
2809         return (error);
2810 }
2811
2812 struct getfsstat_struct {
2813         user_addr_t     sfsp;
2814         user_addr_t     *mp;
2815         int             count;
2816         int             maxcount;
2817         int             flags;
2818         int             error;
2819 };
2820
2821
2822 static int
2823 getfsstat_callback(mount_t mp, void * arg)
2824 {
2825
2826         struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2827         struct vfsstatfs *sp;
2828         int error, my_size;
2829         vfs_context_t ctx = vfs_context_current();
2830
2831         if (fstp->sfsp && fstp->count < fstp->maxcount) {
2832 #if CONFIG_MACF
2833                 error = mac_mount_check_stat(ctx, mp);
2834                 if (error != 0) {
2835                         fstp->error = error;
2836                         return(VFS_RETURNED_DONE);
2837                 }
2838 #endif
2839                 sp = &mp->mnt_vfsstat;
2840                 /*
2841                  * If MNT_NOWAIT is specified, do not refresh the
2842                  * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
2843                  */
2844                 if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2845                         (error = vfs_update_vfsstat(mp, ctx,
2846                             VFS_USER_EVENT))) {
2847                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2848                         return(VFS_RETURNED);
2849                 }
2850
2851                 /*
2852                  * Need to handle LP64 version of struct statfs
2853                  */
2854                 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2855                 if (error) {
2856                         fstp->error = error;
2857                         return(VFS_RETURNED_DONE);
2858                 }
2859                 fstp->sfsp += my_size;
2860
2861                 if (fstp->mp) {
2862 #if CONFIG_MACF
2863                         error = mac_mount_label_get(mp, *fstp->mp);
2864                         if (error) {
2865                                 fstp->error = error;
2866                                 return(VFS_RETURNED_DONE);
2867                         }
2868 #endif
2869                         fstp->mp++;
2870                 }
2871         }
2872         fstp->count++;
2873         return(VFS_RETURNED);
2874 }
2875
2876 /*
2877  * Get statistics on all filesystems.
2878  */
2879 int
2880 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2881 {
2882         struct __mac_getfsstat_args muap;
2883
2884         muap.buf = uap->buf;
2885         muap.bufsize = uap->bufsize;
2886         muap.mac = USER_ADDR_NULL;
2887         muap.macsize = 0;
2888         muap.flags = uap->flags;
2889
2890         return (__mac_getfsstat(p, &muap, retval));
2891 }
2892
2893 /*
2894  * __mac_getfsstat: Get MAC-related file system statistics
2895  *
2896  * Parameters:    p                        (ignored)
2897  *                uap                      User argument descriptor (see below)
2898  *                retval                   Count of file system statistics (N stats)
2899  *
2900  * Indirect:      uap->bufsize             Buffer size
2901  *                uap->macsize             MAC info size
2902  *                uap->buf                 Buffer where information will be returned
2903  *                uap->mac                 MAC info
2904  *                uap->flags               File system flags
2905  *
2906  *
2907  * Returns:        0                       Success
2908  *                !0                       Not success
2909  *
2910  */
2911 int
2912 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2913 {
2914         user_addr_t sfsp;
2915         user_addr_t *mp;
2916         size_t count, maxcount, bufsize, macsize;
2917         struct getfsstat_struct fst;
2918
2919         bufsize = (size_t) uap->bufsize;
2920         macsize = (size_t) uap->macsize;
2921
2922         if (IS_64BIT_PROCESS(p)) {
2923                 maxcount = bufsize / sizeof(struct user64_statfs);
2924         }
2925         else {
2926                 maxcount = bufsize / sizeof(struct user32_statfs);
2927         }
2928         sfsp = uap->buf;
2929         count = 0;
2930
2931         mp = NULL;
2932
2933 #if CONFIG_MACF
2934         if (uap->mac != USER_ADDR_NULL) {
2935                 u_int32_t *mp0;
2936                 int error;
2937                 unsigned int i;
2938
2939                 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2940                 if (count != maxcount)
2941                         return (EINVAL);
2942
2943                 /* Copy in the array */
2944                 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2945                 if (mp0 == NULL) {
2946                         return (ENOMEM);
2947                 }
2948
2949                 error = copyin(uap->mac, mp0, macsize);
2950                 if (error) {
2951                         FREE(mp0, M_MACTEMP);
2952                         return (error);
2953                 }
2954
2955                 /* Normalize to an array of user_addr_t */
2956                 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2957                 if (mp == NULL) {
2958                         FREE(mp0, M_MACTEMP);
2959                         return (ENOMEM);
2960                 }
2961
2962                 for (i = 0; i < count; i++) {
2963                         if (IS_64BIT_PROCESS(p))
2964                                 mp[i] = ((user_addr_t *)mp0)[i];
2965                         else
2966                                 mp[i] = (user_addr_t)mp0[i];
2967                 }
2968                 FREE(mp0, M_MACTEMP);
2969         }
2970 #endif
2971
2972
2973         fst.sfsp = sfsp;
2974         fst.mp = mp;
2975         fst.flags = uap->flags;
2976         fst.count = 0;
2977         fst.error = 0;
2978         fst.maxcount = maxcount;
2979
2980
2981         vfs_iterate(0, getfsstat_callback, &fst);
2982
2983         if (mp)
2984                 FREE(mp, M_MACTEMP);
2985
2986         if (fst.error ) {
2987                 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2988                 return(fst.error);
2989         }
2990
2991         if (fst.sfsp && fst.count > fst.maxcount)
2992                 *retval = fst.maxcount;
2993         else
2994                 *retval = fst.count;
2995         return (0);
2996 }
2997
2998 static int
2999 getfsstat64_callback(mount_t mp, void * arg)
3000 {
3001         struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
3002         struct vfsstatfs *sp;
3003         int error;
3004
3005         if (fstp->sfsp && fstp->count < fstp->maxcount) {
3006 #if CONFIG_MACF
3007                 error = mac_mount_check_stat(vfs_context_current(), mp);
3008                 if (error != 0) {
3009                         fstp->error = error;
3010                         return(VFS_RETURNED_DONE);
3011                 }
3012 #endif
3013                 sp = &mp->mnt_vfsstat;
3014                 /*
3015                  * If MNT_NOWAIT is specified, do not refresh the fsstat
3016                  * cache. MNT_WAIT overrides MNT_NOWAIT.
3017                  *
3018                  * We treat MNT_DWAIT as MNT_WAIT for all instances of
3019                  * getfsstat, since the constants are out of the same
3020                  * namespace.
3021                  */
3022                 if (((fstp->flags & MNT_NOWAIT) == 0 ||
3023                      (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
3024                     (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
3025                         KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
3026                         return(VFS_RETURNED);
3027                 }
3028
3029                 error = statfs64_common(mp, sp, fstp->sfsp);
3030                 if (error) {
3031                         fstp->error = error;
3032                         return(VFS_RETURNED_DONE);
3033                 }
3034                 fstp->sfsp += sizeof(struct statfs64);
3035         }
3036         fstp->count++;
3037         return(VFS_RETURNED);
3038 }
3039
3040 /*
3041  * Get statistics on all file systems in 64 bit mode.
3042  */
3043 int
3044 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
3045 {
3046         user_addr_t sfsp;
3047         int count, maxcount;
3048         struct getfsstat_struct fst;
3049
3050         maxcount = uap->bufsize / sizeof(struct statfs64);
3051
3052         sfsp = uap->buf;
3053         count = 0;
3054
3055         fst.sfsp = sfsp;
3056         fst.flags = uap->flags;
3057         fst.count = 0;
3058         fst.error = 0;
3059         fst.maxcount = maxcount;
3060
3061         vfs_iterate(0, getfsstat64_callback, &fst);
3062
3063         if (fst.error ) {
3064                 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
3065                 return(fst.error);
3066         }
3067
3068         if (fst.sfsp && fst.count > fst.maxcount)
3069                 *retval = fst.maxcount;
3070         else
3071                 *retval = fst.count;
3072
3073         return (0);
3074 }
3075
3076 /*
3077  * gets the associated vnode with the file descriptor passed.
3078  * as input
3079  *
3080  * INPUT
3081  * ctx - vfs context of caller
3082  * fd - file descriptor for which vnode is required.
3083  * vpp - Pointer to pointer to vnode to be returned.
3084  *
3085  * The vnode is returned with an iocount so any vnode obtained
3086  * by this call needs a vnode_put
3087  *
3088  */
3089 int
3090 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
3091 {
3092         int error;
3093         vnode_t vp;
3094         struct fileproc *fp;
3095         proc_t p = vfs_context_proc(ctx);
3096
3097         *vpp =  NULLVP;
3098
3099         error = fp_getfvp(p, fd, &fp, &vp);
3100         if (error)
3101                 return (error);
3102
3103         error = vnode_getwithref(vp);
3104         if (error) {
3105                 (void)fp_drop(p, fd, fp, 0);
3106                 return (error);
3107         }
3108
3109         (void)fp_drop(p, fd, fp, 0);
3110         *vpp = vp;
3111         return (error);
3112 }
3113
3114 /*
3115  * Wrapper function around namei to start lookup from a directory
3116  * specified by a file descriptor ni_dirfd.
3117  *
3118  * In addition to all the errors returned by namei, this call can
3119  * return ENOTDIR if the file descriptor does not refer to a directory.
3120  * and EBADF if the file descriptor is not valid.
3121  */
3122 int
3123 nameiat(struct nameidata *ndp, int dirfd)
3124 {
3125         if ((dirfd != AT_FDCWD) &&
3126             !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
3127             !(ndp->ni_cnd.cn_flags & USEDVP)) {
3128                 int error = 0;
3129                 char c;
3130
3131                 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3132                         error = copyin(ndp->ni_dirp, &c, sizeof(char));
3133                         if (error)
3134                                 return (error);
3135                 } else {
3136                         c = *((char *)(ndp->ni_dirp));
3137                 }
3138
3139                 if (c != '/') {
3140                         vnode_t dvp_at;
3141
3142                         error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3143                             &dvp_at);
3144                         if (error)
3145                                 return (error);
3146
3147                         if (vnode_vtype(dvp_at) != VDIR) {
3148                                 vnode_put(dvp_at);
3149                                 return (ENOTDIR);
3150                         }
3151
3152                         ndp->ni_dvp = dvp_at;
3153                         ndp->ni_cnd.cn_flags |= USEDVP;
3154                         error = namei(ndp);
3155                         ndp->ni_cnd.cn_flags &= ~USEDVP;
3156                         vnode_put(dvp_at);
3157                         return (error);
3158                 }
3159         }
3160
3161         return (namei(ndp));
3162 }
3163
3164 /*
3165  * Change current working directory to a given file descriptor.
3166  */
3167 /* ARGSUSED */
3168 static int
3169 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3170 {
3171         struct filedesc *fdp = p->p_fd;
3172         vnode_t vp;
3173         vnode_t tdp;
3174         vnode_t tvp;
3175         struct mount *mp;
3176         int error;
3177         vfs_context_t ctx = vfs_context_current();
3178
3179         AUDIT_ARG(fd, uap->fd);
3180         if (per_thread && uap->fd == -1) {
3181                 /*
3182                  * Switching back from per-thread to per process CWD; verify we
3183                  * in fact have one before proceeding.  The only success case
3184                  * for this code path is to return 0 preemptively after zapping
3185                  * the thread structure contents.
3186                  */
3187                 thread_t th = vfs_context_thread(ctx);
3188                 if (th) {
3189                         uthread_t uth = get_bsdthread_info(th);
3190                         tvp = uth->uu_cdir;
3191                         uth->uu_cdir = NULLVP;
3192                         if (tvp != NULLVP) {
3193                                 vnode_rele(tvp);
3194                                 return (0);
3195                         }
3196                 }
3197                 return (EBADF);
3198         }
3199
3200         if ( (error = file_vnode(uap->fd, &vp)) )
3201                 return(error);
3202         if ( (error = vnode_getwithref(vp)) ) {
3203                 file_drop(uap->fd);
3204                 return(error);
3205         }
3206
3207         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3208
3209         if (vp->v_type != VDIR) {
3210                 error = ENOTDIR;
3211                 goto out;
3212         }
3213
3214 #if CONFIG_MACF
3215         error = mac_vnode_check_chdir(ctx, vp);
3216         if (error)
3217                 goto out;
3218 #endif
3219         error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3220         if (error)
3221                 goto out;
3222
3223         while (!error && (mp = vp->v_mountedhere) != NULL) {
3224                 if (vfs_busy(mp, LK_NOWAIT)) {
3225                         error = EACCES;
3226                         goto out;
3227                 }
3228                 error = VFS_ROOT(mp, &tdp, ctx);
3229                 vfs_unbusy(mp);
3230                 if (error)
3231                         break;
3232                 vnode_put(vp);
3233                 vp = tdp;
3234         }
3235         if (error)
3236                 goto out;
3237         if ( (error = vnode_ref(vp)) )
3238                 goto out;
3239         vnode_put(vp);
3240
3241         if (per_thread) {
3242                 thread_t th = vfs_context_thread(ctx);
3243                 if (th) {
3244                         uthread_t uth = get_bsdthread_info(th);
3245                         tvp = uth->uu_cdir;
3246                         uth->uu_cdir = vp;
3247                         OSBitOrAtomic(P_THCWD, &p->p_flag);
3248                 } else {
3249                         vnode_rele(vp);
3250                         return (ENOENT);
3251                 }
3252         } else {
3253                 proc_fdlock(p);
3254                 tvp = fdp->fd_cdir;
3255                 fdp->fd_cdir = vp;
3256                 proc_fdunlock(p);
3257         }
3258
3259         if (tvp)
3260                 vnode_rele(tvp);
3261         file_drop(uap->fd);
3262
3263         return (0);
3264 out:
3265         vnode_put(vp);
3266         file_drop(uap->fd);
3267
3268         return(error);
3269 }
3270
3271 int
3272 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3273 {
3274         return common_fchdir(p, uap, 0);
3275 }
3276
3277 int
3278 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3279 {
3280         return common_fchdir(p, (void *)uap, 1);
3281 }
3282
3283 /*
3284  * Change current working directory (".").
3285  *
3286  * Returns:     0                       Success
3287  *      change_dir:ENOTDIR
3288  *      change_dir:???
3289  *      vnode_ref:ENOENT                No such file or directory
3290  */
3291 /* ARGSUSED */
3292 static int
3293 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3294 {
3295         struct filedesc *fdp = p->p_fd;
3296         int error;
3297         struct nameidata nd;
3298         vnode_t tvp;
3299         vfs_context_t ctx = vfs_context_current();
3300
3301         NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3302                 UIO_USERSPACE, uap->path, ctx);
3303         error = change_dir(&nd, ctx);
3304         if (error)
3305                 return (error);
3306         if ( (error = vnode_ref(nd.ni_vp)) ) {
3307                 vnode_put(nd.ni_vp);
3308                 return (error);
3309         }
3310         /*
3311          * drop the iocount we picked up in change_dir
3312          */
3313         vnode_put(nd.ni_vp);
3314
3315         if (per_thread) {
3316                 thread_t th = vfs_context_thread(ctx);
3317                 if (th) {
3318                         uthread_t uth = get_bsdthread_info(th);
3319                         tvp = uth->uu_cdir;
3320                         uth->uu_cdir = nd.ni_vp;
3321                         OSBitOrAtomic(P_THCWD, &p->p_flag);
3322                 } else {
3323                         vnode_rele(nd.ni_vp);
3324                         return (ENOENT);
3325                 }
3326         } else {
3327                 proc_fdlock(p);
3328                 tvp = fdp->fd_cdir;
3329                 fdp->fd_cdir = nd.ni_vp;
3330                 proc_fdunlock(p);
3331         }
3332
3333         if (tvp)
3334                 vnode_rele(tvp);
3335
3336         return (0);
3337 }
3338
3339
3340 /*
3341  * chdir
3342  *
3343  * Change current working directory (".") for the entire process
3344  *
3345  * Parameters:  p       Process requesting the call
3346  *              uap     User argument descriptor (see below)
3347  *              retval  (ignored)
3348  *
3349  * Indirect parameters: uap->path       Directory path
3350  *
3351  * Returns:     0                       Success
3352  *              common_chdir: ENOTDIR
3353  *              common_chdir: ENOENT    No such file or directory
3354  *              common_chdir: ???
3355  *
3356  */
3357 int
3358 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3359 {
3360         return common_chdir(p, (void *)uap, 0);
3361 }
3362
3363 /*
3364  * __pthread_chdir
3365  *
3366  * Change current working directory (".") for a single thread
3367  *
3368  * Parameters:  p       Process requesting the call
3369  *              uap     User argument descriptor (see below)
3370  *              retval  (ignored)
3371  *
3372  * Indirect parameters: uap->path       Directory path
3373  *
3374  * Returns:     0                       Success
3375  *              common_chdir: ENOTDIR
3376  *              common_chdir: ENOENT    No such file or directory
3377  *              common_chdir: ???
3378  *
3379  */
3380 int
3381 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3382 {
3383         return common_chdir(p, (void *)uap, 1);
3384 }
3385
3386
3387 /*
3388  * Change notion of root (``/'') directory.
3389  */
3390 /* ARGSUSED */
3391 int
3392 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3393 {
3394         struct filedesc *fdp = p->p_fd;
3395         int error;
3396         struct nameidata nd;
3397         vnode_t tvp;
3398         vfs_context_t ctx = vfs_context_current();
3399
3400         if ((error = suser(kauth_cred_get(), &p->p_acflag)))
3401                 return (error);
3402
3403         NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3404                 UIO_USERSPACE, uap->path, ctx);
3405         error = change_dir(&nd, ctx);
3406         if (error)
3407                 return (error);
3408
3409 #if CONFIG_MACF
3410         error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3411             &nd.ni_cnd);
3412         if (error) {
3413                 vnode_put(nd.ni_vp);
3414                 return (error);
3415         }
3416 #endif
3417
3418         if ( (error = vnode_ref(nd.ni_vp)) ) {
3419                 vnode_put(nd.ni_vp);
3420                 return (error);
3421         }
3422         vnode_put(nd.ni_vp);
3423
3424         proc_fdlock(p);
3425         tvp = fdp->fd_rdir;
3426         fdp->fd_rdir = nd.ni_vp;
3427         fdp->fd_flags |= FD_CHROOT;
3428         proc_fdunlock(p);
3429
3430         if (tvp != NULL)
3431                 vnode_rele(tvp);
3432
3433         return (0);
3434 }
3435
3436 /*
3437  * Common routine for chroot and chdir.
3438  *
3439  * Returns:     0                       Success
3440  *              ENOTDIR                 Not a directory
3441  *              namei:???               [anything namei can return]
3442  *              vnode_authorize:???     [anything vnode_authorize can return]
3443  */
3444 static int
3445 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3446 {
3447         vnode_t vp;
3448         int error;
3449
3450         if ((error = namei(ndp)))
3451                 return (error);
3452         nameidone(ndp);
3453         vp = ndp->ni_vp;
3454
3455         if (vp->v_type != VDIR) {
3456                 vnode_put(vp);
3457                 return (ENOTDIR);
3458         }
3459
3460 #if CONFIG_MACF
3461         error = mac_vnode_check_chdir(ctx, vp);
3462         if (error) {
3463                 vnode_put(vp);
3464                 return (error);
3465         }
3466 #endif
3467
3468         error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3469         if (error) {
3470                 vnode_put(vp);
3471                 return (error);
3472         }
3473
3474         return (error);
3475 }
3476
3477 /*
3478  * Free the vnode data (for directories) associated with the file glob.
3479  */
3480 struct fd_vn_data *
3481 fg_vn_data_alloc(void)
3482 {
3483         struct fd_vn_data *fvdata;
3484
3485         /* Allocate per fd vnode data */
3486         MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3487                M_FD_VN_DATA, M_WAITOK | M_ZERO);
3488         lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3489         return fvdata;
3490 }
3491
3492 /*
3493  * Free the vnode data (for directories) associated with the file glob.
3494  */
3495 void
3496 fg_vn_data_free(void *fgvndata)
3497 {
3498         struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3499
3500         if (fvdata->fv_buf)
3501                 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3502         lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3503         FREE(fvdata, M_FD_VN_DATA);
3504 }
3505
3506 /*
3507  * Check permissions, allocate an open file structure,
3508  * and call the device open routine if any.
3509  *
3510  * Returns:     0                       Success
3511  *              EINVAL
3512  *              EINTR
3513  *      falloc:ENFILE
3514  *      falloc:EMFILE
3515  *      falloc:ENOMEM
3516  *      vn_open_auth:???
3517  *      dupfdopen:???
3518  *      VNOP_ADVLOCK:???
3519  *      vnode_setsize:???
3520  *
3521  * XXX Need to implement uid, gid
3522  */
3523 int
3524 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3525     struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3526     int32_t *retval)
3527 {
3528         proc_t p = vfs_context_proc(ctx);
3529         uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3530         struct fileproc *fp;
3531         vnode_t vp;
3532         int flags, oflags;
3533         int type, indx, error;
3534         struct flock lf;
3535         struct vfs_context context;
3536
3537         oflags = uflags;
3538
3539         if ((oflags & O_ACCMODE) == O_ACCMODE)
3540                 return(EINVAL);
3541
3542         flags = FFLAGS(uflags);
3543         CLR(flags, FENCRYPTED);
3544         CLR(flags, FUNENCRYPTED);
3545
3546         AUDIT_ARG(fflags, oflags);
3547         AUDIT_ARG(mode, vap->va_mode);
3548
3549         if ((error = falloc_withalloc(p,
3550             &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3551                 return (error);
3552         }
3553         uu->uu_dupfd = -indx - 1;
3554
3555         if ((error = vn_open_auth(ndp, &flags, vap))) {
3556                 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){        /* XXX from fdopen */
3557                         if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3558                                 fp_drop(p, indx, NULL, 0);
3559                                 *retval = indx;
3560                                 return (0);
3561                         }
3562                 }
3563                 if (error == ERESTART)
3564                         error = EINTR;
3565                 fp_free(p, indx, fp);
3566                 return (error);
3567         }
3568         uu->uu_dupfd = 0;
3569         vp = ndp->ni_vp;
3570
3571         fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3572         fp->f_fglob->fg_ops = &vnops;
3573         fp->f_fglob->fg_data = (caddr_t)vp;
3574
3575         if (flags & (O_EXLOCK | O_SHLOCK)) {
3576                 lf.l_whence = SEEK_SET;
3577                 lf.l_start = 0;
3578                 lf.l_len = 0;
3579                 if (flags & O_EXLOCK)
3580                         lf.l_type = F_WRLCK;
3581                 else
3582                         lf.l_type = F_RDLCK;
3583                 type = F_FLOCK;
3584                 if ((flags & FNONBLOCK) == 0)
3585                         type |= F_WAIT;
3586 #if CONFIG_MACF
3587                 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3588                     F_SETLK, &lf);
3589                 if (error)
3590                         goto bad;
3591 #endif
3592                 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
3593                         goto bad;
3594                 fp->f_fglob->fg_flag |= FHASLOCK;
3595         }
3596
3597 #if DEVELOPMENT || DEBUG
3598         /*
3599          * XXX VSWAP: Check for entitlements or special flag here
3600          * so we can restrict access appropriately.
3601          */
3602 #else /* DEVELOPMENT || DEBUG */
3603
3604         if (vnode_isswap(vp) && (flags & (FWRITE | O_TRUNC)) && (ctx != vfs_context_kernel())) {
3605                 /* block attempt to write/truncate swapfile */
3606                 error = EPERM;
3607                 goto bad;
3608         }
3609 #endif /* DEVELOPMENT || DEBUG */
3610
3611         /* try to truncate by setting the size attribute */
3612         if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
3613                 goto bad;
3614
3615         /*
3616          * For directories we hold some additional information in the fd.
3617          */
3618         if (vnode_vtype(vp) == VDIR) {
3619                 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3620         } else {
3621                 fp->f_fglob->fg_vn_data = NULL;
3622         }
3623
3624         vnode_put(vp);
3625
3626         /*
3627          * The first terminal open (without a O_NOCTTY) by a session leader
3628          * results in it being set as the controlling terminal.
3629          */
3630         if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3631             !(flags & O_NOCTTY)) {
3632                 int tmp = 0;
3633
3634                 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3635                     (caddr_t)&tmp, ctx);
3636         }
3637
3638         proc_fdlock(p);
3639         if (flags & O_CLOEXEC)
3640                 *fdflags(p, indx) |= UF_EXCLOSE;
3641         if (flags & O_CLOFORK)
3642                 *fdflags(p, indx) |= UF_FORKCLOSE;
3643         procfdtbl_releasefd(p, indx, NULL);
3644
3645 #if CONFIG_SECLUDED_MEMORY
3646         if (secluded_for_filecache &&
3647             FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
3648             vnode_vtype(vp) == VREG) {
3649                 memory_object_control_t moc;
3650
3651                 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
3652
3653                 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
3654                         /* nothing to do... */
3655                 } else if (fp->f_fglob->fg_flag & FWRITE) {
3656                         /* writable -> no longer  eligible for secluded pages */
3657                         memory_object_mark_eligible_for_secluded(moc,
3658                                                                  FALSE);
3659                 } else if (secluded_for_filecache == 1) {
3660                         char pathname[32] = { 0, };
3661                         size_t copied;
3662                         /* XXX FBDP: better way to detect /Applications/ ? */
3663                         if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3664                                 copyinstr(ndp->ni_dirp,
3665                                           pathname,
3666                                           sizeof (pathname),
3667                                           &copied);
3668                         } else {
3669                                 copystr(CAST_DOWN(void *, ndp->ni_dirp),
3670                                         pathname,
3671                                         sizeof (pathname),
3672                                         &copied);
3673                         }
3674                         pathname[sizeof (pathname) - 1] = '\0';
3675                         if (strncmp(pathname,
3676                                     "/Applications/",
3677                                     strlen("/Applications/")) == 0 &&
3678                             strncmp(pathname,
3679                                     "/Applications/Camera.app/",
3680                                     strlen("/Applications/Camera.app/")) != 0) {
3681                                 /*
3682                                  * not writable
3683                                  * AND from "/Applications/"
3684                                  * AND not from "/Applications/Camera.app/"
3685                                  * ==> eligible for secluded
3686                                  */
3687                                 memory_object_mark_eligible_for_secluded(moc,
3688                                                                          TRUE);
3689                         }
3690                 } else if (secluded_for_filecache == 2) {
3691 #if __arm64__
3692 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_arm64"
3693 #elif __arm__
3694 #define DYLD_SHARED_CACHE_NAME "dyld_shared_cache_armv7"
3695 #else
3696 /* not implemented... */
3697 #endif
3698                         if (!strncmp(vp->v_name,
3699                                      DYLD_SHARED_CACHE_NAME,
3700                                      strlen(DYLD_SHARED_CACHE_NAME)) ||
3701                             !strncmp(vp->v_name,
3702                                      "dyld",
3703                                      strlen(vp->v_name)) ||
3704                             !strncmp(vp->v_name,
3705                                      "launchd",
3706                                      strlen(vp->v_name)) ||
3707                             !strncmp(vp->v_name,
3708                                      "Camera",
3709                                      strlen(vp->v_name)) ||
3710                             !strncmp(vp->v_name,
3711                                      "mediaserverd",
3712                                      strlen(vp->v_name)) ||
3713                             !strncmp(vp->v_name,
3714                                      "SpringBoard",
3715                                      strlen(vp->v_name)) ||
3716                             !strncmp(vp->v_name,
3717                                      "backboardd",
3718                                      strlen(vp->v_name))) {
3719                                 /*
3720                                  * This file matters when launching Camera:
3721                                  * do not store its contents in the secluded
3722                                  * pool that will be drained on Camera launch.
3723                                  */
3724                                 memory_object_mark_eligible_for_secluded(moc,
3725                                                                          FALSE);
3726                         }
3727                 }
3728         }
3729 #endif /* CONFIG_SECLUDED_MEMORY */
3730
3731         fp_drop(p, indx, fp, 1);
3732         proc_fdunlock(p);
3733
3734         *retval = indx;
3735
3736         return (0);
3737 bad:
3738         context = *vfs_context_current();
3739         context.vc_ucred = fp->f_fglob->fg_cred;
3740
3741         if ((fp->f_fglob->fg_flag & FHASLOCK) &&
3742             (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
3743                 lf.l_whence = SEEK_SET;
3744                 lf.l_start = 0;
3745                 lf.l_len = 0;
3746                 lf.l_type = F_UNLCK;
3747
3748                 (void)VNOP_ADVLOCK(
3749                         vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
3750         }
3751
3752         vn_close(vp, fp->f_fglob->fg_flag, &context);
3753         vnode_put(vp);
3754         fp_free(p, indx, fp);
3755
3756         return (error);
3757 }
3758
3759 /*
3760  * While most of the *at syscall handlers can call nameiat() which
3761  * is a wrapper around namei, the use of namei and initialisation
3762  * of nameidata are far removed and in different functions  - namei
3763  * gets called in vn_open_auth for open1. So we'll just do here what
3764  * nameiat() does.
3765  */
3766 static int
3767 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3768     struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
3769     int dirfd)
3770 {
3771         if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
3772                 int error;
3773                 char c;
3774
3775                 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3776                         error = copyin(ndp->ni_dirp, &c, sizeof(char));
3777                         if (error)
3778                                 return (error);
3779                 } else {
3780                         c = *((char *)(ndp->ni_dirp));
3781                 }
3782
3783                 if (c != '/') {
3784                         vnode_t dvp_at;
3785
3786                         error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3787                             &dvp_at);
3788                         if (error)
3789                                 return (error);
3790
3791                         if (vnode_vtype(dvp_at) != VDIR) {
3792                                 vnode_put(dvp_at);
3793                                 return (ENOTDIR);
3794                         }
3795
3796                         ndp->ni_dvp = dvp_at;
3797                         ndp->ni_cnd.cn_flags |= USEDVP;
3798                         error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
3799                             retval);
3800                         vnode_put(dvp_at);
3801                         return (error);
3802                 }
3803         }
3804
3805         return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
3806 }
3807
3808 /*
3809  * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3810  *
3811  * Parameters:  p                       Process requesting the open
3812  *              uap                     User argument descriptor (see below)
3813  *              retval                  Pointer to an area to receive the
3814  *                                      return calue from the system call
3815  *
3816  * Indirect:    uap->path               Path to open (same as 'open')
3817  *              uap->flags              Flags to open (same as 'open'
3818  *              uap->uid                UID to set, if creating
3819  *              uap->gid                GID to set, if creating
3820  *              uap->mode               File mode, if creating (same as 'open')
3821  *              uap->xsecurity          ACL to set, if creating
3822  *
3823  * Returns:     0                       Success
3824  *              !0                      errno value
3825  *
3826  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
3827  *
3828  * XXX:         We should enummerate the possible errno values here, and where
3829  *              in the code they originated.
3830  */
3831 int
3832 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3833 {
3834         struct filedesc *fdp = p->p_fd;
3835         int ciferror;
3836         kauth_filesec_t xsecdst;
3837         struct vnode_attr va;
3838         struct nameidata nd;
3839         int cmode;
3840
3841         AUDIT_ARG(owner, uap->uid, uap->gid);
3842
3843         xsecdst = NULL;
3844         if ((uap->xsecurity != USER_ADDR_NULL) &&
3845             ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
3846                 return ciferror;
3847
3848         VATTR_INIT(&va);
3849         cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3850         VATTR_SET(&va, va_mode, cmode);
3851         if (uap->uid != KAUTH_UID_NONE)
3852                 VATTR_SET(&va, va_uid, uap->uid);
3853         if (uap->gid != KAUTH_GID_NONE)
3854                 VATTR_SET(&va, va_gid, uap->gid);
3855         if (xsecdst != NULL)
3856                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3857
3858         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3859                uap->path, vfs_context_current());
3860
3861         ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3862                          fileproc_alloc_init, NULL, retval);
3863         if (xsecdst != NULL)
3864                 kauth_filesec_free(xsecdst);
3865
3866         return ciferror;
3867 }
3868
3869 /*
3870  * Go through the data-protected atomically controlled open (2)
3871  *
3872  * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3873  */
3874 int open_dprotected_np (__unused proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3875         int flags = uap->flags;
3876         int class = uap->class;
3877         int dpflags = uap->dpflags;
3878
3879         /*
3880          * Follow the same path as normal open(2)
3881          * Look up the item if it exists, and acquire the vnode.
3882          */
3883         struct filedesc *fdp = p->p_fd;
3884         struct vnode_attr va;
3885         struct nameidata nd;
3886         int cmode;
3887         int error;
3888
3889         VATTR_INIT(&va);
3890         /* Mask off all but regular access permissions */
3891         cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3892         VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3893
3894         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3895                uap->path, vfs_context_current());
3896
3897         /*
3898          * Initialize the extra fields in vnode_attr to pass down our
3899          * extra fields.
3900          * 1. target cprotect class.
3901          * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3902          */
3903         if (flags & O_CREAT) {
3904                /* lower level kernel code validates that the class is valid before applying it. */
3905                if (class != PROTECTION_CLASS_DEFAULT) {
3906                        /*
3907                         * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3908                         * file behave the same as open (2)
3909                         */
3910                        VATTR_SET(&va, va_dataprotect_class, class);
3911                }
3912         }
3913
3914         if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3915                 if ( flags & (O_RDWR | O_WRONLY)) {
3916                         /* Not allowed to write raw encrypted bytes */
3917                         return EINVAL;
3918                 }
3919                 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3920                     VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3921                 }
3922                 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3923                     VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3924                 }
3925         }
3926
3927         error = open1(vfs_context_current(), &nd, uap->flags, &va,
3928                       fileproc_alloc_init, NULL, retval);
3929
3930         return error;
3931 }
3932
3933 static int
3934 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
3935     int fd, enum uio_seg segflg, int *retval)
3936 {
3937         struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
3938         struct vnode_attr va;
3939         struct nameidata nd;
3940         int cmode;
3941
3942         VATTR_INIT(&va);
3943         /* Mask off all but regular access permissions */
3944         cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3945         VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3946
3947         NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
3948             segflg, path, ctx);
3949
3950         return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
3951             retval, fd));
3952 }
3953
3954 int
3955 open(proc_t p, struct open_args *uap, int32_t *retval)
3956 {
3957         __pthread_testcancel(1);
3958         return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3959 }
3960
3961 int
3962 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3963     int32_t *retval)
3964 {
3965         return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3966             uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3967 }
3968
3969 int
3970 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3971                 int32_t *retval)
3972 {
3973         return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3974             uap->mode, uap->fd, UIO_USERSPACE, retval));
3975 }
3976
3977 int
3978 openat(proc_t p, struct openat_args *uap, int32_t *retval)
3979 {
3980         __pthread_testcancel(1);
3981         return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3982 }
3983
3984 /*
3985  * openbyid_np: open a file given a file system id and a file system object id
3986  *      the hfs file system object id is an fsobj_id_t {uint32, uint32}
3987  *      file systems that don't support object ids it is a node id (uint64_t).
3988  *
3989  * Parameters:  p                       Process requesting the open
3990  *              uap                     User argument descriptor (see below)
3991  *              retval                  Pointer to an area to receive the
3992  *                                      return calue from the system call
3993  *
3994  * Indirect:    uap->path               Path to open (same as 'open')
3995  *
3996  *              uap->fsid               id of target file system
3997  *              uap->objid              id of target file system object
3998  *              uap->flags              Flags to open (same as 'open')
3999  *
4000  * Returns:     0                       Success
4001  *              !0                      errno value
4002  *
4003  *
4004  * XXX:         We should enummerate the possible errno values here, and where
4005  *              in the code they originated.
4006  */
4007 int
4008 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
4009 {
4010         fsid_t fsid;
4011         uint64_t objid;
4012         int error;
4013         char *buf = NULL;
4014         int buflen = MAXPATHLEN;
4015         int pathlen = 0;
4016         vfs_context_t ctx = vfs_context_current();
4017
4018         if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
4019                 return (error);
4020         }
4021
4022         if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
4023                 return (error);
4024         }
4025
4026         /*uap->obj is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
4027         if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
4028                 return (error);
4029         }
4030
4031         AUDIT_ARG(value32, fsid.val[0]);
4032         AUDIT_ARG(value64, objid);
4033
4034         /*resolve path from fsis, objid*/
4035         do {
4036                 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
4037                 if (buf == NULL) {
4038                         return (ENOMEM);
4039                 }
4040
4041                 error = fsgetpath_internal(
4042                         ctx, fsid.val[0], objid,
4043                         buflen, buf, &pathlen);
4044
4045                 if (error) {
4046                         FREE(buf, M_TEMP);
4047                         buf = NULL;
4048                 }
4049         } while (error == ENOSPC && (buflen += MAXPATHLEN));
4050
4051         if (error) {
4052                 return error;
4053         }
4054
4055         buf[pathlen] = 0;
4056
4057         error = openat_internal(
4058                 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
4059
4060         FREE(buf, M_TEMP);
4061
4062         return error;
4063 }
4064
4065
4066 /*
4067  * Create a special file.
4068  */
4069 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
4070
4071 int
4072 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
4073 {
4074         struct vnode_attr va;
4075         vfs_context_t ctx = vfs_context_current();
4076         int error;
4077         struct nameidata nd;
4078         vnode_t vp, dvp;
4079
4080         VATTR_INIT(&va);
4081         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4082         VATTR_SET(&va, va_rdev, uap->dev);
4083
4084         /* If it's a mknod() of a FIFO, call mkfifo1() instead */
4085         if ((uap->mode & S_IFMT) == S_IFIFO)
4086                 return(mkfifo1(ctx, uap->path, &va));
4087
4088         AUDIT_ARG(mode, uap->mode);
4089         AUDIT_ARG(value32, uap->dev);
4090
4091         if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
4092                 return (error);
4093         NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
4094                 UIO_USERSPACE, uap->path, ctx);
4095         error = namei(&nd);
4096         if (error)
4097                 return (error);
4098         dvp = nd.ni_dvp;
4099         vp = nd.ni_vp;
4100
4101         if (vp != NULL) {
4102                 error = EEXIST;
4103                 goto out;
4104         }
4105
4106         switch (uap->mode & S_IFMT) {
4107         case S_IFCHR:
4108                 VATTR_SET(&va, va_type, VCHR);
4109                 break;
4110         case S_IFBLK:
4111                 VATTR_SET(&va, va_type, VBLK);
4112                 break;
4113         default:
4114                 error = EINVAL;
4115                 goto out;
4116         }
4117
4118 #if CONFIG_MACF
4119         error = mac_vnode_check_create(ctx,
4120             nd.ni_dvp, &nd.ni_cnd, &va);
4121         if (error)
4122                 goto out;
4123 #endif
4124
4125         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4126                 goto out;
4127
4128         if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
4129                 goto out;
4130
4131         if (vp) {
4132                 int     update_flags = 0;
4133
4134                 // Make sure the name & parent pointers are hooked up
4135                 if (vp->v_name == NULL)
4136                         update_flags |= VNODE_UPDATE_NAME;
4137                 if (vp->v_parent == NULLVP)
4138                         update_flags |= VNODE_UPDATE_PARENT;
4139
4140                 if (update_flags)
4141                         vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4142
4143 #if CONFIG_FSE
4144                 add_fsevent(FSE_CREATE_FILE, ctx,
4145                     FSE_ARG_VNODE, vp,
4146                     FSE_ARG_DONE);
4147 #endif
4148         }
4149
4150 out:
4151         /*
4152          * nameidone has to happen before we vnode_put(dvp)
4153          * since it may need to release the fs_nodelock on the dvp
4154          */
4155         nameidone(&nd);
4156
4157         if (vp)
4158                 vnode_put(vp);
4159         vnode_put(dvp);
4160
4161         return (error);
4162 }
4163
4164 /*
4165  * Create a named pipe.
4166  *
4167  * Returns:     0                       Success
4168  *              EEXIST
4169  *      namei:???
4170  *      vnode_authorize:???
4171  *      vn_create:???
4172  */
4173 static int
4174 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
4175 {
4176         vnode_t vp, dvp;
4177         int error;
4178         struct nameidata nd;
4179
4180         NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
4181                 UIO_USERSPACE, upath, ctx);
4182         error = namei(&nd);
4183         if (error)
4184                 return (error);
4185         dvp = nd.ni_dvp;
4186         vp = nd.ni_vp;
4187
4188         /* check that this is a new file and authorize addition */
4189         if (vp != NULL) {
4190                 error = EEXIST;
4191                 goto out;
4192         }
4193         VATTR_SET(vap, va_type, VFIFO);
4194
4195         if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
4196                 goto out;
4197
4198         error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4199 out:
4200         /*
4201          * nameidone has to happen before we vnode_put(dvp)
4202          * since it may need to release the fs_nodelock on the dvp
4203          */
4204         nameidone(&nd);
4205
4206         if (vp)
4207                 vnode_put(vp);
4208         vnode_put(dvp);
4209
4210         return error;
4211 }
4212
4213
4214 /*
4215  * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4216  *
4217  * Parameters:  p                       Process requesting the open
4218  *              uap                     User argument descriptor (see below)
4219  *              retval                  (Ignored)
4220  *
4221  * Indirect:    uap->path               Path to fifo (same as 'mkfifo')
4222  *              uap->uid                UID to set
4223  *              uap->gid                GID to set
4224  *              uap->mode               File mode to set (same as 'mkfifo')
4225  *              uap->xsecurity          ACL to set, if creating
4226  *
4227  * Returns:     0                       Success
4228  *              !0                      errno value
4229  *
4230  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
4231  *
4232  * XXX:         We should enummerate the possible errno values here, and where
4233  *              in the code they originated.
4234  */
4235 int
4236 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4237 {
4238         int ciferror;
4239         kauth_filesec_t xsecdst;
4240         struct vnode_attr va;
4241
4242         AUDIT_ARG(owner, uap->uid, uap->gid);
4243
4244         xsecdst = KAUTH_FILESEC_NONE;
4245         if (uap->xsecurity != USER_ADDR_NULL) {
4246                 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
4247                         return ciferror;
4248         }
4249
4250         VATTR_INIT(&va);
4251         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4252         if (uap->uid != KAUTH_UID_NONE)
4253                 VATTR_SET(&va, va_uid, uap->uid);
4254         if (uap->gid != KAUTH_GID_NONE)
4255                 VATTR_SET(&va, va_gid, uap->gid);
4256         if (xsecdst != KAUTH_FILESEC_NONE)
4257                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4258
4259         ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4260
4261         if (xsecdst != KAUTH_FILESEC_NONE)
4262                 kauth_filesec_free(xsecdst);
4263         return ciferror;
4264 }
4265
4266 /* ARGSUSED */
4267 int
4268 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4269 {
4270         struct vnode_attr va;
4271
4272         VATTR_INIT(&va);
4273         VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4274
4275         return(mkfifo1(vfs_context_current(), uap->path, &va));
4276 }
4277
4278
/*
 * my_strrchr: Find the last occurrence of 'ch' in the NUL-terminated
 * string 'p'.
 *
 * The terminator itself takes part in the search, so asking for '\0'
 * yields a pointer to the end of the string.  Returns NULL when 'ch'
 * does not occur.
 */
static char *
my_strrchr(char *p, int ch)
{
	char *last = NULL;

	do {
		if (*p == ch) {
			last = p;
		}
	} while (*p++ != '\0');

	return last;
}
4292
4293 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4294
4295 int
4296 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4297 {
4298         int ret, len = _len;
4299
4300         *truncated_path = 0;
4301         ret = vn_getpath(dvp, path, &len);
4302         if (ret == 0 && len < (MAXPATHLEN - 1)) {
4303                 if (leafname) {
4304                         path[len-1] = '/';
4305                         len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
4306                         if (len > MAXPATHLEN) {
4307                                 char *ptr;
4308
4309                                 // the string got truncated!
4310                                 *truncated_path = 1;
4311                                 ptr = my_strrchr(path, '/');
4312                                 if (ptr) {
4313                                         *ptr = '\0';   // chop off the string at the last directory component
4314                                 }
4315                                 len = strlen(path) + 1;
4316                         }
4317                 }
4318         } else if (ret == 0) {
4319                 *truncated_path = 1;
4320         } else if (ret != 0) {
4321                 struct vnode *mydvp=dvp;
4322
4323                 if (ret != ENOSPC) {
4324                         printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4325                                dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4326                 }
4327                 *truncated_path = 1;
4328
4329                 do {
4330                         if (mydvp->v_parent != NULL) {
4331                                 mydvp = mydvp->v_parent;
4332                         } else if (mydvp->v_mount) {
4333                                 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4334                                 break;
4335                         } else {
4336                                 // no parent and no mount point?  only thing is to punt and say "/" changed
4337                                 strlcpy(path, "/", _len);
4338                                 len = 2;
4339                                 mydvp = NULL;
4340                         }
4341
4342                         if (mydvp == NULL) {
4343                                 break;
4344                         }
4345
4346                         len = _len;
4347                         ret = vn_getpath(mydvp, path, &len);
4348                 } while (ret == ENOSPC);
4349         }
4350
4351         return len;
4352 }
4353
4354
4355 /*
4356  * Make a hard file link.
4357  *
4358  * Returns:     0                       Success
4359  *              EPERM
4360  *              EEXIST
4361  *              EXDEV
4362  *      namei:???
4363  *      vnode_authorize:???
4364  *      VNOP_LINK:???
4365  */
4366 /* ARGSUSED */
4367 static int
4368 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4369     user_addr_t link, int flag, enum uio_seg segflg)
4370 {
4371         vnode_t vp, dvp, lvp;
4372         struct nameidata nd;
4373         int follow;
4374         int error;
4375 #if CONFIG_FSE
4376         fse_info finfo;
4377 #endif
4378         int need_event, has_listeners;
4379         char *target_path = NULL;
4380         int truncated=0;
4381
4382         vp = dvp = lvp = NULLVP;
4383
4384         /* look up the object we are linking to */
4385         follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4386         NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4387             segflg, path, ctx);
4388
4389         error = nameiat(&nd, fd1);
4390         if (error)
4391                 return (error);
4392         vp = nd.ni_vp;
4393
4394         nameidone(&nd);
4395
4396         /*
4397          * Normally, linking to directories is not supported.
4398          * However, some file systems may have limited support.
4399          */
4400         if (vp->v_type == VDIR) {
4401                 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
4402                         error = EPERM;   /* POSIX */
4403                         goto out;
4404                 }
4405
4406                 /* Linking to a directory requires ownership. */
4407                 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4408                         struct vnode_attr dva;
4409
4410                         VATTR_INIT(&dva);
4411                         VATTR_WANTED(&dva, va_uid);
4412                         if (vnode_getattr(vp, &dva, ctx) != 0 ||
4413                             !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4414                             (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4415                                 error = EACCES;
4416                                 goto out;
4417                         }
4418                 }
4419         }
4420
4421         /* lookup the target node */
4422 #if CONFIG_TRIGGERS
4423         nd.ni_op = OP_LINK;
4424 #endif
4425         nd.ni_cnd.cn_nameiop = CREATE;
4426         nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4427         nd.ni_dirp = link;
4428         error = nameiat(&nd, fd2);
4429         if (error != 0)
4430                 goto out;
4431         dvp = nd.ni_dvp;
4432         lvp = nd.ni_vp;
4433
4434 #if CONFIG_MACF
4435         if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
4436                 goto out2;
4437 #endif
4438
4439         /* or to anything that kauth doesn't want us to (eg. immutable items) */
4440         if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
4441                 goto out2;
4442
4443         /* target node must not exist */
4444         if (lvp != NULLVP) {
4445                 error = EEXIST;
4446                 goto out2;
4447         }
4448         /* cannot link across mountpoints */
4449         if (vnode_mount(vp) != vnode_mount(dvp)) {
4450                 error = EXDEV;
4451                 goto out2;
4452         }
4453
4454         /* authorize creation of the target note */
4455         if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4456                 goto out2;
4457
4458         /* and finally make the link */
4459         error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4460         if (error)
4461                 goto out2;
4462
4463 #if CONFIG_MACF
4464         (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4465 #endif
4466
4467 #if CONFIG_FSE
4468         need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4469 #else
4470         need_event = 0;
4471 #endif
4472         has_listeners = kauth_authorize_fileop_has_listeners();
4473
4474         if (need_event || has_listeners) {
4475                 char *link_to_path = NULL;
4476                 int len, link_name_len;
4477
4478                 /* build the path to the new link file */
4479                 GET_PATH(target_path);
4480                 if (target_path == NULL) {
4481                         error = ENOMEM;
4482                         goto out2;
4483                 }
4484
4485                 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4486
4487                 if (has_listeners) {
4488                         /* build the path to file we are linking to */
4489                         GET_PATH(link_to_path);
4490                         if (link_to_path == NULL) {
4491                                 error = ENOMEM;
4492                                 goto out2;
4493                         }
4494
4495                         link_name_len = MAXPATHLEN;
4496                         if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4497                                 /*
4498                                  * Call out to allow 3rd party notification of rename.
4499                                  * Ignore result of kauth_authorize_fileop call.
4500                                  */
4501                                 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4502                                                        (uintptr_t)link_to_path,
4503                                                        (uintptr_t)target_path);
4504                         }
4505                         if (link_to_path != NULL) {
4506                                 RELEASE_PATH(link_to_path);
4507                         }
4508                 }
4509 #if CONFIG_FSE
4510                 if (need_event) {
4511                         /* construct fsevent */
4512                         if (get_fse_info(vp, &finfo, ctx) == 0) {
4513                                 if (truncated) {
4514                                         finfo.mode |= FSE_TRUNCATED_PATH;
4515                                 }
4516
4517                                 // build the path to the destination of the link
4518                                 add_fsevent(FSE_CREATE_FILE, ctx,
4519                                             FSE_ARG_STRING, len, target_path,
4520                                             FSE_ARG_FINFO, &finfo,
4521                                             FSE_ARG_DONE);
4522                         }
4523                         if (vp->v_parent) {
4524                             add_fsevent(FSE_STAT_CHANGED, ctx,
4525                                 FSE_ARG_VNODE, vp->v_parent,
4526                                 FSE_ARG_DONE);
4527                         }
4528                 }
4529 #endif
4530         }
4531 out2:
4532         /*
4533          * nameidone has to happen before we vnode_put(dvp)
4534          * since it may need to release the fs_nodelock on the dvp
4535          */
4536         nameidone(&nd);
4537         if (target_path != NULL) {
4538                 RELEASE_PATH(target_path);
4539         }
4540 out:
4541         if (lvp)
4542                 vnode_put(lvp);
4543         if (dvp)
4544                 vnode_put(dvp);
4545         vnode_put(vp);
4546         return (error);
4547 }
4548
4549 int
4550 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4551 {
4552         return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4553             AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4554 }
4555
4556 int
4557 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4558 {
4559         if (uap->flag & ~AT_SYMLINK_FOLLOW)
4560                 return (EINVAL);
4561
4562         return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4563             uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4564 }
4565
4566 /*
4567  * Make a symbolic link.
4568  *
4569  * We could add support for ACLs here too...
4570  */
4571 /* ARGSUSED */
4572 static int
4573 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4574     user_addr_t link, enum uio_seg segflg)
4575 {
4576         struct vnode_attr va;
4577         char *path;
4578         int error;
4579         struct nameidata nd;
4580         vnode_t vp, dvp;
4581         size_t dummy=0;
4582         proc_t p;
4583
4584         error = 0;
4585         if (UIO_SEG_IS_USER_SPACE(segflg)) {
4586                 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4587                 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4588         } else {
4589                 path = (char *)path_data;
4590         }
4591         if (error)
4592                 goto out;
4593         AUDIT_ARG(text, path);  /* This is the link string */
4594
4595         NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4596             segflg, link, ctx);
4597
4598         error = nameiat(&nd, fd);
4599         if (error)
4600                 goto out;
4601         dvp = nd.ni_dvp;
4602         vp = nd.ni_vp;
4603
4604         p = vfs_context_proc(ctx);
4605         VATTR_INIT(&va);
4606         VATTR_SET(&va, va_type, VLNK);
4607         VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
4608
4609 #if CONFIG_MACF
4610         error = mac_vnode_check_create(ctx,
4611                         dvp, &nd.ni_cnd, &va);
4612 #endif
4613         if (error != 0) {
4614             goto skipit;
4615         }
4616
4617         if (vp != NULL) {
4618             error = EEXIST;
4619             goto skipit;
4620         }
4621
4622         /* authorize */
4623         if (error == 0)
4624                 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
4625         /* get default ownership, etc. */
4626         if (error == 0)
4627                 error = vnode_authattr_new(dvp, &va, 0, ctx);
4628         if (error == 0)
4629                 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
4630
4631 #if CONFIG_MACF
4632         if (error == 0 && vp)
4633                 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
4634 #endif
4635
4636         /* do fallback attribute handling */
4637         if (error == 0 && vp)
4638                 error = vnode_setattr_fallback(vp, &va, ctx);
4639
4640         if (error == 0) {
4641                 int     update_flags = 0;
4642
4643                 /*check if a new vnode was created, else try to get one*/
4644                 if (vp == NULL) {
4645                         nd.ni_cnd.cn_nameiop = LOOKUP;
4646 #if CONFIG_TRIGGERS
4647                         nd.ni_op = OP_LOOKUP;
4648 #endif
4649                         nd.ni_cnd.cn_flags = 0;
4650                         error = nameiat(&nd, fd);
4651                         vp = nd.ni_vp;
4652
4653                         if (vp == NULL)
4654                                 goto skipit;
4655                 }
4656
4657 #if 0  /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
4658                 /* call out to allow 3rd party notification of rename.
4659                  * Ignore result of kauth_authorize_fileop call.
4660                  */
4661                 if (kauth_authorize_fileop_has_listeners() &&
4662                     namei(&nd) == 0) {
4663                         char *new_link_path = NULL;
4664                         int             len;
4665
4666                         /* build the path to the new link file */
4667                         new_link_path = get_pathbuff();
4668                         len = MAXPATHLEN;
4669                         vn_getpath(dvp, new_link_path, &len);
4670                         if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
4671                                 new_link_path[len - 1] = '/';
4672                                 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
4673                         }
4674
4675                         kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
4676                                            (uintptr_t)path, (uintptr_t)new_link_path);
4677                         if (new_link_path != NULL)
4678                                 release_pathbuff(new_link_path);
4679                 }
4680 #endif
4681                 // Make sure the name & parent pointers are hooked up
4682                 if (vp->v_name == NULL)
4683                         update_flags |= VNODE_UPDATE_NAME;
4684                 if (vp->v_parent == NULLVP)
4685                         update_flags |= VNODE_UPDATE_PARENT;
4686
4687                 if (update_flags)
4688                         vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4689
4690 #if CONFIG_FSE
4691                 add_fsevent(FSE_CREATE_FILE, ctx,
4692                             FSE_ARG_VNODE, vp,
4693                             FSE_ARG_DONE);
4694 #endif
4695         }
4696
4697 skipit:
4698         /*
4699          * nameidone has to happen before we vnode_put(dvp)
4700          * since it may need to release the fs_nodelock on the dvp
4701          */
4702         nameidone(&nd);
4703
4704         if (vp)
4705                 vnode_put(vp);
4706         vnode_put(dvp);
4707 out:
4708         if (path && (path != (char *)path_data))
4709                 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
4710
4711         return (error);
4712 }
4713
4714 int
4715 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4716 {
4717         return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4718             uap->link, UIO_USERSPACE));
4719 }
4720
4721 int
4722 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4723     __unused int32_t *retval)
4724 {
4725         return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4726             uap->path2, UIO_USERSPACE));
4727 }
4728
4729 /*
4730  * Delete a whiteout from the filesystem.
4731  * No longer supported.
4732  */
4733 int
4734 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4735 {
4736         return (ENOTSUP);
4737 }
4738
4739 /*
4740  * Delete a name from the filesystem.
4741  */
4742 /* ARGSUSED */
4743 static int
4744 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4745     user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4746 {
4747         struct nameidata nd;
4748         vnode_t vp, dvp;
4749         int error;
4750         struct componentname *cnp;
4751         char  *path = NULL;
4752         int  len=0;
4753 #if CONFIG_FSE
4754         fse_info  finfo;
4755         struct vnode_attr va;
4756 #endif
4757         int flags;
4758         int need_event;
4759         int has_listeners;
4760         int truncated_path;
4761         int batched;
4762         struct vnode_attr *vap;
4763         int do_retry;
4764         int retry_count = 0;
4765         int cn_flags;
4766
4767         cn_flags = LOCKPARENT;
4768         if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
4769                 cn_flags |= AUDITVNPATH1;
4770         /* If a starting dvp is passed, it trumps any fd passed. */
4771         if (start_dvp)
4772                 cn_flags |= USEDVP;
4773
4774 #if NAMEDRSRCFORK
4775         /* unlink or delete is allowed on rsrc forks and named streams */
4776         cn_flags |= CN_ALLOWRSRCFORK;
4777 #endif
4778
4779 retry:
4780         do_retry = 0;
4781         flags = 0;
4782         need_event = 0;
4783         has_listeners = 0;
4784         truncated_path = 0;
4785         vap = NULL;
4786
4787         NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4788
4789         nd.ni_dvp = start_dvp;
4790         nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4791         cnp = &nd.ni_cnd;
4792
4793 continue_lookup:
4794         error = nameiat(&nd, fd);
4795         if (error)
4796                 return (error);
4797
4798         dvp = nd.ni_dvp;
4799         vp = nd.ni_vp;
4800
4801
4802         /* With Carbon delete semantics, busy files cannot be deleted */
4803         if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4804                 flags |= VNODE_REMOVE_NODELETEBUSY;
4805         }
4806
4807         /* Skip any potential upcalls if told to. */
4808         if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4809                 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4810         }
4811
4812         if (vp) {
4813                 batched = vnode_compound_remove_available(vp);
4814                 /*
4815                  * The root of a mounted filesystem cannot be deleted.
4816                  */
4817                 if (vp->v_flag & VROOT) {
4818                         error = EBUSY;
4819                 }
4820
4821 #if DEVELOPMENT || DEBUG
4822         /*
4823          * XXX VSWAP: Check for entitlements or special flag here
4824          * so we can restrict access appropriately.
4825          */
4826 #else /* DEVELOPMENT || DEBUG */
4827
4828                 if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
4829                         error = EPERM;
4830                         goto out;
4831                 }
4832 #endif /* DEVELOPMENT || DEBUG */
4833
4834                 if (!batched) {
4835                         error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4836                         if (error) {
4837                                 if (error == ENOENT) {
4838                                         assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4839                                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4840                                                 do_retry = 1;
4841                                                 retry_count++;
4842                                         }
4843                                 }
4844                                 goto out;
4845                         }
4846                 }
4847         } else {
4848                 batched = 1;
4849
4850                 if (!vnode_compound_remove_available(dvp)) {
4851                         panic("No vp, but no compound remove?");
4852                 }
4853         }
4854
4855 #if CONFIG_FSE
4856         need_event = need_fsevent(FSE_DELETE, dvp);
4857         if (need_event) {
4858                 if (!batched) {
4859                         if ((vp->v_flag & VISHARDLINK) == 0) {
4860                                 /* XXX need to get these data in batched VNOP */
4861                                 get_fse_info(vp, &finfo, ctx);
4862                         }
4863                 } else {
4864                         error = vfs_get_notify_attributes(&va);
4865                         if (error) {
4866                                 goto out;
4867                         }
4868
4869                         vap = &va;
4870                 }
4871         }
4872 #endif
4873         has_listeners = kauth_authorize_fileop_has_listeners();
4874         if (need_event || has_listeners) {
4875                 if (path == NULL) {
4876                         GET_PATH(path);
4877                         if (path == NULL) {
4878                                 error = ENOMEM;
4879                                 goto out;
4880                         }
4881                 }
4882                 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4883         }
4884
4885 #if NAMEDRSRCFORK
4886         if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4887                 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4888         else
4889 #endif
4890         {
4891                 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
4892                 vp = nd.ni_vp;
4893                 if (error == EKEEPLOOKING) {
4894                         if (!batched) {
4895                                 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4896                         }
4897
4898                         if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
4899                                 panic("EKEEPLOOKING, but continue flag not set?");
4900                         }
4901
4902                         if (vnode_isdir(vp)) {
4903                                 error = EISDIR;
4904                                 goto out;
4905                         }
4906                         goto continue_lookup;
4907                 } else if (error == ENOENT && batched) {
4908                         assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4909                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4910                                 /*
4911                                  * For compound VNOPs, the authorization callback may
4912                                  * return ENOENT in case of racing hardlink lookups
4913                                  * hitting the name  cache, redrive the lookup.
4914                                  */
4915                                 do_retry = 1;
4916                                 retry_count += 1;
4917                                 goto out;
4918                         }
4919                 }
4920         }
4921
4922         /*
4923          * Call out to allow 3rd party notification of delete.
4924          * Ignore result of kauth_authorize_fileop call.
4925          */
4926         if (!error) {
4927                 if (has_listeners) {
4928                         kauth_authorize_fileop(vfs_context_ucred(ctx),
4929                                 KAUTH_FILEOP_DELETE,
4930                                 (uintptr_t)vp,
4931                                 (uintptr_t)path);
4932                 }
4933
4934                 if (vp->v_flag & VISHARDLINK) {
4935                     //
4936                     // if a hardlink gets deleted we want to blow away the
4937                     // v_parent link because the path that got us to this
4938                     // instance of the link is no longer valid.  this will
4939                     // force the next call to get the path to ask the file
4940                     // system instead of just following the v_parent link.
4941                     //
4942                     vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4943                 }
4944
4945 #if CONFIG_FSE
4946                 if (need_event) {
4947                         if (vp->v_flag & VISHARDLINK) {
4948                                 get_fse_info(vp, &finfo, ctx);
4949                         } else if (vap) {
4950                                 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4951                         }
4952                         if (truncated_path) {
4953                                 finfo.mode |= FSE_TRUNCATED_PATH;
4954                         }
4955                         add_fsevent(FSE_DELETE, ctx,
4956                                                 FSE_ARG_STRING, len, path,
4957                                                 FSE_ARG_FINFO, &finfo,
4958                                                 FSE_ARG_DONE);
4959                 }
4960 #endif
4961         }
4962
4963 out:
4964         if (path != NULL)
4965                 RELEASE_PATH(path);
4966
4967 #if NAMEDRSRCFORK
4968         /* recycle the deleted rsrc fork vnode to force a reclaim, which
4969          * will cause its shadow file to go away if necessary.
4970          */
4971          if (vp && (vnode_isnamedstream(vp)) &&
4972                 (vp->v_parent != NULLVP) &&
4973                 vnode_isshadow(vp)) {
4974                         vnode_recycle(vp);
4975          }
4976 #endif
4977         /*
4978          * nameidone has to happen before we vnode_put(dvp)
4979          * since it may need to release the fs_nodelock on the dvp
4980          */
4981         nameidone(&nd);
4982         vnode_put(dvp);
4983         if (vp) {
4984                 vnode_put(vp);
4985         }
4986
4987         if (do_retry) {
4988                 goto retry;
4989         }
4990
4991         return (error);
4992 }
4993
4994 int
4995 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
4996     enum uio_seg segflg, int unlink_flags)
4997 {
4998         return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
4999             unlink_flags));
5000 }
5001
5002 /*
5003  * Delete a name from the filesystem using Carbon semantics.
5004  */
5005 int
5006 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
5007 {
5008         return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5009             uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
5010 }
5011
5012 /*
5013  * Delete a name from the filesystem using POSIX semantics.
5014  */
5015 int
5016 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
5017 {
5018         return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
5019             uap->path, UIO_USERSPACE, 0));
5020 }
5021
5022 int
5023 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
5024 {
5025         if (uap->flag & ~AT_REMOVEDIR)
5026                 return (EINVAL);
5027
5028         if (uap->flag & AT_REMOVEDIR)
5029                 return (rmdirat_internal(vfs_context_current(), uap->fd,
5030                     uap->path, UIO_USERSPACE));
5031         else
5032                 return (unlinkat_internal(vfs_context_current(), uap->fd,
5033                     NULLVP, uap->path, UIO_USERSPACE, 0));
5034 }
5035
5036 /*
5037  * Reposition read/write file offset.
5038  */
5039 int
5040 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
5041 {
5042         struct fileproc *fp;
5043         vnode_t vp;
5044         struct vfs_context *ctx;
5045         off_t offset = uap->offset, file_size;
5046         int error;
5047
5048         if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
5049                 if (error == ENOTSUP)
5050                         return (ESPIPE);
5051                 return (error);
5052         }
5053         if (vnode_isfifo(vp)) {
5054                 file_drop(uap->fd);
5055                 return(ESPIPE);
5056         }
5057
5058
5059         ctx = vfs_context_current();
5060 #if CONFIG_MACF
5061         if (uap->whence == L_INCR && uap->offset == 0)
5062                 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
5063                     fp->f_fglob);
5064         else
5065                 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
5066                     fp->f_fglob);
5067         if (error) {
5068                 file_drop(uap->fd);
5069                 return (error);
5070         }
5071 #endif
5072         if ( (error = vnode_getwithref(vp)) ) {
5073                 file_drop(uap->fd);
5074                 return(error);
5075         }
5076
5077         switch (uap->whence) {
5078         case L_INCR:
5079                 offset += fp->f_fglob->fg_offset;
5080                 break;
5081         case L_XTND:
5082                 if ((error = vnode_size(vp, &file_size, ctx)) != 0)
5083                         break;
5084                 offset += file_size;
5085                 break;
5086         case L_SET:
5087                 break;
5088         case SEEK_HOLE:
5089         error = VNOP_IOCTL(vp, FSIOC_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
5090                 break;
5091         case SEEK_DATA:
5092         error = VNOP_IOCTL(vp, FSIOC_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
5093                 break;
5094         default:
5095                 error = EINVAL;
5096         }
5097         if (error == 0) {
5098                 if (uap->offset > 0 && offset < 0) {
5099                         /* Incremented/relative move past max size */
5100                         error = EOVERFLOW;
5101                 } else {
5102                         /*
5103                          * Allow negative offsets on character devices, per
5104                          * POSIX 1003.1-2001.  Most likely for writing disk
5105                          * labels.
5106                          */
5107                         if (offset < 0 && vp->v_type != VCHR) {
5108                                 /* Decremented/relative move before start */
5109                                 error = EINVAL;
5110                         } else {
5111                                 /* Success */
5112                                 fp->f_fglob->fg_offset = offset;
5113                                 *retval = fp->f_fglob->fg_offset;
5114                         }
5115                 }
5116         }
5117
5118         /*
5119          * An lseek can affect whether data is "available to read."  Use
5120          * hint of NOTE_NONE so no EVFILT_VNODE events fire
5121          */
5122         post_event_if_success(vp, error, NOTE_NONE);
5123         (void)vnode_put(vp);
5124         file_drop(uap->fd);
5125         return (error);
5126 }
5127
5128
5129 /*
5130  * Check access permissions.
5131  *
5132  * Returns:     0                       Success
5133  *              vnode_authorize:???
5134  */
5135 static int
5136 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
5137 {
5138         kauth_action_t action;
5139         int error;
5140
5141         /*
5142          * If just the regular access bits, convert them to something
5143          * that vnode_authorize will understand.
5144          */
5145         if (!(uflags & _ACCESS_EXTENDED_MASK)) {
5146                 action = 0;
5147                 if (uflags & R_OK)
5148                         action |= KAUTH_VNODE_READ_DATA;        /* aka KAUTH_VNODE_LIST_DIRECTORY */
5149                 if (uflags & W_OK) {
5150                         if (vnode_isdir(vp)) {
5151                                 action |= KAUTH_VNODE_ADD_FILE |
5152                                     KAUTH_VNODE_ADD_SUBDIRECTORY;
5153                                 /* might want delete rights here too */
5154                         } else {
5155                                 action |= KAUTH_VNODE_WRITE_DATA;
5156                         }
5157                 }
5158                 if (uflags & X_OK) {
5159                         if (vnode_isdir(vp)) {
5160                                 action |= KAUTH_VNODE_SEARCH;
5161                         } else {
5162                                 action |= KAUTH_VNODE_EXECUTE;
5163                         }
5164                 }
5165         } else {
5166                 /* take advantage of definition of uflags */
5167                 action = uflags >> 8;
5168         }
5169
5170 #if CONFIG_MACF
5171         error = mac_vnode_check_access(ctx, vp, uflags);
5172         if (error)
5173                 return (error);
5174 #endif /* MAC */
5175
5176         /* action == 0 means only check for existence */
5177         if (action != 0) {
5178                 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
5179         } else {
5180                 error = 0;
5181         }
5182
5183         return(error);
5184 }
5185
5186
5187
5188 /*
5189  * access_extended: Check access permissions in bulk.
5190  *
5191  * Description: uap->entries            Pointer to an array of accessx
5192  *                                      descriptor structs, plus one or
5193  *                                      more NULL terminated strings (see
5194  *                                      "Notes" section below).
5195  *              uap->size               Size of the area pointed to by
5196  *                                      uap->entries.
5197  *              uap->results            Pointer to the results array.
5198  *
5199  * Returns:     0                       Success
5200  *              ENOMEM                  Insufficient memory
5201  *              EINVAL                  Invalid arguments
5202  *              namei:EFAULT            Bad address
5203  *              namei:ENAMETOOLONG      Filename too long
5204  *              namei:ENOENT            No such file or directory
5205  *              namei:ELOOP             Too many levels of symbolic links
5206  *              namei:EBADF             Bad file descriptor
5207  *              namei:ENOTDIR           Not a directory
5208  *              namei:???
5209  *              access1:
5210  *
5211  * Implicit returns:
5212  *              uap->results            Array contents modified
5213  *
5214  * Notes:       The uap->entries are structured as an arbitrary length array
5215  *              of accessx descriptors, followed by one or more NULL terminated
5216  *              strings
5217  *
5218  *                      struct accessx_descriptor[0]
5219  *                      ...
5220  *                      struct accessx_descriptor[n]
5221  *                      char name_data[0];
5222  *
5223  *              We determine the entry count by walking the buffer containing
5224  *              the uap->entries argument descriptor.  For each descriptor we
5225  *              see, the valid values for the offset ad_name_offset will be
5226  *              in the byte range:
5227  *
5228  *                      [ uap->entries + sizeof(struct accessx_descriptor) ]
5229  *                                              to
5230  *                              [ uap->entries + uap->size - 2 ]
5231  *
5232  *              since we must have at least one string, and the string must
5233  *              be at least one character plus the NULL terminator in length.
5234  *
5235  * XXX:         Need to support the check-as uid argument
5236  */
5237 int
5238 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5239 {
5240         struct accessx_descriptor *input = NULL;
5241         errno_t *result = NULL;
5242         errno_t error = 0;
5243         int wantdelete = 0;
5244         unsigned int desc_max, desc_actual, i, j;
5245         struct vfs_context context;
5246         struct nameidata nd;
5247         int niopts;
5248         vnode_t vp = NULL;
5249         vnode_t dvp = NULL;
5250 #define ACCESSX_MAX_DESCR_ON_STACK 10
5251         struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5252
5253         context.vc_ucred = NULL;
5254
5255         /*
5256          * Validate parameters; if valid, copy the descriptor array and string
5257          * arguments into local memory.  Before proceeding, the following
5258          * conditions must have been met:
5259          *
5260          * o    The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5261          * o    There must be sufficient room in the request for at least one
5262          *      descriptor and a one yte NUL terminated string.
5263          * o    The allocation of local storage must not fail.
5264          */
5265         if (uap->size > ACCESSX_MAX_TABLESIZE)
5266                 return(ENOMEM);
5267         if (uap->size < (sizeof(struct accessx_descriptor) + 2))
5268                 return(EINVAL);
5269         if (uap->size <= sizeof (stack_input)) {
5270                 input = stack_input;
5271         } else {
5272         MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5273         if (input == NULL) {
5274                 error = ENOMEM;
5275                 goto out;
5276         }
5277         }
5278         error = copyin(uap->entries, input, uap->size);
5279         if (error)
5280                 goto out;
5281
5282         AUDIT_ARG(opaque, input, uap->size);
5283
5284         /*
5285          * Force NUL termination of the copyin buffer to avoid nami() running
5286          * off the end.  If the caller passes us bogus data, they may get a
5287          * bogus result.
5288          */
5289         ((char *)input)[uap->size - 1] = 0;
5290
5291         /*
5292          * Access is defined as checking against the process' real identity,
5293          * even if operations are checking the effective identity.  This
5294          * requires that we use a local vfs context.
5295          */
5296         context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5297         context.vc_thread = current_thread();
5298
5299         /*
5300          * Find out how many entries we have, so we can allocate the result
5301          * array by walking the list and adjusting the count downward by the
5302          * earliest string offset we see.
5303          */
5304         desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5305         desc_actual = desc_max;
5306         for (i = 0; i < desc_actual; i++) {
5307                 /*
5308                  * Take the offset to the name string for this entry and
5309                  * convert to an input array index, which would be one off
5310                  * the end of the array if this entry was the lowest-addressed
5311                  * name string.
5312                  */
5313                 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5314
5315                 /*
5316                  * An offset greater than the max allowable offset is an error.
5317                  * It is also an error for any valid entry to point
5318                  * to a location prior to the end of the current entry, if
5319                  * it's not a reference to the string of the previous entry.
5320                  */
5321                 if (j > desc_max || (j != 0 && j <= i)) {
5322                         error = EINVAL;
5323                         goto out;
5324                 }
5325
5326                 /* Also do not let ad_name_offset point to something beyond the size of the input */
5327                 if (input[i].ad_name_offset >= uap->size) {
5328                         error = EINVAL;
5329                         goto out;
5330                 }
5331
5332                 /*
5333                  * An offset of 0 means use the previous descriptor's offset;
5334                  * this is used to chain multiple requests for the same file
5335                  * to avoid multiple lookups.
5336                  */
5337                 if (j == 0) {
5338                         /* This is not valid for the first entry */
5339                         if (i == 0) {
5340                                 error = EINVAL;
5341                                 goto out;
5342                         }
5343                         continue;
5344                 }
5345
5346                 /*
5347                  * If the offset of the string for this descriptor is before
5348                  * what we believe is the current actual last descriptor,
5349                  * then we need to adjust our estimate downward; this permits
5350                  * the string table following the last descriptor to be out
5351                  * of order relative to the descriptor list.
5352                  */
5353                 if (j < desc_actual)
5354                         desc_actual = j;
5355         }
5356
5357         /*
5358          * We limit the actual number of descriptors we are willing to process
5359          * to a hard maximum of ACCESSX_MAX_DESCRIPTORS.  If the number being
5360          * requested does not exceed this limit,
5361          */
5362         if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5363                 error = ENOMEM;
5364                 goto out;
5365         }
5366         MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
5367         if (result == NULL) {
5368                 error = ENOMEM;
5369                 goto out;
5370         }
5371
5372         /*
5373          * Do the work by iterating over the descriptor entries we know to
5374          * at least appear to contain valid data.
5375          */
5376         error = 0;
5377         for (i = 0; i < desc_actual; i++) {
5378                 /*
5379                  * If the ad_name_offset is 0, then we use the previous
5380                  * results to make the check; otherwise, we are looking up
5381                  * a new file name.
5382                  */
5383                 if (input[i].ad_name_offset != 0) {
5384                         /* discard old vnodes */
5385                         if (vp) {
5386                                 vnode_put(vp);
5387                                 vp = NULL;
5388                         }
5389                         if (dvp) {
5390                                 vnode_put(dvp);
5391                                 dvp = NULL;
5392                         }
5393
5394                         /*
5395                          * Scan forward in the descriptor list to see if we
5396                          * need the parent vnode.  We will need it if we are
5397                          * deleting, since we must have rights  to remove
5398                          * entries in the parent directory, as well as the
5399                          * rights to delete the object itself.
5400                          */
5401                         wantdelete = input[i].ad_flags & _DELETE_OK;
5402                         for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
5403                                 if (input[j].ad_flags & _DELETE_OK)
5404                                         wantdelete = 1;
5405
5406                         niopts = FOLLOW | AUDITVNPATH1;
5407
5408                         /* need parent for vnode_authorize for deletion test */
5409                         if (wantdelete)
5410                                 niopts |= WANTPARENT;
5411
5412                         /* do the lookup */
5413                         NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5414                                CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5415                                &context);
5416                         error = namei(&nd);
5417                         if (!error) {
5418                                 vp = nd.ni_vp;
5419                                 if (wantdelete)
5420                                         dvp = nd.ni_dvp;
5421                         }
5422                         nameidone(&nd);
5423                 }
5424
5425                 /*
5426                  * Handle lookup errors.
5427                  */
5428                 switch(error) {
5429                 case ENOENT:
5430                 case EACCES:
5431                 case EPERM:
5432                 case ENOTDIR:
5433                         result[i] = error;
5434                         break;
5435                 case 0:
5436                         /* run this access check */
5437                         result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5438                         break;
5439                 default:
5440                         /* fatal lookup error */
5441
5442                         goto out;
5443                 }
5444         }
5445
5446         AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5447
5448         /* copy out results */
5449         error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5450
5451 out:
5452         if (input && input != stack_input)
5453                 FREE(input, M_TEMP);
5454         if (result)
5455                 FREE(result, M_TEMP);
5456         if (vp)
5457                 vnode_put(vp);
5458         if (dvp)
5459                 vnode_put(dvp);
5460         if (IS_VALID_CRED(context.vc_ucred))
5461                 kauth_cred_unref(&context.vc_ucred);
5462         return(error);
5463 }
5464
5465
5466 /*
5467  * Returns:     0                       Success
5468  *              namei:EFAULT            Bad address
5469  *              namei:ENAMETOOLONG      Filename too long
5470  *              namei:ENOENT            No such file or directory
5471  *              namei:ELOOP             Too many levels of symbolic links
5472  *              namei:EBADF             Bad file descriptor
5473  *              namei:ENOTDIR           Not a directory
5474  *              namei:???
5475  *              access1:
5476  */
5477 static int
5478 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5479     int flag, enum uio_seg segflg)
5480 {
5481         int error;
5482         struct nameidata nd;
5483         int niopts;
5484         struct vfs_context context;
5485 #if NAMEDRSRCFORK
5486         int is_namedstream = 0;
5487 #endif
5488
5489         /*
5490          * Unless the AT_EACCESS option is used, Access is defined as checking
5491          * against the process' real identity, even if operations are checking
5492          * the effective identity.  So we need to tweak the credential
5493          * in the context for that case.
5494          */
5495         if (!(flag & AT_EACCESS))
5496                 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5497         else
5498                 context.vc_ucred = ctx->vc_ucred;
5499         context.vc_thread = ctx->vc_thread;
5500
5501
5502         niopts = FOLLOW | AUDITVNPATH1;
5503         /* need parent for vnode_authorize for deletion test */
5504         if (amode & _DELETE_OK)
5505                 niopts |= WANTPARENT;
5506         NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5507                path, &context);
5508
5509 #if NAMEDRSRCFORK
5510         /* access(F_OK) calls are allowed for resource forks. */
5511         if (amode == F_OK)
5512                 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5513 #endif
5514         error = nameiat(&nd, fd);
5515         if (error)
5516                 goto out;
5517
5518 #if NAMEDRSRCFORK
5519         /* Grab reference on the shadow stream file vnode to
5520          * force an inactive on release which will mark it
5521          * for recycle.
5522          */
5523         if (vnode_isnamedstream(nd.ni_vp) &&
5524             (nd.ni_vp->v_parent != NULLVP) &&
5525             vnode_isshadow(nd.ni_vp)) {
5526                 is_namedstream = 1;
5527                 vnode_ref(nd.ni_vp);
5528         }
5529 #endif
5530
5531         error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5532
5533 #if NAMEDRSRCFORK
5534         if (is_namedstream) {
5535                 vnode_rele(nd.ni_vp);
5536         }
5537 #endif
5538
5539         vnode_put(nd.ni_vp);
5540         if (amode & _DELETE_OK)
5541                 vnode_put(nd.ni_dvp);
5542         nameidone(&nd);
5543
5544 out:
5545         if (!(flag & AT_EACCESS))
5546                 kauth_cred_unref(&context.vc_ucred);
5547         return (error);
5548 }
5549
5550 int
5551 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5552 {
5553         return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5554             uap->path, uap->flags, 0, UIO_USERSPACE));
5555 }
5556
5557 int
5558 faccessat(__unused proc_t p, struct faccessat_args *uap,
5559           __unused int32_t *retval)
5560 {
5561         if (uap->flag & ~AT_EACCESS)
5562                 return (EINVAL);
5563
5564         return (faccessat_internal(vfs_context_current(), uap->fd,
5565             uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5566 }
5567
5568 /*
5569  * Returns:     0                       Success
5570  *              EFAULT
5571  *      copyout:EFAULT
5572  *      namei:???
5573  *      vn_stat:???
5574  */
5575 static int
5576 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5577     user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5578     enum uio_seg segflg, int fd, int flag)
5579 {
5580         struct nameidata nd;
5581         int follow;
5582         union {
5583                 struct stat sb;
5584                 struct stat64 sb64;
5585         } source = {};
5586         union {
5587                 struct user64_stat user64_sb;
5588                 struct user32_stat user32_sb;
5589                 struct user64_stat64 user64_sb64;
5590                 struct user32_stat64 user32_sb64;
5591         } dest = {};
5592         caddr_t sbp;
5593         int error, my_size;
5594         kauth_filesec_t fsec;
5595         size_t xsecurity_bufsize;
5596         void * statptr;
5597
5598         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5599         NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5600             segflg, path, ctx);
5601
5602 #if NAMEDRSRCFORK
5603         int is_namedstream = 0;
5604         /* stat calls are allowed for resource forks. */
5605         nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5606 #endif
5607         error = nameiat(&nd, fd);
5608         if (error)
5609                 return (error);
5610         fsec = KAUTH_FILESEC_NONE;
5611
5612         statptr = (void *)&source;
5613
5614 #if NAMEDRSRCFORK
5615         /* Grab reference on the shadow stream file vnode to
5616          * force an inactive on release which will mark it
5617          * for recycle.
5618          */
5619         if (vnode_isnamedstream(nd.ni_vp) &&
5620             (nd.ni_vp->v_parent != NULLVP) &&
5621             vnode_isshadow(nd.ni_vp)) {
5622                 is_namedstream = 1;
5623                 vnode_ref(nd.ni_vp);
5624         }
5625 #endif
5626
5627         error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5628
5629 #if NAMEDRSRCFORK
5630         if (is_namedstream) {
5631                 vnode_rele(nd.ni_vp);
5632         }
5633 #endif
5634         vnode_put(nd.ni_vp);
5635         nameidone(&nd);
5636
5637         if (error)
5638                 return (error);
5639         /* Zap spare fields */
5640         if (isstat64 != 0) {
5641                 source.sb64.st_lspare = 0;
5642                 source.sb64.st_qspare[0] = 0LL;
5643                 source.sb64.st_qspare[1] = 0LL;
5644                 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5645                         munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5646                         my_size = sizeof(dest.user64_sb64);
5647                         sbp = (caddr_t)&dest.user64_sb64;
5648                 } else {
5649                         munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5650                         my_size = sizeof(dest.user32_sb64);
5651                         sbp = (caddr_t)&dest.user32_sb64;
5652                 }
5653                 /*
5654                  * Check if we raced (post lookup) against the last unlink of a file.
5655                  */
5656                 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5657                         source.sb64.st_nlink = 1;
5658                 }
5659         } else {
5660                 source.sb.st_lspare = 0;
5661                 source.sb.st_qspare[0] = 0LL;
5662                 source.sb.st_qspare[1] = 0LL;
5663                 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5664                         munge_user64_stat(&source.sb, &dest.user64_sb);
5665                         my_size = sizeof(dest.user64_sb);
5666                         sbp = (caddr_t)&dest.user64_sb;
5667                 } else {
5668                         munge_user32_stat(&source.sb, &dest.user32_sb);
5669                         my_size = sizeof(dest.user32_sb);
5670                         sbp = (caddr_t)&dest.user32_sb;
5671                 }
5672
5673                 /*
5674                  * Check if we raced (post lookup) against the last unlink of a file.
5675                  */
5676                 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5677                         source.sb.st_nlink = 1;
5678                 }
5679         }
5680         if ((error = copyout(sbp, ub, my_size)) != 0)
5681                 goto out;
5682
5683         /* caller wants extended security information? */
5684         if (xsecurity != USER_ADDR_NULL) {
5685
5686                 /* did we get any? */
5687                 if (fsec == KAUTH_FILESEC_NONE) {
5688                         if (susize(xsecurity_size, 0) != 0) {
5689                                 error = EFAULT;
5690                                 goto out;
5691                         }
5692                 } else {
5693                         /* find the user buffer size */
5694                         xsecurity_bufsize = fusize(xsecurity_size);
5695
5696                         /* copy out the actual data size */
5697                         if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5698                                 error = EFAULT;
5699                                 goto out;
5700                         }
5701
5702                         /* if the caller supplied enough room, copy out to it */
5703                         if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5704                                 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5705                 }
5706         }
5707 out:
5708         if (fsec != KAUTH_FILESEC_NONE)
5709                 kauth_filesec_free(fsec);
5710         return (error);
5711 }
5712
5713 /*
5714  * stat_extended: Get file status; with extended security (ACL).
5715  *
5716  * Parameters:    p                       (ignored)
5717  *                uap                     User argument descriptor (see below)
5718  *                retval                  (ignored)
5719  *
5720  * Indirect:      uap->path               Path of file to get status from
5721  *                uap->ub                 User buffer (holds file status info)
5722  *                uap->xsecurity          ACL to get (extended security)
5723  *                uap->xsecurity_size     Size of ACL
5724  *
5725  * Returns:        0                      Success
5726  *                !0                      errno value
5727  *
5728  */
5729 int
5730 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5731     __unused int32_t *retval)
5732 {
5733         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5734             uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5735             0));
5736 }
5737
5738 /*
5739  * Returns:     0                       Success
5740  *      fstatat_internal:???            [see fstatat_internal() in this file]
5741  */
5742 int
5743 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5744 {
5745         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5746             0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5747 }
5748
5749 int
5750 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5751 {
5752         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5753             0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5754 }
5755
5756 /*
5757  * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5758  *
5759  * Parameters:    p                       (ignored)
5760  *                uap                     User argument descriptor (see below)
5761  *                retval                  (ignored)
5762  *
5763  * Indirect:      uap->path               Path of file to get status from
5764  *                uap->ub                 User buffer (holds file status info)
5765  *                uap->xsecurity          ACL to get (extended security)
5766  *                uap->xsecurity_size     Size of ACL
5767  *
5768  * Returns:        0                      Success
5769  *                !0                      errno value
5770  *
5771  */
5772 int
5773 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5774 {
5775         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5776             uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5777             0));
5778 }
5779
5780 /*
5781  * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5782  *
5783  * Parameters:    p                       (ignored)
5784  *                uap                     User argument descriptor (see below)
5785  *                retval                  (ignored)
5786  *
5787  * Indirect:      uap->path               Path of file to get status from
5788  *                uap->ub                 User buffer (holds file status info)
5789  *                uap->xsecurity          ACL to get (extended security)
5790  *                uap->xsecurity_size     Size of ACL
5791  *
5792  * Returns:        0                      Success
5793  *                !0                      errno value
5794  *
5795  */
5796 int
5797 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5798 {
5799         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5800             uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5801             AT_SYMLINK_NOFOLLOW));
5802 }
5803
5804 /*
5805  * Get file status; this version does not follow links.
5806  */
5807 int
5808 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5809 {
5810         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5811             0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5812 }
5813
5814 int
5815 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5816 {
5817         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5818             0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5819 }
5820
5821 /*
5822  * lstat64_extended: Get file status; can handle large inode numbers; does not
5823  * follow links; with extended security (ACL).
5824  *
5825  * Parameters:    p                       (ignored)
5826  *                uap                     User argument descriptor (see below)
5827  *                retval                  (ignored)
5828  *
5829  * Indirect:      uap->path               Path of file to get status from
5830  *                uap->ub                 User buffer (holds file status info)
5831  *                uap->xsecurity          ACL to get (extended security)
5832  *                uap->xsecurity_size     Size of ACL
5833  *
5834  * Returns:        0                      Success
5835  *                !0                      errno value
5836  *
5837  */
5838 int
5839 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5840 {
5841         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5842             uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5843             AT_SYMLINK_NOFOLLOW));
5844 }
5845
5846 int
5847 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5848 {
5849         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5850                 return (EINVAL);
5851
5852         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5853             0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5854 }
5855
5856 int
5857 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5858     __unused int32_t *retval)
5859 {
5860         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5861                 return (EINVAL);
5862
5863         return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5864             0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5865 }
5866
5867 /*
5868  * Get configurable pathname variables.
5869  *
5870  * Returns:     0                       Success
5871  *      namei:???
5872  *      vn_pathconf:???
5873  *
5874  * Notes:       Global implementation  constants are intended to be
5875  *              implemented in this function directly; all other constants
5876  *              are per-FS implementation, and therefore must be handled in
5877  *              each respective FS, instead.
5878  *
5879  * XXX We implement some things globally right now that should actually be
5880  * XXX per-FS; we will need to deal with this at some point.
5881  */
5882 /* ARGSUSED */
5883 int
5884 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5885 {
5886         int error;
5887         struct nameidata nd;
5888         vfs_context_t ctx = vfs_context_current();
5889
5890         NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5891                 UIO_USERSPACE, uap->path, ctx);
5892         error = namei(&nd);
5893         if (error)
5894                 return (error);
5895
5896         error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5897
5898         vnode_put(nd.ni_vp);
5899         nameidone(&nd);
5900         return (error);
5901 }
5902
5903 /*
5904  * Return target name of a symbolic link.
5905  */
5906 /* ARGSUSED */
5907 static int
5908 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
5909     enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
5910     int *retval)
5911 {
5912         vnode_t vp;
5913         uio_t auio;
5914         int error;
5915         struct nameidata nd;
5916         char uio_buf[ UIO_SIZEOF(1) ];
5917
5918         NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
5919             seg, path, ctx);
5920
5921         error = nameiat(&nd, fd);
5922         if (error)
5923                 return (error);
5924         vp = nd.ni_vp;
5925
5926         nameidone(&nd);
5927
5928         auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
5929                                     &uio_buf[0], sizeof(uio_buf));
5930         uio_addiov(auio, buf, bufsize);
5931         if (vp->v_type != VLNK) {
5932                 error = EINVAL;
5933         } else {
5934 #if CONFIG_MACF
5935                 error = mac_vnode_check_readlink(ctx, vp);
5936 #endif
5937                 if (error == 0)
5938                         error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
5939                                                 ctx);
5940                 if (error == 0)
5941                         error = VNOP_READLINK(vp, auio, ctx);
5942         }
5943         vnode_put(vp);
5944
5945         *retval = bufsize - (int)uio_resid(auio);
5946         return (error);
5947 }
5948
5949 int
5950 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5951 {
5952         enum uio_seg procseg;
5953
5954         procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5955         return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5956             CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5957             uap->count, procseg, retval));
5958 }
5959
5960 int
5961 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5962 {
5963         enum uio_seg procseg;
5964
5965         procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5966         return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5967             procseg, uap->buf, uap->bufsize, procseg, retval));
5968 }
5969
5970 /*
5971  * Change file flags.
5972  *
5973  * NOTE: this will vnode_put() `vp'
5974  */
5975 static int
5976 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
5977 {
5978         struct vnode_attr va;
5979         kauth_action_t action;
5980         int error;
5981
5982         VATTR_INIT(&va);
5983         VATTR_SET(&va, va_flags, flags);
5984
5985 #if CONFIG_MACF
5986         error = mac_vnode_check_setflags(ctx, vp, flags);
5987         if (error)
5988                 goto out;
5989 #endif
5990
5991         /* request authorisation, disregard immutability */
5992         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5993                 goto out;
5994         /*
5995          * Request that the auth layer disregard those file flags it's allowed to when
5996          * authorizing this operation; we need to do this in order to be able to
5997          * clear immutable flags.
5998          */
5999         if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
6000                 goto out;
6001         error = vnode_setattr(vp, &va, ctx);
6002
6003 #if CONFIG_MACF
6004         if (error == 0)
6005                 mac_vnode_notify_setflags(ctx, vp, flags);
6006 #endif
6007
6008         if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
6009                 error = ENOTSUP;
6010         }
6011 out:
6012         vnode_put(vp);
6013         return(error);
6014 }
6015
6016 /*
6017  * Change flags of a file given a path name.
6018  */
6019 /* ARGSUSED */
6020 int
6021 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
6022 {
6023         vnode_t vp;
6024         vfs_context_t ctx = vfs_context_current();
6025         int error;
6026         struct nameidata nd;
6027
6028         AUDIT_ARG(fflags, uap->flags);
6029         NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6030                 UIO_USERSPACE, uap->path, ctx);
6031         error = namei(&nd);
6032         if (error)
6033                 return (error);
6034         vp = nd.ni_vp;
6035         nameidone(&nd);
6036
6037         /* we don't vnode_put() here because chflags1 does internally */
6038         error = chflags1(vp, uap->flags, ctx);
6039
6040         return(error);
6041 }
6042
6043 /*
6044  * Change flags of a file given a file descriptor.
6045  */
6046 /* ARGSUSED */
6047 int
6048 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
6049 {
6050         vnode_t vp;
6051         int error;
6052
6053         AUDIT_ARG(fd, uap->fd);
6054         AUDIT_ARG(fflags, uap->flags);
6055         if ( (error = file_vnode(uap->fd, &vp)) )
6056                 return (error);
6057
6058         if ((error = vnode_getwithref(vp))) {
6059                 file_drop(uap->fd);
6060                 return(error);
6061         }
6062
6063         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6064
6065         /* we don't vnode_put() here because chflags1 does internally */
6066         error = chflags1(vp, uap->flags, vfs_context_current());
6067
6068         file_drop(uap->fd);
6069         return (error);
6070 }
6071
6072 /*
6073  * Change security information on a filesystem object.
6074  *
6075  * Returns:     0                       Success
6076  *              EPERM                   Operation not permitted
6077  *              vnode_authattr:???      [anything vnode_authattr can return]
6078  *              vnode_authorize:???     [anything vnode_authorize can return]
6079  *              vnode_setattr:???       [anything vnode_setattr can return]
6080  *
6081  * Notes:       If vnode_authattr or vnode_authorize return EACCES, it will be
6082  *              translated to EPERM before being returned.
6083  */
6084 static int
6085 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
6086 {
6087         kauth_action_t action;
6088         int error;
6089
6090         AUDIT_ARG(mode, vap->va_mode);
6091         /* XXX audit new args */
6092
6093 #if NAMEDSTREAMS
6094         /* chmod calls are not allowed for resource forks. */
6095         if (vp->v_flag & VISNAMEDSTREAM) {
6096                 return (EPERM);
6097         }
6098 #endif
6099
6100 #if CONFIG_MACF
6101         if (VATTR_IS_ACTIVE(vap, va_mode) &&
6102             (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
6103                 return (error);
6104
6105         if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
6106                 if ((error = mac_vnode_check_setowner(ctx, vp,
6107                     VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6108                     VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1)))
6109                         return (error);
6110         }
6111
6112         if (VATTR_IS_ACTIVE(vap, va_acl) &&
6113             (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl)))
6114                 return (error);
6115 #endif
6116
6117         /* make sure that the caller is allowed to set this security information */
6118         if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
6119             ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6120                 if (error == EACCES)
6121                         error = EPERM;
6122                 return(error);
6123         }
6124
6125         if ((error = vnode_setattr(vp, vap, ctx)) != 0)
6126                 return (error);
6127
6128 #if CONFIG_MACF
6129         if (VATTR_IS_ACTIVE(vap, va_mode))
6130                 mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
6131
6132         if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))
6133                 mac_vnode_notify_setowner(ctx, vp,
6134                         VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
6135                         VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
6136
6137         if (VATTR_IS_ACTIVE(vap, va_acl))
6138                 mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
6139 #endif
6140
6141         return (error);
6142 }
6143
6144
6145 /*
6146  * Change mode of a file given a path name.
6147  *
6148  * Returns:     0                       Success
6149  *              namei:???               [anything namei can return]
6150  *              chmod_vnode:???         [anything chmod_vnode can return]
6151  */
6152 static int
6153 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
6154     int fd, int flag, enum uio_seg segflg)
6155 {
6156         struct nameidata nd;
6157         int follow, error;
6158
6159         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6160         NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
6161             segflg, path, ctx);
6162         if ((error = nameiat(&nd, fd)))
6163                 return (error);
6164         error = chmod_vnode(ctx, nd.ni_vp, vap);
6165         vnode_put(nd.ni_vp);
6166         nameidone(&nd);
6167         return(error);
6168 }
6169
6170 /*
6171  * chmod_extended: Change the mode of a file given a path name; with extended
6172  * argument list (including extended security (ACL)).
6173  *
6174  * Parameters:  p                       Process requesting the open
6175  *              uap                     User argument descriptor (see below)
6176  *              retval                  (ignored)
6177  *
6178  * Indirect:    uap->path               Path to object (same as 'chmod')
6179  *              uap->uid                UID to set
6180  *              uap->gid                GID to set
6181  *              uap->mode               File mode to set (same as 'chmod')
6182  *              uap->xsecurity          ACL to set (or delete)
6183  *
6184  * Returns:     0                       Success
6185  *              !0                      errno value
6186  *
6187  * Notes:       The kauth_filesec_t in 'va', if any, is in host byte order.
6188  *
6189  * XXX:         We should enummerate the possible errno values here, and where
6190  *              in the code they originated.
6191  */
6192 int
6193 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
6194 {
6195         int error;
6196         struct vnode_attr va;
6197         kauth_filesec_t xsecdst;
6198
6199         AUDIT_ARG(owner, uap->uid, uap->gid);
6200
6201         VATTR_INIT(&va);
6202         if (uap->mode != -1)
6203                 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6204         if (uap->uid != KAUTH_UID_NONE)
6205                 VATTR_SET(&va, va_uid, uap->uid);
6206         if (uap->gid != KAUTH_GID_NONE)
6207                 VATTR_SET(&va, va_gid, uap->gid);
6208
6209         xsecdst = NULL;
6210         switch(uap->xsecurity) {
6211                 /* explicit remove request */
6212         case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
6213                 VATTR_SET(&va, va_acl, NULL);
6214                 break;
6215                 /* not being set */
6216         case USER_ADDR_NULL:
6217                 break;
6218         default:
6219                 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6220                         return(error);
6221                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6222                 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
6223         }
6224
6225         error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
6226             UIO_USERSPACE);
6227
6228         if (xsecdst != NULL)
6229                 kauth_filesec_free(xsecdst);
6230         return(error);
6231 }
6232
6233 /*
6234  * Returns:     0                       Success
6235  *              chmodat:???             [anything chmodat can return]
6236  */
6237 static int
6238 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6239     int flag, enum uio_seg segflg)
6240 {
6241         struct vnode_attr va;
6242
6243         VATTR_INIT(&va);
6244         VATTR_SET(&va, va_mode, mode & ALLPERMS);
6245
6246         return (chmodat(ctx, path, &va, fd, flag, segflg));
6247 }
6248
6249 int
6250 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6251 {
6252         return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6253             AT_FDCWD, 0, UIO_USERSPACE));
6254 }
6255
6256 int
6257 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6258 {
6259         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6260                 return (EINVAL);
6261
6262         return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6263             uap->fd, uap->flag, UIO_USERSPACE));
6264 }
6265
6266 /*
6267  * Change mode of a file given a file descriptor.
6268  */
6269 static int
6270 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6271 {
6272         vnode_t vp;
6273         int error;
6274
6275         AUDIT_ARG(fd, fd);
6276
6277         if ((error = file_vnode(fd, &vp)) != 0)
6278                 return (error);
6279         if ((error = vnode_getwithref(vp)) != 0) {
6280                 file_drop(fd);
6281                 return(error);
6282         }
6283         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6284
6285         error = chmod_vnode(vfs_context_current(), vp, vap);
6286         (void)vnode_put(vp);
6287         file_drop(fd);
6288
6289         return (error);
6290 }
6291
6292 /*
6293  * fchmod_extended: Change mode of a file given a file descriptor; with
6294  * extended argument list (including extended security (ACL)).
6295  *
6296  * Parameters:    p                       Process requesting to change file mode
6297  *                uap                     User argument descriptor (see below)
6298  *                retval                  (ignored)
6299  *
6300  * Indirect:      uap->mode               File mode to set (same as 'chmod')
6301  *                uap->uid                UID to set
6302  *                uap->gid                GID to set
6303  *                uap->xsecurity          ACL to set (or delete)
6304  *                uap->fd                 File descriptor of file to change mode
6305  *
6306  * Returns:        0                      Success
6307  *                !0                      errno value
6308  *
6309  */
6310 int
6311 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6312 {
6313         int error;
6314         struct vnode_attr va;
6315         kauth_filesec_t xsecdst;
6316
6317         AUDIT_ARG(owner, uap->uid, uap->gid);
6318
6319         VATTR_INIT(&va);
6320         if (uap->mode != -1)
6321                 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6322         if (uap->uid != KAUTH_UID_NONE)
6323                 VATTR_SET(&va, va_uid, uap->uid);
6324         if (uap->gid != KAUTH_GID_NONE)
6325                 VATTR_SET(&va, va_gid, uap->gid);
6326
6327         xsecdst = NULL;
6328         switch(uap->xsecurity) {
6329         case USER_ADDR_NULL:
6330                 VATTR_SET(&va, va_acl, NULL);
6331                 break;
6332         case CAST_USER_ADDR_T((void *)1):       /* _FILESEC_REMOVE_ACL */
6333                 VATTR_SET(&va, va_acl, NULL);
6334                 break;
6335                 /* not being set */
6336         case CAST_USER_ADDR_T(-1):
6337                 break;
6338         default:
6339                 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6340                         return(error);
6341                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6342         }
6343
6344         error = fchmod1(p, uap->fd, &va);
6345
6346
6347         switch(uap->xsecurity) {
6348         case USER_ADDR_NULL:
6349         case CAST_USER_ADDR_T(-1):
6350                 break;
6351         default:
6352                 if (xsecdst != NULL)
6353                         kauth_filesec_free(xsecdst);
6354         }
6355         return(error);
6356 }
6357
6358 int
6359 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6360 {
6361         struct vnode_attr va;
6362
6363         VATTR_INIT(&va);
6364         VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6365
6366         return(fchmod1(p, uap->fd, &va));
6367 }
6368
6369
6370 /*
6371  * Set ownership given a path name.
6372  */
6373 /* ARGSUSED */
6374 static int
6375 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6376    gid_t gid, int flag, enum uio_seg segflg)
6377 {
6378         vnode_t vp;
6379         struct vnode_attr va;
6380         int error;
6381         struct nameidata nd;
6382         int follow;
6383         kauth_action_t action;
6384
6385         AUDIT_ARG(owner, uid, gid);
6386
6387         follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6388         NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6389             path, ctx);
6390         error = nameiat(&nd, fd);
6391         if (error)
6392                 return (error);
6393         vp = nd.ni_vp;
6394
6395         nameidone(&nd);
6396
6397         VATTR_INIT(&va);
6398         if (uid != (uid_t)VNOVAL)
6399                 VATTR_SET(&va, va_uid, uid);
6400         if (gid != (gid_t)VNOVAL)
6401                 VATTR_SET(&va, va_gid, gid);
6402
6403 #if CONFIG_MACF
6404         error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6405         if (error)
6406                 goto out;
6407 #endif
6408
6409         /* preflight and authorize attribute changes */
6410         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6411                 goto out;
6412         if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6413                 goto out;
6414         error = vnode_setattr(vp, &va, ctx);
6415
6416 #if CONFIG_MACF
6417         if (error == 0)
6418                 mac_vnode_notify_setowner(ctx, vp, uid, gid);
6419 #endif
6420
6421 out:
6422         /*
6423          * EACCES is only allowed from namei(); permissions failure should
6424          * return EPERM, so we need to translate the error code.
6425          */
6426         if (error == EACCES)
6427                 error = EPERM;
6428
6429         vnode_put(vp);
6430         return (error);
6431 }
6432
6433 int
6434 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6435 {
6436         return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6437             uap->uid, uap->gid, 0, UIO_USERSPACE));
6438 }
6439
6440 int
6441 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6442 {
6443         return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6444             uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6445 }
6446
6447 int
6448 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6449 {
6450         if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6451                 return (EINVAL);
6452
6453         return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6454             uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6455 }
6456
6457 /*
6458  * Set ownership given a file descriptor.
6459  */
6460 /* ARGSUSED */
6461 int
6462 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
6463 {
6464         struct vnode_attr va;
6465         vfs_context_t ctx = vfs_context_current();
6466         vnode_t vp;
6467         int error;
6468         kauth_action_t action;
6469
6470         AUDIT_ARG(owner, uap->uid, uap->gid);
6471         AUDIT_ARG(fd, uap->fd);
6472
6473         if ( (error = file_vnode(uap->fd, &vp)) )
6474                 return (error);
6475
6476         if ( (error = vnode_getwithref(vp)) ) {
6477                 file_drop(uap->fd);
6478                 return(error);
6479         }
6480         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6481
6482         VATTR_INIT(&va);
6483         if (uap->uid != VNOVAL)
6484                 VATTR_SET(&va, va_uid, uap->uid);
6485         if (uap->gid != VNOVAL)
6486                 VATTR_SET(&va, va_gid, uap->gid);
6487
6488 #if NAMEDSTREAMS
6489         /* chown calls are not allowed for resource forks. */
6490         if (vp->v_flag & VISNAMEDSTREAM) {
6491                 error = EPERM;
6492                 goto out;
6493         }
6494 #endif
6495
6496 #if CONFIG_MACF
6497         error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
6498         if (error)
6499                 goto out;
6500 #endif
6501
6502         /* preflight and authorize attribute changes */
6503         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6504                 goto out;
6505         if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6506                 if (error == EACCES)
6507                         error = EPERM;
6508                 goto out;
6509         }
6510         error = vnode_setattr(vp, &va, ctx);
6511
6512 #if CONFIG_MACF
6513         if (error == 0)
6514                 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
6515 #endif
6516
6517 out:
6518         (void)vnode_put(vp);
6519         file_drop(uap->fd);
6520         return (error);
6521 }
6522
6523 static int
6524 getutimes(user_addr_t usrtvp, struct timespec *tsp)
6525 {
6526         int error;
6527
6528         if (usrtvp == USER_ADDR_NULL) {
6529                 struct timeval old_tv;
6530                 /* XXX Y2038 bug because of microtime argument */
6531                 microtime(&old_tv);
6532                 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6533                 tsp[1] = tsp[0];
6534         } else {
6535                 if (IS_64BIT_PROCESS(current_proc())) {
6536                         struct user64_timeval tv[2];
6537                         error = copyin(usrtvp, (void *)tv, sizeof(tv));
6538                         if (error)
6539                                 return (error);
6540                         TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6541                         TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6542                 } else {
6543                         struct user32_timeval tv[2];
6544                         error = copyin(usrtvp, (void *)tv, sizeof(tv));
6545                         if (error)
6546                                 return (error);
6547                         TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6548                         TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6549                 }
6550         }
6551         return 0;
6552 }
6553
6554 static int
6555 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
6556         int nullflag)
6557 {
6558         int error;
6559         struct vnode_attr va;
6560         kauth_action_t action;
6561
6562         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6563
6564         VATTR_INIT(&va);
6565         VATTR_SET(&va, va_access_time, ts[0]);
6566         VATTR_SET(&va, va_modify_time, ts[1]);
6567         if (nullflag)
6568                 va.va_vaflags |= VA_UTIMES_NULL;
6569
6570 #if NAMEDSTREAMS
6571         /* utimes calls are not allowed for resource forks. */
6572         if (vp->v_flag & VISNAMEDSTREAM) {
6573                 error = EPERM;
6574                 goto out;
6575         }
6576 #endif
6577
6578 #if CONFIG_MACF
6579         error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
6580         if (error)
6581                 goto out;
6582 #endif
6583         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6584                 if (!nullflag && error == EACCES)
6585                         error = EPERM;
6586                 goto out;
6587         }
6588
6589         /* since we may not need to auth anything, check here */
6590         if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6591                 if (!nullflag && error == EACCES)
6592                         error = EPERM;
6593                 goto out;
6594         }
6595         error = vnode_setattr(vp, &va, ctx);
6596
6597 #if CONFIG_MACF
6598         if (error == 0)
6599                 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
6600 #endif
6601
6602 out:
6603         return error;
6604 }
6605
6606 /*
6607  * Set the access and modification times of a file.
6608  */
6609 /* ARGSUSED */
6610 int
6611 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
6612 {
6613         struct timespec ts[2];
6614         user_addr_t usrtvp;
6615         int error;
6616         struct nameidata nd;
6617         vfs_context_t ctx = vfs_context_current();
6618
6619         /*
6620          * AUDIT: Needed to change the order of operations to do the
6621          * name lookup first because auditing wants the path.
6622          */
6623         NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6624                 UIO_USERSPACE, uap->path, ctx);
6625         error = namei(&nd);
6626         if (error)
6627                 return (error);
6628         nameidone(&nd);
6629
6630         /*
6631          * Fetch the user-supplied time.  If usrtvp is USER_ADDR_NULL, we fetch
6632          * the current time instead.
6633          */
6634         usrtvp = uap->tptr;
6635         if ((error = getutimes(usrtvp, ts)) != 0)
6636                 goto out;
6637
6638         error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
6639
6640 out:
6641         vnode_put(nd.ni_vp);
6642         return (error);
6643 }
6644
6645 /*
6646  * Set the access and modification times of a file.
6647  */
6648 /* ARGSUSED */
6649 int
6650 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6651 {
6652         struct timespec ts[2];
6653         vnode_t vp;
6654         user_addr_t usrtvp;
6655         int error;
6656
6657         AUDIT_ARG(fd, uap->fd);
6658         usrtvp = uap->tptr;
6659         if ((error = getutimes(usrtvp, ts)) != 0)
6660                 return (error);
6661         if ((error = file_vnode(uap->fd, &vp)) != 0)
6662                 return (error);
6663         if((error = vnode_getwithref(vp))) {
6664                 file_drop(uap->fd);
6665                 return(error);
6666         }
6667
6668         error =  setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6669         vnode_put(vp);
6670         file_drop(uap->fd);
6671         return(error);
6672 }
6673
6674 /*
6675  * Truncate a file given its path name.
6676  */
6677 /* ARGSUSED */
6678 int
6679 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
6680 {
6681         vnode_t vp;
6682         struct vnode_attr va;
6683         vfs_context_t ctx = vfs_context_current();
6684         int error;
6685         struct nameidata nd;
6686         kauth_action_t action;
6687
6688         if (uap->length < 0)
6689                 return(EINVAL);
6690         NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
6691                 UIO_USERSPACE, uap->path, ctx);
6692         if ((error = namei(&nd)))
6693                 return (error);
6694         vp = nd.ni_vp;
6695
6696         nameidone(&nd);
6697
6698         VATTR_INIT(&va);
6699         VATTR_SET(&va, va_data_size, uap->length);
6700
6701 #if CONFIG_MACF
6702         error = mac_vnode_check_truncate(ctx, NOCRED, vp);
6703         if (error)
6704                 goto out;
6705 #endif
6706
6707         if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6708                 goto out;
6709         if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6710                 goto out;
6711         error = vnode_setattr(vp, &va, ctx);
6712
6713 #if CONFIG_MACF
6714         if (error == 0)
6715                 mac_vnode_notify_truncate(ctx, NOCRED, vp);
6716 #endif
6717
6718 out:
6719         vnode_put(vp);
6720         return (error);
6721 }
6722
6723 /*
6724  * Truncate a file given a file descriptor.
6725  */
6726 /* ARGSUSED */
6727 int
6728 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
6729 {
6730         vfs_context_t ctx = vfs_context_current();
6731         struct vnode_attr va;
6732         vnode_t vp;
6733         struct fileproc *fp;
6734         int error ;
6735         int fd = uap->fd;
6736
6737         AUDIT_ARG(fd, uap->fd);
6738         if (uap->length < 0)
6739                 return(EINVAL);
6740
6741         if ( (error = fp_lookup(p,fd,&fp,0)) ) {
6742                 return(error);
6743         }
6744
6745         switch (FILEGLOB_DTYPE(fp->f_fglob)) {
6746         case DTYPE_PSXSHM:
6747                 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
6748                 goto out;
6749         case DTYPE_VNODE:
6750                 break;
6751         default:
6752                 error = EINVAL;
6753                 goto out;
6754         }
6755
6756         vp = (vnode_t)fp->f_fglob->fg_data;
6757
6758         if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
6759                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
6760                 error = EINVAL;
6761                 goto out;
6762         }
6763
6764         if ((error = vnode_getwithref(vp)) != 0) {
6765                 goto out;
6766         }
6767
6768         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6769
6770 #if CONFIG_MACF
6771         error = mac_vnode_check_truncate(ctx,
6772             fp->f_fglob->fg_cred, vp);
6773         if (error) {
6774                 (void)vnode_put(vp);
6775                 goto out;
6776         }
6777 #endif
6778         VATTR_INIT(&va);
6779         VATTR_SET(&va, va_data_size, uap->length);
6780         error = vnode_setattr(vp, &va, ctx);
6781
6782 #if CONFIG_MACF
6783         if (error == 0)
6784                 mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
6785 #endif
6786
6787         (void)vnode_put(vp);
6788 out:
6789         file_drop(fd);
6790         return (error);
6791 }
6792
6793
6794 /*
6795  * Sync an open file with synchronized I/O _file_ integrity completion
6796  */
6797 /* ARGSUSED */
6798 int
6799 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6800 {
6801         __pthread_testcancel(1);
6802         return(fsync_common(p, uap, MNT_WAIT));
6803 }
6804
6805
6806 /*
6807  * Sync an open file with synchronized I/O _file_ integrity completion
6808  *
6809  * Notes:       This is a legacy support function that does not test for
6810  *              thread cancellation points.
6811  */
6812 /* ARGSUSED */
6813 int
6814 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6815 {
6816         return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6817 }
6818
6819
6820 /*
6821  * Sync an open file with synchronized I/O _data_ integrity completion
6822  */
6823 /* ARGSUSED */
6824 int
6825 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6826 {
6827         __pthread_testcancel(1);
6828         return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6829 }
6830
6831
6832 /*
6833  * fsync_common
6834  *
6835  * Common fsync code to support both synchronized I/O file integrity completion
6836  * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6837  *
6838  * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6839  * will only guarantee that the file data contents are retrievable.  If
6840  * 'flags' is MNT_WAIT, the caller is requesting file integrity, which also
6841  * includes additional metadata unnecessary for retrieving the file data
6842  * contents, such as atime, mtime, ctime, etc., also be committed to stable
6843  * storage.
6844  *
6845  * Parameters:  p                               The process
6846  *              uap->fd                         The descriptor to synchronize
6847  *              flags                           The data integrity flags
6848  *
6849  * Returns:     int                             Success
6850  *      fp_getfvp:EBADF                         Bad file descriptor
6851  *      fp_getfvp:ENOTSUP                       fd does not refer to a vnode
6852  *      VNOP_FSYNC:???                          unspecified
6853  *
6854  * Notes:       We use struct fsync_args because it is a short name, and all
6855  *              caller argument structures are otherwise identical.
6856  */
6857 static int
6858 fsync_common(proc_t p, struct fsync_args *uap, int flags)
6859 {
6860         vnode_t vp;
6861         struct fileproc *fp;
6862         vfs_context_t ctx = vfs_context_current();
6863         int error;
6864
6865         AUDIT_ARG(fd, uap->fd);
6866
6867         if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
6868                 return (error);
6869         if ( (error = vnode_getwithref(vp)) ) {
6870                 file_drop(uap->fd);
6871                 return(error);
6872         }
6873
6874         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6875
6876         error = VNOP_FSYNC(vp, flags, ctx);
6877
6878 #if NAMEDRSRCFORK
6879         /* Sync resource fork shadow file if necessary. */
6880         if ((error == 0) &&
6881             (vp->v_flag & VISNAMEDSTREAM) &&
6882             (vp->v_parent != NULLVP) &&
6883             vnode_isshadow(vp) &&
6884             (fp->f_flags & FP_WRITTEN)) {
6885                 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
6886         }
6887 #endif
6888
6889         (void)vnode_put(vp);
6890         file_drop(uap->fd);
6891         return (error);
6892 }
6893
6894 /*
6895  * Duplicate files.  Source must be a file, target must be a file or
6896  * must not exist.
6897  *
6898  * XXX Copyfile authorisation checking is woefully inadequate, and will not
6899  *     perform inheritance correctly.
6900  */
6901 /* ARGSUSED */
6902 int
6903 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
6904 {
6905         vnode_t tvp, fvp, tdvp, sdvp;
6906         struct nameidata fromnd, tond;
6907         int error;
6908         vfs_context_t ctx = vfs_context_current();
6909 #if CONFIG_MACF
6910         struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
6911         struct vnode_attr va;
6912 #endif
6913
6914         /* Check that the flags are valid. */
6915
6916         if (uap->flags & ~CPF_MASK) {
6917                 return(EINVAL);
6918         }
6919
6920         NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
6921                 UIO_USERSPACE, uap->from, ctx);
6922         if ((error = namei(&fromnd)))
6923                 return (error);
6924         fvp = fromnd.ni_vp;
6925
6926         NDINIT(&tond, CREATE, OP_LINK,
6927                LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6928                UIO_USERSPACE, uap->to, ctx);
6929         if ((error = namei(&tond))) {
6930                 goto out1;
6931         }
6932         tdvp = tond.ni_dvp;
6933         tvp = tond.ni_vp;
6934
6935         if (tvp != NULL) {
6936                 if (!(uap->flags & CPF_OVERWRITE)) {
6937                         error = EEXIST;
6938                         goto out;
6939                 }
6940         }
6941
6942         if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
6943                 error = EISDIR;
6944                 goto out;
6945         }
6946
6947         /* This calls existing MAC hooks for open  */
6948         if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
6949             NULL))) {
6950                 goto out;
6951         }
6952
6953         if (tvp) {
6954                 /*
6955                  * See unlinkat_internal for an explanation of the potential
6956                  * ENOENT from the MAC hook but the gist is that the MAC hook
6957                  * can fail because vn_getpath isn't able to return the full
6958                  * path. We choose to ignore this failure.
6959                  */
6960                 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
6961                 if (error && error != ENOENT)
6962                         goto out;
6963                 error = 0;
6964         }
6965
6966 #if CONFIG_MACF
6967         VATTR_INIT(&va);
6968         VATTR_SET(&va, va_type, fvp->v_type);
6969         /* Mask off all but regular access permissions */
6970         VATTR_SET(&va, va_mode,
6971             ((((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
6972         error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
6973         if (error)
6974                 goto out;
6975 #endif /* CONFIG_MACF */
6976
6977         if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
6978                 goto out;
6979
6980         if (fvp == tdvp)
6981                 error = EINVAL;
6982         /*
6983          * If source is the same as the destination (that is the
6984          * same inode number) then there is nothing to do.
6985          * (fixed to have POSIX semantics - CSM 3/2/98)
6986          */
6987         if (fvp == tvp)
6988                 error = -1;
6989         if (!error)
6990                 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
6991 out:
6992         sdvp = tond.ni_startdir;
6993         /*
6994          * nameidone has to happen before we vnode_put(tdvp)
6995          * since it may need to release the fs_nodelock on the tdvp
6996          */
6997         nameidone(&tond);
6998
6999         if (tvp)
7000                 vnode_put(tvp);
7001         vnode_put(tdvp);
7002         vnode_put(sdvp);
7003 out1:
7004         vnode_put(fvp);
7005
7006         nameidone(&fromnd);
7007
7008         if (error == -1)
7009                 return (0);
7010         return (error);
7011 }
7012
/*
 * NOTE(review): this switch is not referenced by the code immediately
 * around its definition.  Presumably it enables fallback handling for the
 * clone/snapshot paths defined elsewhere in this file -- confirm at the
 * use sites before changing its value.
 */
#define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
7014
7015 /*
7016  * Helper function for doing clones. The caller is expected to provide an
7017  * iocounted source vnode and release it.
7018  */
7019 static int
7020 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
7021     user_addr_t dst, uint32_t flags, vfs_context_t ctx)
7022 {
7023         vnode_t tvp, tdvp;
7024         struct nameidata tond;
7025         int error;
7026         int follow;
7027         boolean_t free_src_acl;
7028         boolean_t attr_cleanup;
7029         enum vtype v_type;
7030         kauth_action_t action;
7031         struct componentname *cnp;
7032         uint32_t defaulted;
7033         struct vnode_attr va;
7034         struct vnode_attr nva;
7035         uint32_t vnop_flags;
7036
7037         v_type = vnode_vtype(fvp);
7038         switch (v_type) {
7039         case VLNK:
7040                 /* FALLTHRU */
7041         case VREG:
7042                 action = KAUTH_VNODE_ADD_FILE;
7043                 break;
7044         case VDIR:
7045                 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
7046                     fvp->v_mountedhere) {
7047                         return (EINVAL);
7048                 }
7049                 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
7050                 break;
7051         default:
7052                 return (EINVAL);
7053         }
7054
7055         AUDIT_ARG(fd2, dst_dirfd);
7056         AUDIT_ARG(value32, flags);
7057
7058         follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7059         NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
7060             UIO_USERSPACE, dst, ctx);
7061         if ((error = nameiat(&tond, dst_dirfd)))
7062                 return (error);
7063         cnp = &tond.ni_cnd;
7064         tdvp = tond.ni_dvp;
7065         tvp = tond.ni_vp;
7066
7067         free_src_acl = FALSE;
7068         attr_cleanup = FALSE;
7069
7070         if (tvp != NULL) {
7071                 error = EEXIST;
7072                 goto out;
7073         }
7074
7075         if (vnode_mount(tdvp) != vnode_mount(fvp)) {
7076                 error = EXDEV;
7077                 goto out;
7078         }
7079
7080 #if CONFIG_MACF
7081         if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp)))
7082                 goto out;
7083 #endif
7084         if ((error = vnode_authorize(tdvp, NULL, action, ctx)))
7085                 goto out;
7086
7087         action = KAUTH_VNODE_GENERIC_READ_BITS;
7088         if (data_read_authorised)
7089                 action &= ~KAUTH_VNODE_READ_DATA;
7090         if ((error = vnode_authorize(fvp, NULL, action, ctx)))
7091                 goto out;
7092
7093         /*
7094          * certain attributes may need to be changed from the source, we ask for
7095          * those here.
7096          */
7097         VATTR_INIT(&va);
7098         VATTR_WANTED(&va, va_uid);
7099         VATTR_WANTED(&va, va_gid);
7100         VATTR_WANTED(&va, va_mode);
7101         VATTR_WANTED(&va, va_flags);
7102         VATTR_WANTED(&va, va_acl);
7103
7104         if ((error = vnode_getattr(fvp, &va, ctx)) != 0)
7105                 goto out;
7106
7107         VATTR_INIT(&nva);
7108         VATTR_SET(&nva, va_type, v_type);
7109         if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
7110                 VATTR_SET(&nva, va_acl, va.va_acl);
7111                 free_src_acl = TRUE;
7112         }
7113
7114         /* Handle ACL inheritance, initialize vap. */
7115         if (v_type == VLNK) {
7116                 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
7117         } else {
7118                 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
7119                 if (error)
7120                         goto out;
7121                 attr_cleanup = TRUE;
7122         }
7123
7124         vnop_flags = VNODE_CLONEFILE_DEFAULT;
7125         /*
7126          * We've got initial values for all security parameters,
7127          * If we are superuser, then we can change owners to be the
7128          * same as the source. Both superuser and the owner have default
7129          * WRITE_SECURITY privileges so all other fields can be taken
7130          * from source as well.
7131          */
7132         if (!(flags & CLONE_NOOWNERCOPY) && vfs_context_issuser(ctx)) {
7133                 if (VATTR_IS_SUPPORTED(&va, va_uid))
7134                         VATTR_SET(&nva, va_uid, va.va_uid);
7135                 if (VATTR_IS_SUPPORTED(&va, va_gid))
7136                         VATTR_SET(&nva, va_gid, va.va_gid);
7137         } else {
7138                 vnop_flags |= VNODE_CLONEFILE_NOOWNERCOPY;
7139         }
7140
7141         if (VATTR_IS_SUPPORTED(&va, va_mode))
7142                 VATTR_SET(&nva, va_mode, va.va_mode);
7143         if (VATTR_IS_SUPPORTED(&va, va_flags)) {
7144                 VATTR_SET(&nva, va_flags,
7145                     ((va.va_flags & ~(UF_DATAVAULT | SF_RESTRICTED)) | /* Turn off from source */
7146                     (nva.va_flags & (UF_DATAVAULT | SF_RESTRICTED))));
7147         }
7148
7149         error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva, vnop_flags, ctx);
7150
7151         if (!error && tvp) {
7152                 int     update_flags = 0;
7153 #if CONFIG_FSE
7154                 int fsevent;
7155 #endif /* CONFIG_FSE */
7156
7157 #if CONFIG_MACF
7158                 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
7159                     VNODE_LABEL_CREATE, ctx);
7160 #endif
7161                 /*
7162                  * If some of the requested attributes weren't handled by the
7163                  * VNOP, use our fallback code.
7164                  */
7165                 if (!VATTR_ALL_SUPPORTED(&va))
7166                         (void)vnode_setattr_fallback(tvp, &nva, ctx);
7167
7168                 // Make sure the name & parent pointers are hooked up
7169                 if (tvp->v_name == NULL)
7170                         update_flags |= VNODE_UPDATE_NAME;
7171                 if (tvp->v_parent == NULLVP)
7172                         update_flags |= VNODE_UPDATE_PARENT;
7173
7174                 if (update_flags) {
7175                         (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
7176                             cnp->cn_namelen, cnp->cn_hash, update_flags);
7177                 }
7178
7179 #if CONFIG_FSE
7180                 switch (vnode_vtype(tvp)) {
7181                 case VLNK:
7182                         /* FALLTHRU */
7183                 case VREG:
7184                         fsevent = FSE_CREATE_FILE;
7185                         break;
7186                 case VDIR:
7187                         fsevent = FSE_CREATE_DIR;
7188                         break;
7189                 default:
7190                         goto out;
7191                 }
7192
7193                 if (need_fsevent(fsevent, tvp)) {
7194                         /*
7195                          * The following is a sequence of three explicit events.
7196                          * A pair of FSE_CLONE events representing the source and destination
7197                          * followed by an FSE_CREATE_[FILE | DIR] for the destination.
7198                          * fseventsd may coalesce the destination clone and create events
7199                          * into a single event resulting in the following sequence for a client
7200                          * FSE_CLONE (src)
7201                          * FSE_CLONE | FSE_CREATE (dst)
7202                          */
7203                         add_fsevent(FSE_CLONE, ctx, FSE_ARG_VNODE, fvp, FSE_ARG_VNODE, tvp,
7204                             FSE_ARG_DONE);
7205                         add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
7206                             FSE_ARG_DONE);
7207                 }
7208 #endif /* CONFIG_FSE */
7209         }
7210
7211 out:
7212         if (attr_cleanup)
7213                 vn_attribute_cleanup(&nva, defaulted);
7214         if (free_src_acl && va.va_acl)
7215                 kauth_acl_free(va.va_acl);
7216         nameidone(&tond);
7217         if (tvp)
7218                 vnode_put(tvp);
7219         vnode_put(tdvp);
7220         return (error);
7221 }
7222
7223 /*
7224  * clone files or directories, target must not exist.
7225  */
7226 /* ARGSUSED */
7227 int
7228 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7229     __unused int32_t *retval)
7230 {
7231         vnode_t fvp;
7232         struct nameidata fromnd;
7233         int follow;
7234         int error;
7235         vfs_context_t ctx = vfs_context_current();
7236
7237         /* Check that the flags are valid. */
7238         if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY))
7239                 return (EINVAL);
7240
7241         AUDIT_ARG(fd, uap->src_dirfd);
7242
7243         follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7244         NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7245             UIO_USERSPACE, uap->src, ctx);
7246         if ((error = nameiat(&fromnd, uap->src_dirfd)))
7247                 return (error);
7248
7249         fvp = fromnd.ni_vp;
7250         nameidone(&fromnd);
7251
7252         error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7253             uap->flags, ctx);
7254
7255         vnode_put(fvp);
7256         return (error);
7257 }
7258
7259 int
7260 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7261     __unused int32_t *retval)
7262 {
7263         vnode_t fvp;
7264         struct fileproc *fp;
7265         int error;
7266         vfs_context_t ctx = vfs_context_current();
7267
7268         /* Check that the flags are valid. */
7269         if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY))
7270                 return (EINVAL);
7271
7272         AUDIT_ARG(fd, uap->src_fd);
7273         error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7274         if (error)
7275                 return (error);
7276
7277         if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7278                 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7279                 error = EBADF;
7280                 goto out;
7281         }
7282
7283         if ((error = vnode_getwithref(fvp)))
7284                 goto out;
7285
7286         AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7287
7288         error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7289             uap->flags, ctx);
7290
7291         vnode_put(fvp);
7292 out:
7293         file_drop(uap->src_fd);
7294         return (error);
7295 }
7296
7297 /*
7298  * Rename files.  Source and destination must either both be directories,
7299  * or both not be directories.  If target is a directory, it must be empty.
7300  */
7301 /* ARGSUSED */
7302 static int
7303 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7304     int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7305 {
7306         if (flags & ~VFS_RENAME_FLAGS_MASK)
7307                 return EINVAL;
7308
7309         if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL))
7310                 return EINVAL;
7311
7312         vnode_t tvp, tdvp;
7313         vnode_t fvp, fdvp;
7314         struct nameidata *fromnd, *tond;
7315         int error;
7316         int do_retry;
7317         int retry_count;
7318         int mntrename;
7319         int need_event;
7320         const char *oname = NULL;
7321         char *from_name = NULL, *to_name = NULL;
7322         int from_len=0, to_len=0;
7323         int holding_mntlock;
7324         mount_t locked_mp = NULL;
7325         vnode_t oparent = NULLVP;
7326 #if CONFIG_FSE
7327         fse_info from_finfo, to_finfo;
7328 #endif
7329         int from_truncated=0, to_truncated;
7330         int batched = 0;
7331         struct vnode_attr *fvap, *tvap;
7332         int continuing = 0;
7333         /* carving out a chunk for structs that are too big to be on stack. */
7334         struct {
7335                 struct nameidata from_node, to_node;
7336                 struct vnode_attr fv_attr, tv_attr;
7337         } * __rename_data;
7338         MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
7339         fromnd = &__rename_data->from_node;
7340         tond = &__rename_data->to_node;
7341
7342         holding_mntlock = 0;
7343         do_retry = 0;
7344         retry_count = 0;
7345 retry:
7346         fvp = tvp = NULL;
7347         fdvp = tdvp = NULL;
7348         fvap = tvap = NULL;
7349         mntrename = FALSE;
7350
7351         NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
7352             segflg, from, ctx);
7353         fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
7354
7355         NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7356             segflg, to, ctx);
7357         tond->ni_flag = NAMEI_COMPOUNDRENAME;
7358
7359 continue_lookup:
7360         if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7361                 if ( (error = nameiat(fromnd, fromfd)) )
7362                         goto out1;
7363                 fdvp = fromnd->ni_dvp;
7364                 fvp  = fromnd->ni_vp;
7365
7366                 if (fvp && fvp->v_type == VDIR)
7367                         tond->ni_cnd.cn_flags |= WILLBEDIR;
7368         }
7369
7370         if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7371                 if ( (error = nameiat(tond, tofd)) ) {
7372                         /*
7373                          * Translate error code for rename("dir1", "dir2/.").
7374                          */
7375                         if (error == EISDIR && fvp->v_type == VDIR)
7376                                 error = EINVAL;
7377                         goto out1;
7378                 }
7379                 tdvp = tond->ni_dvp;
7380                 tvp  = tond->ni_vp;
7381         }
7382
7383 #if DEVELOPMENT || DEBUG
7384         /*
7385          * XXX VSWAP: Check for entitlements or special flag here
7386          * so we can restrict access appropriately.
7387          */
7388 #else /* DEVELOPMENT || DEBUG */
7389
7390         if (fromnd->ni_vp && vnode_isswap(fromnd->ni_vp) && (ctx != vfs_context_kernel())) {
7391                 error = EPERM;
7392                 goto out1;
7393         }
7394
7395         if (tond->ni_vp && vnode_isswap(tond->ni_vp) && (ctx != vfs_context_kernel())) {
7396                 error = EPERM;
7397                 goto out1;
7398         }
7399 #endif /* DEVELOPMENT || DEBUG */
7400
7401         if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
7402                 error = ENOENT;
7403                 goto out1;
7404         }
7405
7406         if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
7407                 error = EEXIST;
7408                 goto out1;
7409         }
7410
7411         batched = vnode_compound_rename_available(fdvp);
7412
7413 #if CONFIG_FSE
7414         need_event = need_fsevent(FSE_RENAME, fdvp);
7415         if (need_event) {
7416                 if (fvp) {
7417                         get_fse_info(fvp, &from_finfo, ctx);
7418                 } else {
7419                         error = vfs_get_notify_attributes(&__rename_data->fv_attr);
7420                         if (error) {
7421                                 goto out1;
7422                         }
7423
7424                         fvap = &__rename_data->fv_attr;
7425                 }
7426
7427                 if (tvp) {
7428                         get_fse_info(tvp, &to_finfo, ctx);
7429                 } else if (batched) {
7430                         error = vfs_get_notify_attributes(&__rename_data->tv_attr);
7431                         if (error) {
7432                                 goto out1;
7433                         }
7434
7435                         tvap = &__rename_data->tv_attr;
7436                 }
7437         }
7438 #else
7439         need_event = 0;
7440 #endif /* CONFIG_FSE */
7441
7442         if (need_event || kauth_authorize_fileop_has_listeners()) {
7443                 if (from_name == NULL) {
7444                         GET_PATH(from_name);
7445                         if (from_name == NULL) {
7446                                 error = ENOMEM;
7447                                 goto out1;
7448                         }
7449                 }
7450
7451                 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
7452
7453                 if (to_name == NULL) {
7454                         GET_PATH(to_name);
7455                         if (to_name == NULL) {
7456                                 error = ENOMEM;
7457                                 goto out1;
7458                         }
7459                 }
7460
7461                 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
7462         }
7463         if (!fvp) {
7464                 /*
7465                  * Claim: this check will never reject a valid rename.
7466                  * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
7467                  * Suppose fdvp and tdvp are not on the same mount.
7468                  * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem.  If fvp is the root,
7469                  *      then you can't move it to within another dir on the same mountpoint.
7470                  * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
7471                  *
7472                  * If this check passes, then we are safe to pass these vnodes to the same FS.
7473                  */
7474                 if (fdvp->v_mount != tdvp->v_mount) {
7475                         error = EXDEV;
7476                         goto out1;
7477                 }
7478                 goto skipped_lookup;
7479         }
7480
7481         if (!batched) {
7482                 error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
7483                 if (error) {
7484                         if (error == ENOENT) {
7485                                 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7486                                 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7487                                         /*
7488                                          * We encountered a race where after doing the namei, tvp stops
7489                                          * being valid. If so, simply re-drive the rename call from the
7490                                          * top.
7491                                          */
7492                                         do_retry = 1;
7493                                         retry_count += 1;
7494                                 }
7495                         }
7496                         goto out1;
7497                 }
7498         }
7499
7500         /*
7501          * If the source and destination are the same (i.e. they're
7502          * links to the same vnode) and the target file system is
7503          * case sensitive, then there is nothing to do.
7504          *
7505          * XXX Come back to this.
7506          */
7507         if (fvp == tvp) {
7508                 int pathconf_val;
7509
7510                 /*
7511                  * Note: if _PC_CASE_SENSITIVE selector isn't supported,
7512                  * then assume that this file system is case sensitive.
7513                  */
7514                 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
7515                     pathconf_val != 0) {
7516                         goto out1;
7517                 }
7518         }
7519
7520         /*
7521          * Allow the renaming of mount points.
7522          * - target must not exist
7523          * - target must reside in the same directory as source
7524          * - union mounts cannot be renamed
7525          * - "/" cannot be renamed
7526          *
7527          * XXX Handle this in VFS after a continued lookup (if we missed
7528          * in the cache to start off)
7529          *
7530          * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
7531          * we'll skip past here.  The file system is responsible for
7532          * checking that @tvp is not a descendent of @fvp and vice versa
7533          * so it should always return EINVAL if either @tvp or @fvp is the
7534          * root of a volume.
7535          */
7536         if ((fvp->v_flag & VROOT) &&
7537             (fvp->v_type == VDIR) &&
7538             (tvp == NULL)  &&
7539             (fvp->v_mountedhere == NULL)  &&
7540             (fdvp == tdvp)  &&
7541             ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0)  &&
7542             (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
7543                 vnode_t coveredvp;
7544
7545                 /* switch fvp to the covered vnode */
7546                 coveredvp = fvp->v_mount->mnt_vnodecovered;
7547                 if ( (vnode_getwithref(coveredvp)) ) {
7548                         error = ENOENT;
7549                         goto out1;
7550                 }
7551                 vnode_put(fvp);
7552
7553                 fvp = coveredvp;
7554                 mntrename = TRUE;
7555         }
7556         /*
7557          * Check for cross-device rename.
7558          */
7559         if ((fvp->v_mount != tdvp->v_mount) ||
7560             (tvp && (fvp->v_mount != tvp->v_mount))) {
7561                 error = EXDEV;
7562                 goto out1;
7563         }
7564
7565         /*
7566          * If source is the same as the destination (that is the
7567          * same inode number) then there is nothing to do...
7568          * EXCEPT if the underlying file system supports case
7569          * insensitivity and is case preserving.  In this case
7570          * the file system needs to handle the special case of
7571          * getting the same vnode as target (fvp) and source (tvp).
7572          *
7573          * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
7574          * and _PC_CASE_PRESERVING can have this exception, and they need to
7575          * handle the special case of getting the same vnode as target and
7576          * source.  NOTE: Then the target is unlocked going into vnop_rename,
7577          * so not to cause locking problems. There is a single reference on tvp.
7578          *
7579          * NOTE - that fvp == tvp also occurs if they are hard linked and
7580          * that correct behaviour then is just to return success without doing
7581          * anything.
7582          *
7583          * XXX filesystem should take care of this itself, perhaps...
7584          */
7585         if (fvp == tvp && fdvp == tdvp) {
7586                 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
7587                     !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
7588                           fromnd->ni_cnd.cn_namelen)) {
7589                         goto out1;
7590                 }
7591         }
7592
7593         if (holding_mntlock && fvp->v_mount != locked_mp) {
7594                 /*
7595                  * we're holding a reference and lock
7596                  * on locked_mp, but it no longer matches
7597                  * what we want to do... so drop our hold
7598                  */
7599                 mount_unlock_renames(locked_mp);
7600                 mount_drop(locked_mp, 0);
7601                 holding_mntlock = 0;
7602         }
7603         if (tdvp != fdvp && fvp->v_type == VDIR) {
7604                 /*
7605                  * serialize renames that re-shape
7606                  * the tree... if holding_mntlock is
7607                  * set, then we're ready to go...
7608                  * otherwise we
7609                  * first need to drop the iocounts
7610                  * we picked up, second take the
7611                  * lock to serialize the access,
7612                  * then finally start the lookup
7613                  * process over with the lock held
7614                  */
7615                 if (!holding_mntlock) {
7616                         /*
7617                          * need to grab a reference on
7618                          * the mount point before we
7619                          * drop all the iocounts... once
7620                          * the iocounts are gone, the mount
7621                          * could follow
7622                          */
7623                         locked_mp = fvp->v_mount;
7624                         mount_ref(locked_mp, 0);
7625
7626                         /*
7627                          * nameidone has to happen before we vnode_put(tvp)
7628                          * since it may need to release the fs_nodelock on the tvp
7629                          */
7630                         nameidone(tond);
7631
7632                         if (tvp)
7633                                 vnode_put(tvp);
7634                         vnode_put(tdvp);
7635
7636                         /*
7637                          * nameidone has to happen before we vnode_put(fdvp)
7638                          * since it may need to release the fs_nodelock on the fvp
7639                          */
7640                         nameidone(fromnd);
7641
7642                         vnode_put(fvp);
7643                         vnode_put(fdvp);
7644
7645                         mount_lock_renames(locked_mp);
7646                         holding_mntlock = 1;
7647
7648                         goto retry;
7649                 }
7650         } else {
7651                 /*
7652                  * when we dropped the iocounts to take
7653                  * the lock, we allowed the identity of
7654                  * the various vnodes to change... if they did,
7655                  * we may no longer be dealing with a rename
7656                  * that reshapes the tree... once we're holding
7657                  * the iocounts, the vnodes can't change type
7658                  * so we're free to drop the lock at this point
7659                  * and continue on
7660                  */
7661                 if (holding_mntlock) {
7662                         mount_unlock_renames(locked_mp);
7663                         mount_drop(locked_mp, 0);
7664                         holding_mntlock = 0;
7665                 }
7666         }
7667
7668         // save these off so we can later verify that fvp is the same
7669         oname   = fvp->v_name;
7670         oparent = fvp->v_parent;
7671
7672 skipped_lookup:
7673         error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
7674                             tdvp, &tvp, &tond->ni_cnd, tvap,
7675                             flags, ctx);
7676
7677         if (holding_mntlock) {
7678                 /*
7679                  * we can drop our serialization
7680                  * lock now
7681                  */
7682                 mount_unlock_renames(locked_mp);
7683                 mount_drop(locked_mp, 0);
7684                 holding_mntlock = 0;
7685         }
7686         if (error) {
7687                 if (error == EKEEPLOOKING) {
7688                         if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7689                                 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7690                                         panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
7691                                 }
7692                         }
7693
7694                         fromnd->ni_vp = fvp;
7695                         tond->ni_vp = tvp;
7696
7697                         goto continue_lookup;
7698                 }
7699
7700                 /*
7701                  * We may encounter a race in the VNOP where the destination didn't
7702                  * exist when we did the namei, but it does by the time we go and
7703                  * try to create the entry. In this case, we should re-drive this rename
7704                  * call from the top again.  Currently, only HFS bubbles out ERECYCLE,
7705                  * but other filesystems susceptible to this race could return it, too.
7706                  */
7707                 if (error == ERECYCLE) {
7708                         do_retry = 1;
7709                 }
7710
7711                 /*
7712                  * For compound VNOPs, the authorization callback may return
7713                  * ENOENT in case of racing hardlink lookups hitting the name
7714                  * cache, redrive the lookup.
7715                  */
7716                 if (batched && error == ENOENT) {
7717                         assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7718                         if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7719                                 do_retry = 1;
7720                                 retry_count += 1;
7721                         }
7722                 }
7723
7724                 goto out1;
7725         }
7726
7727         /* call out to allow 3rd party notification of rename.
7728          * Ignore result of kauth_authorize_fileop call.
7729          */
7730         kauth_authorize_fileop(vfs_context_ucred(ctx),
7731                         KAUTH_FILEOP_RENAME,
7732                         (uintptr_t)from_name, (uintptr_t)to_name);
7733         if (flags & VFS_RENAME_SWAP) {
7734                 kauth_authorize_fileop(vfs_context_ucred(ctx),
7735                                                            KAUTH_FILEOP_RENAME,
7736                                                            (uintptr_t)to_name, (uintptr_t)from_name);
7737         }
7738
7739 #if CONFIG_FSE
7740         if (from_name != NULL && to_name != NULL) {
7741                 if (from_truncated || to_truncated) {
7742                         // set it here since only the from_finfo gets reported up to user space
7743                         from_finfo.mode |= FSE_TRUNCATED_PATH;
7744                 }
7745
7746                 if (tvap && tvp) {
7747                         vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
7748                 }
7749                 if (fvap) {
7750                         vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
7751                 }
7752
7753                 if (tvp) {
7754                         add_fsevent(FSE_RENAME, ctx,
7755                                                 FSE_ARG_STRING, from_len, from_name,
7756                                                 FSE_ARG_FINFO, &from_finfo,
7757                                                 FSE_ARG_STRING, to_len, to_name,
7758                                                 FSE_ARG_FINFO, &to_finfo,
7759                                                 FSE_ARG_DONE);
7760                         if (flags & VFS_RENAME_SWAP) {
7761                                 /*
7762                                  * Strictly speaking, swap is the equivalent of
7763                                  * *three* renames.  FSEvents clients should only take
7764                                  * the events as a hint, so we only bother reporting
7765                                  * two.
7766                                  */
7767                                 add_fsevent(FSE_RENAME, ctx,
7768                                                         FSE_ARG_STRING, to_len, to_name,
7769                                                         FSE_ARG_FINFO, &to_finfo,
7770                                                         FSE_ARG_STRING, from_len, from_name,
7771                                                         FSE_ARG_FINFO, &from_finfo,
7772                                                         FSE_ARG_DONE);
7773                         }
7774                 } else {
7775                         add_fsevent(FSE_RENAME, ctx,
7776                                     FSE_ARG_STRING, from_len, from_name,
7777                                     FSE_ARG_FINFO, &from_finfo,
7778                                     FSE_ARG_STRING, to_len, to_name,
7779                                     FSE_ARG_DONE);
7780                 }
7781         }
7782 #endif /* CONFIG_FSE */
7783
7784         /*
7785          * update filesystem's mount point data
7786          */
7787         if (mntrename) {
7788                 char *cp, *pathend, *mpname;
7789                 char * tobuf;
7790                 struct mount *mp;
7791                 int maxlen;
7792                 size_t len = 0;
7793
7794                 mp = fvp->v_mountedhere;
7795
7796                 if (vfs_busy(mp, LK_NOWAIT)) {
7797                         error = EBUSY;
7798                         goto out1;
7799                 }
7800                 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
7801
7802                 if (UIO_SEG_IS_USER_SPACE(segflg))
7803                         error = copyinstr(to, tobuf, MAXPATHLEN, &len);
7804                 else
7805                         error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
7806                 if (!error) {
7807                         /* find current mount point prefix */
7808                         pathend = &mp->mnt_vfsstat.f_mntonname[0];
7809                         for (cp = pathend; *cp != '\0'; ++cp) {
7810                                 if (*cp == '/')
7811                                         pathend = cp + 1;
7812                         }
7813                         /* find last component of target name */
7814                         for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
7815                                 if (*cp == '/')
7816                                         mpname = cp + 1;
7817                         }
7818                         /* append name to prefix */
7819                         maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
7820                         bzero(pathend, maxlen);
7821                         strlcpy(pathend, mpname, maxlen);
7822                 }
7823                 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
7824
7825                 vfs_unbusy(mp);
7826         }
7827         /*
7828          * fix up name & parent pointers.  note that we first
7829          * check that fvp has the same name/parent pointers it
7830          * had before the rename call... this is a 'weak' check
7831          * at best...
7832          *
7833          * XXX oparent and oname may not be set in the compound vnop case
7834          */
7835         if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
7836                 int update_flags;
7837
7838                 update_flags = VNODE_UPDATE_NAME;
7839
7840                 if (fdvp != tdvp)
7841                         update_flags |= VNODE_UPDATE_PARENT;
7842
7843                 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
7844         }
7845 out1:
7846         if (to_name != NULL) {
7847                 RELEASE_PATH(to_name);
7848                 to_name = NULL;
7849         }
7850         if (from_name != NULL) {
7851                 RELEASE_PATH(from_name);
7852                 from_name = NULL;
7853         }
7854         if (holding_mntlock) {
7855                 mount_unlock_renames(locked_mp);
7856                 mount_drop(locked_mp, 0);
7857                 holding_mntlock = 0;
7858         }
7859         if (tdvp) {
7860                 /*
7861                  * nameidone has to happen before we vnode_put(tdvp)
7862                  * since it may need to release the fs_nodelock on the tdvp
7863                  */
7864                 nameidone(tond);
7865
7866                 if (tvp)
7867                         vnode_put(tvp);
7868                 vnode_put(tdvp);
7869         }
7870         if (fdvp) {
7871                 /*
7872                  * nameidone has to happen before we vnode_put(fdvp)
7873                  * since it may need to release the fs_nodelock on the fdvp
7874                  */
7875                 nameidone(fromnd);
7876
7877                 if (fvp)
7878                         vnode_put(fvp);
7879                 vnode_put(fdvp);
7880         }
7881
7882         /*
7883          * If things changed after we did the namei, then we will re-drive
7884          * this rename call from the top.
7885          */
7886         if (do_retry) {
7887                 do_retry = 0;
7888                 goto retry;
7889         }
7890
7891         FREE(__rename_data, M_TEMP);
7892         return (error);
7893 }
7894
7895 int
7896 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7897 {
7898         return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7899             AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7900 }
7901
7902 int renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
7903 {
7904         return renameat_internal(
7905                 vfs_context_current(),
7906                 uap->fromfd, uap->from,
7907                 uap->tofd, uap->to,
7908                 UIO_USERSPACE, uap->flags);
7909 }
7910
7911 int
7912 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7913 {
7914         return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7915             uap->tofd, uap->to, UIO_USERSPACE, 0));
7916 }
7917
7918 /*
7919  * Make a directory file.
7920  *
7921  * Returns:     0                       Success
7922  *              EEXIST
7923  *      namei:???
7924  *      vnode_authorize:???
7925  *      vn_create:???
7926  */
7927 /* ARGSUSED */
7928 static int
7929 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
7930     enum uio_seg segflg)
7931 {
7932         vnode_t vp, dvp;
7933         int error;
7934         int update_flags = 0;
7935         int batched;
7936         struct nameidata nd;
7937
7938         AUDIT_ARG(mode, vap->va_mode);
7939         NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
7940                path, ctx);
7941         nd.ni_cnd.cn_flags |= WILLBEDIR;
7942         nd.ni_flag = NAMEI_COMPOUNDMKDIR;
7943
7944 continue_lookup:
7945         error = nameiat(&nd, fd);
7946         if (error)
7947                 return (error);
7948         dvp = nd.ni_dvp;
7949         vp = nd.ni_vp;
7950
7951         if (vp != NULL) {
7952                 error = EEXIST;
7953                 goto out;
7954         }
7955
7956         batched = vnode_compound_mkdir_available(dvp);
7957
7958         VATTR_SET(vap, va_type, VDIR);
7959
7960         /*
7961          * XXX
7962          * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
7963          * only get EXISTS or EISDIR for existing path components, and not that it could see
7964          * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
7965          * it will fail in a spurious  manner.  Need to figure out if this is valid behavior.
7966          */
7967         if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
7968                 if (error == EACCES || error == EPERM) {
7969                         int error2;
7970
7971                         nameidone(&nd);
7972                         vnode_put(dvp);
7973                         dvp = NULLVP;
7974
7975                         /*
7976                          * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
7977                          * rather than EACCESS if the target exists.
7978                          */
7979                         NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
7980                                         path, ctx);
7981                         error2 = nameiat(&nd, fd);
7982                         if (error2) {
7983                                 goto out;
7984                         } else {
7985                                 vp = nd.ni_vp;
7986                                 error = EEXIST;
7987                                 goto out;
7988                         }
7989                 }
7990
7991                 goto out;
7992         }
7993
7994         /*
7995          * make the directory
7996          */
7997         if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
7998                 if (error == EKEEPLOOKING) {
7999                         nd.ni_vp = vp;
8000                         goto continue_lookup;
8001                 }
8002
8003                 goto out;
8004         }
8005
8006         // Make sure the name & parent pointers are hooked up
8007         if (vp->v_name == NULL)
8008                 update_flags |= VNODE_UPDATE_NAME;
8009         if (vp->v_parent == NULLVP)
8010                 update_flags |= VNODE_UPDATE_PARENT;
8011
8012         if (update_flags)
8013                 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
8014
8015 #if CONFIG_FSE
8016         add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
8017 #endif
8018
8019 out:
8020         /*
8021          * nameidone has to happen before we vnode_put(dvp)
8022          * since it may need to release the fs_nodelock on the dvp
8023          */
8024         nameidone(&nd);
8025
8026         if (vp)
8027                 vnode_put(vp);
8028         if (dvp)
8029                 vnode_put(dvp);
8030
8031         return (error);
8032 }
8033
8034 /*
8035  * mkdir_extended: Create a directory; with extended security (ACL).
8036  *
8037  * Parameters:    p                       Process requesting to create the directory
8038  *                uap                     User argument descriptor (see below)
8039  *                retval                  (ignored)
8040  *
8041  * Indirect:      uap->path               Path of directory to create
8042  *                uap->mode               Access permissions to set
8043  *                uap->xsecurity          ACL to set
8044  *
8045  * Returns:        0                      Success
8046  *                !0                      Not success
8047  *
8048  */
8049 int
8050 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
8051 {
8052         int ciferror;
8053         kauth_filesec_t xsecdst;
8054         struct vnode_attr va;
8055
8056         AUDIT_ARG(owner, uap->uid, uap->gid);
8057
8058         xsecdst = NULL;
8059         if ((uap->xsecurity != USER_ADDR_NULL) &&
8060             ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
8061                 return ciferror;
8062
8063         VATTR_INIT(&va);
8064         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8065         if (xsecdst != NULL)
8066                 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
8067
8068         ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8069             UIO_USERSPACE);
8070         if (xsecdst != NULL)
8071                 kauth_filesec_free(xsecdst);
8072         return ciferror;
8073 }
8074
8075 int
8076 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
8077 {
8078         struct vnode_attr va;
8079
8080         VATTR_INIT(&va);
8081         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8082
8083         return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
8084             UIO_USERSPACE));
8085 }
8086
8087 int
8088 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
8089 {
8090         struct vnode_attr va;
8091
8092         VATTR_INIT(&va);
8093         VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
8094
8095         return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
8096             UIO_USERSPACE));
8097 }
8098
8099 static int
8100 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
8101     enum uio_seg segflg)
8102 {
8103         vnode_t vp, dvp;
8104         int error;
8105         struct nameidata nd;
8106         char     *path = NULL;
8107         int       len=0;
8108         int has_listeners = 0;
8109         int need_event = 0;
8110         int truncated = 0;
8111 #if CONFIG_FSE
8112         struct vnode_attr va;
8113 #endif /* CONFIG_FSE */
8114         struct vnode_attr *vap = NULL;
8115         int restart_count = 0;
8116         int batched;
8117
8118         int restart_flag;
8119
8120         /*
8121          * This loop exists to restart rmdir in the unlikely case that two
8122          * processes are simultaneously trying to remove the same directory
8123          * containing orphaned appleDouble files.
8124          */
8125         do {
8126                 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
8127                     segflg, dirpath, ctx);
8128                 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
8129 continue_lookup:
8130                 restart_flag = 0;
8131                 vap = NULL;
8132
8133                 error = nameiat(&nd, fd);
8134                 if (error)
8135                         return (error);
8136
8137                 dvp = nd.ni_dvp;
8138                 vp = nd.ni_vp;
8139
8140                 if (vp) {
8141                         batched = vnode_compound_rmdir_available(vp);
8142
8143                         if (vp->v_flag & VROOT) {
8144                                 /*
8145                                  * The root of a mounted filesystem cannot be deleted.
8146                                  */
8147                                 error = EBUSY;
8148                                 goto out;
8149                         }
8150
8151 #if DEVELOPMENT || DEBUG
8152                         /*
8153                          * XXX VSWAP: Check for entitlements or special flag here
8154                          * so we can restrict access appropriately.
8155                          */
8156 #else /* DEVELOPMENT || DEBUG */
8157
8158                         if (vnode_isswap(vp) && (ctx != vfs_context_kernel())) {
8159                                 error = EPERM;
8160                                 goto out;
8161                         }
8162 #endif /* DEVELOPMENT || DEBUG */
8163
8164                         /*
8165                          * Removed a check here; we used to abort if vp's vid
8166                          * was not the same as what we'd seen the last time around.
8167                          * I do not think that check was valid, because if we retry
8168                          * and all dirents are gone, the directory could legitimately
8169                          * be recycled but still be present in a situation where we would
8170                          * have had permission to delete.  Therefore, we won't make
8171                          * an effort to preserve that check now that we may not have a
8172                          * vp here.
8173                          */
8174
8175                         if (!batched) {
8176                                 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
8177                                 if (error) {
8178                                         if (error == ENOENT) {
8179                                                 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8180                                                 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8181                                                         restart_flag = 1;
8182                                                         restart_count += 1;
8183                                                 }
8184                                         }
8185                                         goto out;
8186                                 }
8187                         }
8188                 } else {
8189                         batched = 1;
8190
8191                         if (!vnode_compound_rmdir_available(dvp)) {
8192                                 panic("No error, but no compound rmdir?");
8193                         }
8194                 }
8195
8196 #if CONFIG_FSE
8197                 fse_info  finfo;
8198
8199                 need_event = need_fsevent(FSE_DELETE, dvp);
8200                 if (need_event) {
8201                         if (!batched) {
8202                                 get_fse_info(vp, &finfo, ctx);
8203                         } else {
8204                                 error = vfs_get_notify_attributes(&va);
8205                                 if (error) {
8206                                         goto out;
8207                                 }
8208
8209                                 vap = &va;
8210                         }
8211                 }
8212 #endif
8213                 has_listeners = kauth_authorize_fileop_has_listeners();
8214                 if (need_event || has_listeners) {
8215                         if (path == NULL) {
8216                                 GET_PATH(path);
8217                                 if (path == NULL) {
8218                                         error = ENOMEM;
8219                                         goto out;
8220                                 }
8221                         }
8222
8223                         len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
8224 #if CONFIG_FSE
8225                         if (truncated) {
8226                                 finfo.mode |= FSE_TRUNCATED_PATH;
8227                         }
8228 #endif
8229                 }
8230
8231                 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8232                 nd.ni_vp = vp;
8233                 if (vp == NULLVP) {
8234                         /* Couldn't find a vnode */
8235                         goto out;
8236                 }
8237
8238                 if (error == EKEEPLOOKING) {
8239                         goto continue_lookup;
8240                 } else if (batched && error == ENOENT) {
8241                         assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
8242                         if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
8243                                 /*
8244                                  * For compound VNOPs, the authorization callback
8245                                  * may return ENOENT in case of racing hard link lookups
8246                                  * redrive the lookup.
8247                                  */
8248                                 restart_flag = 1;
8249                                 restart_count += 1;
8250                                 goto out;
8251                         }
8252                 }
8253 #if CONFIG_APPLEDOUBLE
8254                 /*
8255                  * Special case to remove orphaned AppleDouble
8256                  * files. I don't like putting this in the kernel,
8257                  * but carbon does not like putting this in carbon either,
8258                  * so here we are.
8259                  */
8260                 if (error == ENOTEMPTY) {
8261                         error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
8262                         if (error == EBUSY) {
8263                                 goto out;
8264                         }
8265
8266
8267                         /*
8268                          * Assuming everything went well, we will try the RMDIR again
8269                          */
8270                         if (!error)
8271                                 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8272                 }
8273 #endif /* CONFIG_APPLEDOUBLE */
8274                 /*
8275                  * Call out to allow 3rd party notification of delete.
8276                  * Ignore result of kauth_authorize_fileop call.
8277                  */
8278                 if (!error) {
8279                         if (has_listeners) {
8280                                 kauth_authorize_fileop(vfs_context_ucred(ctx),
8281                                                 KAUTH_FILEOP_DELETE,
8282                                                 (uintptr_t)vp,
8283                                                 (uintptr_t)path);
8284                         }
8285
8286                         if (vp->v_flag & VISHARDLINK) {
8287                                 // see the comment in unlink1() about why we update
8288                                 // the parent of a hard link when it is removed
8289                                 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
8290                         }
8291
8292 #if CONFIG_FSE
8293                         if (need_event) {
8294                                 if (vap) {
8295                                         vnode_get_fse_info_from_vap(vp, &finfo, vap);
8296                                 }
8297                                 add_fsevent(FSE_DELETE, ctx,
8298                                                 FSE_ARG_STRING, len, path,
8299                                                 FSE_ARG_FINFO, &finfo,
8300                                                 FSE_ARG_DONE);
8301                         }
8302 #endif
8303                 }
8304
8305 out:
8306                 if (path != NULL) {
8307                         RELEASE_PATH(path);
8308                         path = NULL;
8309                 }
8310                 /*
8311                  * nameidone has to happen before we vnode_put(dvp)
8312                  * since it may need to release the fs_nodelock on the dvp
8313                  */
8314                 nameidone(&nd);
8315                 vnode_put(dvp);
8316
8317                 if (vp)
8318                         vnode_put(vp);
8319
8320                 if (restart_flag == 0) {
8321                         wakeup_one((caddr_t)vp);
8322                         return (error);
8323                 }
8324                 tsleep(vp, PVFS, "rm AD", 1);
8325
8326         } while (restart_flag != 0);
8327
8328         return (error);
8329
8330 }
8331
8332 /*
8333  * Remove a directory file.
8334  */
8335 /* ARGSUSED */
8336 int
8337 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
8338 {
8339         return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
8340             CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
8341 }
8342
/*
 * Get direntry length padded to 8 byte alignment: sizeof(struct direntry)
 * with its (MAXPATHLEN - 1) term replaced by namlen, rounded up to a
 * multiple of 8.
 */
#define DIRENT64_LEN(namlen) \
	((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)

/*
 * Get dirent length padded to 4 byte alignment: sizeof(struct dirent)
 * with its (__DARWIN_MAXNAMLEN + 1) term replaced by (namelen + 1),
 * rounded up to a multiple of 4.  The argument is parenthesized so that
 * an expression argument (e.g. "x & 0xff") is not re-associated with
 * the "+ 1".
 */
#define DIRENT_LEN(namelen) \
	((sizeof(struct dirent) + ((namelen) + 1) - (__DARWIN_MAXNAMLEN + 1) + 3) & ~3)

/* Get the end of this dirent: address of the last byte covered by d_reclen */
#define DIRENT_END(dep) \
	(((char *)(dep)) + (dep)->d_reclen - 1)
8354
8355 errno_t
8356 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
8357                 int *numdirent, vfs_context_t ctxp)
8358 {
8359         /* Check if fs natively supports VNODE_READDIR_EXTENDED */
8360         if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
8361                    ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0))  {
8362                 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
8363         } else {
8364                 size_t bufsize;
8365                 void * bufptr;
8366                 uio_t auio;
8367                 struct direntry *entry64;
8368                 struct dirent *dep;
8369                 int bytesread;
8370                 int error;
8371
8372                 /*
8373                  * We're here because the underlying file system does not
8374                  * support direnties or we mounted denying support so we must
8375                  * fall back to dirents and convert them to direntries.
8376                  *
8377                  * Our kernel buffer needs to be smaller since re-packing will
8378                  * expand each dirent.  The worse case (when the name length
8379                  * is 3 or less) corresponds to a struct direntry size of 32
8380                  * bytes (8-byte aligned) and a struct dirent size of 12 bytes
8381                  * (4-byte aligned).  So having a buffer that is 3/8 the size
8382                  * will prevent us from reading more than we can pack.
8383                  *
8384                  * Since this buffer is wired memory, we will limit the
8385                  * buffer size to a maximum of 32K. We would really like to
8386                  * use 32K in the MIN(), but we use magic number 87371 to
8387                  * prevent uio_resid() * 3 / 8 from overflowing.
8388                  */
8389                 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
8390                 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
8391                 if (bufptr == NULL) {
8392                         return ENOMEM;
8393                 }
8394
8395                 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
8396                 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
8397                 auio->uio_offset = uio->uio_offset;
8398
8399                 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
8400
8401                 dep = (struct dirent *)bufptr;
8402                 bytesread = bufsize - uio_resid(auio);
8403
8404                 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
8405                        M_TEMP, M_WAITOK);
8406                 /*
8407                  * Convert all the entries and copy them out to user's buffer.
8408                  */
8409                 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
8410                         size_t  enbufsize = DIRENT64_LEN(dep->d_namlen);
8411
8412                         if (DIRENT_END(dep) > ((char *)bufptr + bytesread) ||
8413                             DIRENT_LEN(dep->d_namlen) > dep->d_reclen) {
8414                                 printf("%s: %s: Bad dirent recived from directory %s\n", __func__,
8415                                        vp->v_mount->mnt_vfsstat.f_mntonname,
8416                                        vp->v_name ? vp->v_name : "<unknown>");
8417                                 error = EIO;
8418                                 break;
8419                         }
8420
8421                         bzero(entry64, enbufsize);
8422                         /* Convert a dirent to a dirent64. */
8423                         entry64->d_ino = dep->d_ino;
8424                         entry64->d_seekoff = 0;
8425                         entry64->d_reclen = enbufsize;
8426                         entry64->d_namlen = dep->d_namlen;
8427                         entry64->d_type = dep->d_type;
8428                         bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
8429
8430                         /* Move to next entry. */
8431                         dep = (struct dirent *)((char *)dep + dep->d_reclen);
8432
8433                         /* Copy entry64 to user's buffer. */
8434                         error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
8435                 }
8436
8437                 /* Update the real offset using the offset we got from VNOP_READDIR. */
8438                 if (error == 0) {
8439                         uio->uio_offset = auio->uio_offset;
8440                 }
8441                 uio_free(auio);
8442                 FREE(bufptr, M_TEMP);
8443                 FREE(entry64, M_TEMP);
8444                 return (error);
8445         }
8446 }
8447
8448 #define GETDIRENTRIES_MAXBUFSIZE        (128 * 1024 * 1024U)
8449
8450 /*
8451  * Read a block of directory entries in a file system independent format.
8452  */
8453 static int
8454 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
8455                      off_t *offset, int flags)
8456 {
8457         vnode_t vp;
8458         struct vfs_context context = *vfs_context_current();    /* local copy */
8459         struct fileproc *fp;
8460         uio_t auio;
8461         int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8462         off_t loff;
8463         int error, eofflag, numdirent;
8464         char uio_buf[ UIO_SIZEOF(1) ];
8465
8466         error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
8467         if (error) {
8468                 return (error);
8469         }
8470         if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8471                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8472                 error = EBADF;
8473                 goto out;
8474         }
8475
8476         if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
8477                 bufsize = GETDIRENTRIES_MAXBUFSIZE;
8478
8479 #if CONFIG_MACF
8480         error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
8481         if (error)
8482                 goto out;
8483 #endif
8484         if ( (error = vnode_getwithref(vp)) ) {
8485                 goto out;
8486         }
8487         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8488
8489 unionread:
8490         if (vp->v_type != VDIR) {
8491                 (void)vnode_put(vp);
8492                 error = EINVAL;
8493                 goto out;
8494         }
8495
8496 #if CONFIG_MACF
8497         error = mac_vnode_check_readdir(&context, vp);
8498         if (error != 0) {
8499                 (void)vnode_put(vp);
8500                 goto out;
8501         }
8502 #endif /* MAC */
8503
8504         loff = fp->f_fglob->fg_offset;
8505         auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8506         uio_addiov(auio, bufp, bufsize);
8507
8508         if (flags & VNODE_READDIR_EXTENDED) {
8509                 error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
8510                 fp->f_fglob->fg_offset = uio_offset(auio);
8511         } else {
8512                 error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
8513                 fp->f_fglob->fg_offset = uio_offset(auio);
8514         }
8515         if (error) {
8516                 (void)vnode_put(vp);
8517                 goto out;
8518         }
8519
8520         if ((user_ssize_t)bufsize == uio_resid(auio)){
8521                 if (union_dircheckp) {
8522                         error = union_dircheckp(&vp, fp, &context);
8523                         if (error == -1)
8524                                 goto unionread;
8525                         if (error) {
8526                                 (void)vnode_put(vp);
8527                                 goto out;
8528                         }
8529                 }
8530
8531                 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
8532                         struct vnode *tvp = vp;
8533                         if (lookup_traverse_union(tvp, &vp, &context) == 0) {
8534                                 vnode_ref(vp);
8535                                 fp->f_fglob->fg_data = (caddr_t) vp;
8536                                 fp->f_fglob->fg_offset = 0;
8537                                 vnode_rele(tvp);
8538                                 vnode_put(tvp);
8539                                 goto unionread;
8540                         }
8541                         vp = tvp;
8542                 }
8543         }
8544
8545         vnode_put(vp);
8546         if (offset) {
8547                 *offset = loff;
8548         }
8549
8550         *bytesread = bufsize - uio_resid(auio);
8551 out:
8552         file_drop(fd);
8553         return (error);
8554 }
8555
8556
8557 int
8558 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
8559 {
8560         off_t offset;
8561         ssize_t bytesread;
8562         int error;
8563
8564         AUDIT_ARG(fd, uap->fd);
8565         error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
8566
8567         if (error == 0) {
8568                 if (proc_is64bit(p)) {
8569                         user64_long_t base = (user64_long_t)offset;
8570                         error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
8571                 } else {
8572                         user32_long_t base = (user32_long_t)offset;
8573                         error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
8574                 }
8575                 *retval = bytesread;
8576         }
8577         return (error);
8578 }
8579
8580 int
8581 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
8582 {
8583         off_t offset;
8584         ssize_t bytesread;
8585         int error;
8586
8587         AUDIT_ARG(fd, uap->fd);
8588         error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
8589
8590         if (error == 0) {
8591                 *retval = bytesread;
8592                 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
8593         }
8594         return (error);
8595 }
8596
8597
8598 /*
8599  * Set the mode mask for creation of filesystem nodes.
8600  * XXX implement xsecurity
8601  */
8602 #define UMASK_NOXSECURITY        (void *)1      /* leave existing xsecurity alone */
8603 static int
8604 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
8605 {
8606         struct filedesc *fdp;
8607
8608         AUDIT_ARG(mask, newmask);
8609         proc_fdlock(p);
8610         fdp = p->p_fd;
8611         *retval = fdp->fd_cmask;
8612         fdp->fd_cmask = newmask & ALLPERMS;
8613         proc_fdunlock(p);
8614         return (0);
8615 }
8616
8617 /*
8618  * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
8619  *
8620  * Parameters:    p                       Process requesting to set the umask
8621  *                uap                     User argument descriptor (see below)
8622  *                retval                  umask of the process (parameter p)
8623  *
8624  * Indirect:      uap->newmask            umask to set
8625  *                uap->xsecurity          ACL to set
8626  *
8627  * Returns:        0                      Success
8628  *                !0                      Not success
8629  *
8630  */
8631 int
8632 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
8633 {
8634         int ciferror;
8635         kauth_filesec_t xsecdst;
8636
8637         xsecdst = KAUTH_FILESEC_NONE;
8638         if (uap->xsecurity != USER_ADDR_NULL) {
8639                 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
8640                         return ciferror;
8641         } else {
8642                 xsecdst = KAUTH_FILESEC_NONE;
8643         }
8644
8645         ciferror = umask1(p, uap->newmask, xsecdst, retval);
8646
8647         if (xsecdst != KAUTH_FILESEC_NONE)
8648                 kauth_filesec_free(xsecdst);
8649         return ciferror;
8650 }
8651
8652 int
8653 umask(proc_t p, struct umask_args *uap, int32_t *retval)
8654 {
8655         return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
8656 }
8657
8658 /*
8659  * Void all references to file by ripping underlying filesystem
8660  * away from vnode.
8661  */
8662 /* ARGSUSED */
8663 int
8664 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
8665 {
8666         vnode_t vp;
8667         struct vnode_attr va;
8668         vfs_context_t ctx = vfs_context_current();
8669         int error;
8670         struct nameidata nd;
8671
8672         NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
8673                uap->path, ctx);
8674         error = namei(&nd);
8675         if (error)
8676                 return (error);
8677         vp = nd.ni_vp;
8678
8679         nameidone(&nd);
8680
8681         if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
8682                 error = ENOTSUP;
8683                 goto out;
8684         }
8685
8686         if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
8687                 error = EBUSY;
8688                 goto out;
8689         }
8690
8691 #if CONFIG_MACF
8692         error = mac_vnode_check_revoke(ctx, vp);
8693         if (error)
8694                 goto out;
8695 #endif
8696
8697         VATTR_INIT(&va);
8698         VATTR_WANTED(&va, va_uid);
8699         if ((error = vnode_getattr(vp, &va, ctx)))
8700                 goto out;
8701         if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
8702             (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
8703                 goto out;
8704         if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
8705                 VNOP_REVOKE(vp, REVOKEALL, ctx);
8706 out:
8707         vnode_put(vp);
8708         return (error);
8709 }
8710
8711
8712 /*
8713  *  HFS/HFS PlUS SPECIFIC SYSTEM CALLS
8714  *  The following system calls are designed to support features
8715  *  which are specific to the HFS & HFS Plus volume formats
8716  */
8717
8718
8719 /*
8720  * Obtain attribute information on objects in a directory while enumerating
8721  * the directory.
8722  */
8723 /* ARGSUSED */
8724 int
8725 getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
8726 {
8727         vnode_t vp;
8728         struct fileproc *fp;
8729         uio_t auio = NULL;
8730         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8731         uint32_t count = 0, savecount = 0;
8732         uint32_t newstate = 0;
8733         int error, eofflag;
8734         uint32_t loff = 0;
8735         struct attrlist attributelist;
8736         vfs_context_t ctx = vfs_context_current();
8737         int fd = uap->fd;
8738         char uio_buf[ UIO_SIZEOF(1) ];
8739         kauth_action_t action;
8740
8741         AUDIT_ARG(fd, fd);
8742
8743         /* Get the attributes into kernel space */
8744         if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
8745                 return(error);
8746         }
8747         if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
8748                 return(error);
8749         }
8750         savecount = count;
8751         if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
8752                 return (error);
8753         }
8754         if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8755                 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8756                 error = EBADF;
8757                 goto out;
8758         }
8759
8760
8761 #if CONFIG_MACF
8762         error = mac_file_check_change_offset(vfs_context_ucred(ctx),
8763             fp->f_fglob);
8764         if (error)
8765                 goto out;
8766 #endif
8767
8768
8769         if ( (error = vnode_getwithref(vp)) )
8770                 goto out;
8771
8772         AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8773
8774 unionread:
8775         if (vp->v_type != VDIR) {
8776                 (void)vnode_put(vp);
8777                 error = EINVAL;
8778                 goto out;
8779         }
8780
8781 #if CONFIG_MACF
8782         error = mac_vnode_check_readdir(ctx, vp);
8783         if (error != 0) {
8784                 (void)vnode_put(vp);
8785                 goto out;
8786         }
8787 #endif /* MAC */
8788
8789         /* set up the uio structure which will contain the users return buffer */
8790         loff = fp->f_fglob->fg_offset;
8791         auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8792         uio_addiov(auio, uap->buffer, uap->buffersize);
8793
8794         /*
8795          * If the only item requested is file names, we can let that past with
8796          * just LIST_DIRECTORY.  If they want any other attributes, that means
8797          * they need SEARCH as well.
8798          */
8799         action = KAUTH_VNODE_LIST_DIRECTORY;
8800         if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
8801             attributelist.fileattr || attributelist.dirattr)
8802                 action |= KAUTH_VNODE_SEARCH;
8803
8804         if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
8805
8806                 /* Believe it or not, uap->options only has 32-bits of valid
8807                  * info, so truncate before extending again */
8808
8809                 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
8810                                 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
8811         }
8812
8813         if (error) {
8814                 (void) vnode_put(vp);
8815                 goto out;
8816         }
8817
8818         /*
8819          * If we've got the last entry of a directory in a union mount
8820          * then reset the eofflag and pretend there's still more to come.
8821          * The next call will again set eofflag and the buffer will be empty,
8822          * so traverse to the underlying directory and do the directory
8823          * read there.
8824          */
8825         if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
8826                 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
8827                         eofflag = 0;
8828                 } else {                                                // Empty buffer
8829                         struct vnode *tvp = vp;
8830                         if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
8831                                 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
8832                                 fp->f_fglob->fg_data = (caddr_t) vp;
8833                                 fp->f_fglob->fg_offset = 0; // reset index for new dir
8834                                 count = savecount;
8835                                 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
8836                                 vnode_put(tvp);
8837                                 goto unionread;
8838                         }
8839                         vp = tvp;
8840                 }
8841         }
8842
8843         (void)vnode_put(vp);
8844
8845         if (error)
8846                 goto out;
8847         fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
8848
8849         if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
8850                 goto out;
8851         if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
8852                 goto out;
8853         if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
8854                 goto out;
8855
8856         *retval = eofflag;  /* similar to getdirentries */
8857         error = 0;
8858 out:
8859         file_drop(fd);
8860         return (error); /* return error earlier, an retval of 0 or 1 now */
8861
8862 } /* end of getdirentriesattr system call */
8863
8864 /*
8865 * Exchange data between two files
8866 */
8867
8868 /* ARGSUSED */
8869 int
8870 exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
8871 {
8872
8873         struct nameidata fnd, snd;
8874         vfs_context_t ctx = vfs_context_current();
8875         vnode_t fvp;
8876         vnode_t svp;
8877         int error;
8878         u_int32_t nameiflags;
8879         char *fpath = NULL;
8880         char *spath = NULL;
8881         int   flen=0, slen=0;
8882         int from_truncated=0, to_truncated=0;
8883 #if CONFIG_FSE
8884         fse_info f_finfo, s_finfo;
8885 #endif
8886
8887         nameiflags = 0;
8888         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8889
8890         NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
8891                UIO_USERSPACE, uap->path1, ctx);
8892
8893         error = namei(&fnd);
8894         if (error)
8895                 goto out2;
8896
8897         nameidone(&fnd);
8898         fvp = fnd.ni_vp;
8899
8900         NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
8901                UIO_USERSPACE, uap->path2, ctx);
8902
8903         error = namei(&snd);
8904         if (error) {
8905                 vnode_put(fvp);
8906                 goto out2;
8907         }
8908         nameidone(&snd);
8909         svp = snd.ni_vp;
8910
8911         /*
8912          * if the files are the same, return an inval error
8913          */
8914         if (svp == fvp) {
8915                 error = EINVAL;
8916                 goto out;
8917         }
8918
8919         /*
8920          * if the files are on different volumes, return an error
8921          */
8922         if (svp->v_mount != fvp->v_mount) {
8923                 error = EXDEV;
8924                 goto out;
8925         }
8926
8927         /* If they're not files, return an error */
8928         if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
8929                 error = EINVAL;
8930                 goto out;
8931         }
8932
8933 #if CONFIG_MACF
8934         error = mac_vnode_check_exchangedata(ctx,
8935             fvp, svp);
8936         if (error)
8937                 goto out;
8938 #endif
8939         if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
8940             ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
8941                 goto out;
8942
8943         if (
8944 #if CONFIG_FSE
8945         need_fsevent(FSE_EXCHANGE, fvp) ||
8946 #endif
8947         kauth_authorize_fileop_has_listeners()) {
8948                 GET_PATH(fpath);
8949                 GET_PATH(spath);
8950                 if (fpath == NULL || spath == NULL) {
8951                         error = ENOMEM;
8952                         goto out;
8953                 }
8954
8955                 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
8956                 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
8957
8958 #if CONFIG_FSE
8959                 get_fse_info(fvp, &f_finfo, ctx);
8960                 get_fse_info(svp, &s_finfo, ctx);
8961                 if (from_truncated || to_truncated) {
8962                         // set it here since only the f_finfo gets reported up to user space
8963                         f_finfo.mode |= FSE_TRUNCATED_PATH;
8964                 }
8965 #endif
8966         }
8967         /* Ok, make the call */
8968         error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
8969
8970         if (error == 0) {
8971             const char *tmpname;
8972
8973             if (fpath != NULL && spath != NULL) {
8974                     /* call out to allow 3rd party notification of exchangedata.
8975                      * Ignore result of kauth_authorize_fileop call.
8976                      */
8977                     kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
8978                                            (uintptr_t)fpath, (uintptr_t)spath);
8979             }
8980             name_cache_lock();
8981
8982             tmpname     = fvp->v_name;
8983             fvp->v_name = svp->v_name;
8984             svp->v_name = tmpname;
8985
8986             if (fvp->v_parent != svp->v_parent) {
8987                 vnode_t tmp;
8988
8989                 tmp           = fvp->v_parent;
8990                 fvp->v_parent = svp->v_parent;
8991                 svp->v_parent = tmp;
8992             }
8993             name_cache_unlock();
8994
8995 #if CONFIG_FSE
8996             if (fpath != NULL && spath != NULL) {
8997                     add_fsevent(FSE_EXCHANGE, ctx,
8998                                 FSE_ARG_STRING, flen, fpath,
8999                                 FSE_ARG_FINFO, &f_finfo,
9000                                 FSE_ARG_STRING, slen, spath,
9001                                 FSE_ARG_FINFO, &s_finfo,
9002                                 FSE_ARG_DONE);
9003             }
9004 #endif
9005         }
9006
9007 out:
9008         if (fpath != NULL)
9009                 RELEASE_PATH(fpath);
9010         if (spath != NULL)
9011                 RELEASE_PATH(spath);
9012         vnode_put(svp);
9013         vnode_put(fvp);
9014 out2:
9015         return (error);
9016 }
9017
9018 /*
9019  * Return (in MB) the amount of freespace on the given vnode's volume.
9020  */
9021 uint32_t freespace_mb(vnode_t vp);
9022
9023 uint32_t
9024 freespace_mb(vnode_t vp)
9025 {
9026         vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
9027         return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
9028                 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
9029 }
9030
9031 #if CONFIG_SEARCHFS
9032
9033 /* ARGSUSED */
9034
9035 int
9036 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
9037 {
9038         vnode_t vp, tvp;
9039         int i, error=0;
9040         int fserror = 0;
9041         struct nameidata nd;
9042         struct user64_fssearchblock searchblock;
9043         struct searchstate *state;
9044         struct attrlist *returnattrs;
9045         struct timeval timelimit;
9046         void *searchparams1,*searchparams2;
9047         uio_t auio = NULL;
9048         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
9049         uint32_t nummatches;
9050         int mallocsize;
9051         uint32_t nameiflags;
9052         vfs_context_t ctx = vfs_context_current();
9053         char uio_buf[ UIO_SIZEOF(1) ];
9054
9055         /* Start by copying in fsearchblock parameter list */
9056     if (IS_64BIT_PROCESS(p)) {
9057         error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
9058         timelimit.tv_sec = searchblock.timelimit.tv_sec;
9059         timelimit.tv_usec = searchblock.timelimit.tv_usec;
9060     }
9061     else {
9062         struct user32_fssearchblock tmp_searchblock;
9063
9064         error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
9065         // munge into 64-bit version
9066         searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
9067         searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
9068         searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
9069         searchblock.maxmatches = tmp_searchblock.maxmatches;
9070                 /*
9071                  * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
9072                  * from a 32 bit long, and tv_usec is already a signed 32 bit int.
9073                  */
9074         timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
9075         timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
9076         searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
9077         searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
9078         searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
9079         searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
9080         searchblock.searchattrs = tmp_searchblock.searchattrs;
9081     }
9082         if (error)
9083                 return(error);
9084
9085         /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
9086          */
9087         if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
9088                 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
9089                 return(EINVAL);
9090
9091         /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
9092         /* It all has to do into local memory and it's not that big so we might as well  put it all together. */
9093         /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
9094         /* block.                                                                                             */
9095         /*                                                                                                    */
9096         /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate        */
9097         /*       due to the changes in rdar://problem/12438273.  That way if a 3rd party file system          */
9098         /*       assumes the size is still 556 bytes it will continue to work                                 */
9099
9100         mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
9101                 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
9102
9103         MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
9104
9105         /* Now set up the various pointers to the correct place in our newly allocated memory */
9106
9107         searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
9108         returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
9109         state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
9110
9111         /* Now copy in the stuff given our local variables. */
9112
9113         if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
9114                 goto freeandexit;
9115
9116         if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
9117                 goto freeandexit;
9118
9119         if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
9120                 goto freeandexit;
9121
9122         if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
9123                 goto freeandexit;
9124
9125         /*
9126          * When searching a union mount, need to set the
9127          * start flag at the first call on each layer to
9128          * reset state for the new volume.
9129          */
9130         if (uap->options & SRCHFS_START)
9131                 state->ss_union_layer = 0;
9132         else
9133                 uap->options |= state->ss_union_flags;
9134         state->ss_union_flags = 0;
9135
9136         /*
9137          * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
9138          * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
9139          * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
9140          * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
9141          * validate the user-supplied data offset of the attrreference_t, we'll do it here.
9142          */
9143
9144         if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
9145                 attrreference_t* string_ref;
9146                 u_int32_t* start_length;
9147                 user64_size_t param_length;
9148
9149                 /* validate searchparams1 */
9150                 param_length = searchblock.sizeofsearchparams1;
9151                 /* skip the word that specifies length of the buffer */
9152                 start_length= (u_int32_t*) searchparams1;
9153                 start_length= start_length+1;
9154                 string_ref= (attrreference_t*) start_length;
9155
9156                 /* ensure no negative offsets or too big offsets */
9157                 if (string_ref->attr_dataoffset < 0 ) {
9158                         error = EINVAL;
9159                         goto freeandexit;
9160                 }
9161                 if (string_ref->attr_length > MAXPATHLEN) {
9162                         error = EINVAL;
9163                         goto freeandexit;
9164                 }
9165
9166                 /* Check for pointer overflow in the string ref */
9167                 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
9168                         error = EINVAL;
9169                         goto freeandexit;
9170                 }
9171
9172                 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
9173                         error = EINVAL;
9174                         goto freeandexit;
9175                 }
9176                 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
9177                         error = EINVAL;
9178                         goto freeandexit;
9179                 }
9180         }
9181
9182         /* set up the uio structure which will contain the users return buffer */
9183         auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
9184         uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
9185
9186         nameiflags = 0;
9187         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
9188         NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
9189                UIO_USERSPACE, uap->path, ctx);
9190
9191         error = namei(&nd);
9192         if (error)
9193                 goto freeandexit;
9194         vp = nd.ni_vp;
9195         nameidone(&nd);
9196
9197         /*
9198          * Switch to the root vnode for the volume
9199          */
9200         error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
9201         vnode_put(vp);
9202         if (error)
9203                 goto freeandexit;
9204         vp = tvp;
9205
9206         /*
9207          * If it's a union mount, the path lookup takes
9208          * us to the top layer. But we may need to descend
9209          * to a lower layer. For non-union mounts the layer
9210          * is always zero.
9211          */
9212         for (i = 0; i < (int) state->ss_union_layer; i++) {
9213                 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
9214                         break;
9215                 tvp = vp;
9216                 vp = vp->v_mount->mnt_vnodecovered;
9217                 if (vp == NULL) {
9218                         vnode_put(tvp);
9219                         error = ENOENT;
9220                         goto freeandexit;
9221                 }
9222                 error = vnode_getwithref(vp);
9223                 vnode_put(tvp);
9224                 if (error)
9225                         goto freeandexit;
9226         }
9227
9228 #if CONFIG_MACF
9229         error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
9230         if (error) {
9231                 vnode_put(vp);
9232                 goto freeandexit;
9233         }
9234 #endif
9235
9236
9237         /*
9238          * If searchblock.maxmatches == 0, then skip the search. This has happened
9239          * before and sometimes the underlying code doesn't deal with it well.
9240          */
9241          if (searchblock.maxmatches == 0) {
9242                 nummatches = 0;
9243                 goto saveandexit;
9244          }
9245
9246         /*
9247          * All right, we have everything we need, so let's make that call.
9248          *
9249          * We keep special track of the return value from the file system:
9250          * EAGAIN is an acceptable error condition that shouldn't keep us
9251          * from copying out any results...
9252          */
9253
9254         fserror = VNOP_SEARCHFS(vp,
9255                 searchparams1,
9256                 searchparams2,
9257                 &searchblock.searchattrs,
9258                 (u_long)searchblock.maxmatches,
9259                 &timelimit,
9260                 returnattrs,
9261                 &nummatches,
9262                 (u_long)uap->scriptcode,
9263                 (u_long)uap->options,
9264                 auio,
9265                 (struct searchstate *) &state->ss_fsstate,
9266                 ctx);
9267
9268         /*
9269          * If it's a union mount we need to be called again
9270          * to search the mounted-on filesystem.
9271          */
9272         if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
9273                 state->ss_union_flags = SRCHFS_START;
9274                 state->ss_union_layer++;        // search next layer down
9275                 fserror = EAGAIN;
9276         }
9277
9278 saveandexit:
9279
9280         vnode_put(vp);
9281
9282         /* Now copy out the stuff that needs copying out. That means the number of matches, the
9283            search state.  Everything was already put into the return buffer by the vop call. */
9284
9285         if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
9286                 goto freeandexit;
9287
9288         if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
9289                 goto freeandexit;
9290
9291         error = fserror;
9292
9293 freeandexit:
9294
9295         FREE(searchparams1,M_TEMP);
9296
9297         return(error);
9298
9299
9300 } /* end of searchfs system call */
9301
9302 #else /* CONFIG_SEARCHFS */
9303
9304 int
9305 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
9306 {
9307         return (ENOTSUP);
9308 }
9309
9310 #endif /* CONFIG_SEARCHFS */
9311
9312
/*
 * Lock attribute, lock group attribute and lock group used for the two
 * namespace-handler mutexes below.  All of them are set up in
 * nspace_handler_init().
 */
9313 lck_grp_attr_t *  nspace_group_attr;
9314 lck_attr_t *      nspace_lock_attr;
9315 lck_grp_t *       nspace_mutex_group;
9316
/* held around accesses to nspace_items[]; also the msleep() interlock for the waiters in this file */
9317 lck_mtx_t         nspace_handler_lock;
/* held while testing/setting/clearing a handler's handler_busy flag */
9318 lck_mtx_t         nspace_handler_exclusion_lock;
9319
/* compared against a file's change time in nspace_snapshot_event(); 0 means no snapshot events are posted */
9320 time_t snapshot_timestamp=0;
/* non-zero lets snapshot events be posted for vnodes on MNTK_VIRTUALDEV mounts as well */
9321 int nspace_allow_virtual_devs=0;
9322
9323 void nspace_handler_init(void);
9324
/*
 * One pending namespace event.  Entries live in the fixed-size
 * nspace_items[] table; an entry whose flags are 0 is free for re-use.
 */
9325 typedef struct nspace_item_info {
             /* vnode the event is about; the address of this field is the sleep channel for the item's waiters */
9326         struct vnode *vp;
             /* optional argument stored by resolve_nspace_item_ext(); read back as a uio_t by wait_for_namespace_event() */
9327         void         *arg;
             /* operation exactly as passed by the requester; copied out to the handler */
9328         uint64_t      op;
             /* vnode_vid(vp) sampled when the item was posted; used later with vnode_getwithvid() */
9329         uint32_t      vid;
             /* NSPACE_ITEM_* bits */
9330         uint32_t      flags;
             /* set from ++nspace_token_id when a handler claims the item */
9331         uint32_t      token;
             /* number of threads in resolve_nspace_item_ext() that are using this entry */
9332         uint32_t      refcount;
9333 } nspace_item_info;
9334
9335 #define MAX_NSPACE_ITEMS   128
9336 nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
9337 uint32_t      nspace_item_idx=0;              // also used as the sleep/wakeup rendezvous address
/* token counter; its address is also the sleep channel for threads waiting for a free table slot */
9338 uint32_t      nspace_token_id=0;
9339 uint32_t      nspace_handler_timeout = 15;    // seconds
9340
/*
 * Item state bits:
 *   NEW         - posted, not yet claimed by a handler
 *   PROCESSING  - claimed by a handler (replaces NEW)
 *   CANCELLED   - the waiter returns the item's token value as its error
 *   DONE        - the waiter returns 0
 *   RESET_TIMER - on a timeout the waiter clears this bit and waits again
 * NSPACE_ITEM_DEAD is not referenced in the code visible around these definitions.
 */
9341 #define NSPACE_ITEM_NEW         0x0001
9342 #define NSPACE_ITEM_PROCESSING  0x0002
9343 #define NSPACE_ITEM_DEAD        0x0004
9344 #define NSPACE_ITEM_CANCELLED   0x0008
9345 #define NSPACE_ITEM_DONE        0x0010
9346 #define NSPACE_ITEM_RESET_TIMER 0x0020
9347
/* Event-type bits: which kind of handler the item is destined for */
9348 #define NSPACE_ITEM_NSPACE_EVENT   0x0040
9349 #define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
9350
9351 #define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
9352
9353 //#pragma optimization_level 0
9354
/*
 * Kinds of namespace handler.  The values double as indices into
 * nspace_handlers[]; NSPACE_HANDLER_COUNT is the number of kinds.
 */
9355 typedef enum {
9356         NSPACE_HANDLER_NSPACE = 0,
9357         NSPACE_HANDLER_SNAPSHOT = 1,
9358
9359         NSPACE_HANDLER_COUNT,
9360 } nspace_type_t;
9361
/*
 * Registration record for one handler kind.
 *   handler_tid  - thread id recorded when the handler registered
 *   handler_proc - registered handler process, NULL when there is none
 *   handler_busy - non-zero while a thread is inside wait_for_namespace_event()
 *                  for this kind
 */
9362 typedef struct {
9363         uint64_t handler_tid;
9364         struct proc *handler_proc;
9365         int handler_busy;
9366 } nspace_handler_t;
9367
9368 nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
9369
9370 /* namespace fsctl functions */
9371 static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
9372 static int nspace_item_flags_for_type(nspace_type_t nspace_type);
9373 static int nspace_open_flags_for_type(nspace_type_t nspace_type);
9374 static nspace_type_t nspace_type_for_op(uint64_t op);
9375 static int nspace_is_special_process(struct proc *proc);
9376 static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
9377 static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
9378 static int validate_namespace_args (int is64bit, int size);
9379 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
9380
9381
9382 static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
9383 {
9384         switch(nspace_type) {
9385                 case NSPACE_HANDLER_NSPACE:
9386                         return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
9387                 case NSPACE_HANDLER_SNAPSHOT:
9388                         return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
9389                 default:
9390                         printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
9391                         return 0;
9392         }
9393 }
9394
9395 static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
9396 {
9397         switch(nspace_type) {
9398                 case NSPACE_HANDLER_NSPACE:
9399                         return NSPACE_ITEM_NSPACE_EVENT;
9400                 case NSPACE_HANDLER_SNAPSHOT:
9401                         return NSPACE_ITEM_SNAPSHOT_EVENT;
9402                 default:
9403                         printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
9404                         return 0;
9405         }
9406 }
9407
9408 static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
9409 {
9410         switch(nspace_type) {
9411                 case NSPACE_HANDLER_NSPACE:
9412                         return FREAD | FWRITE | O_EVTONLY;
9413                 case NSPACE_HANDLER_SNAPSHOT:
9414                         return FREAD | O_EVTONLY;
9415                 default:
9416                         printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
9417                         return 0;
9418         }
9419 }
9420
9421 static inline nspace_type_t nspace_type_for_op(uint64_t op)
9422 {
9423         switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
9424                 case NAMESPACE_HANDLER_NSPACE_EVENT:
9425                         return NSPACE_HANDLER_NSPACE;
9426                 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
9427                         return NSPACE_HANDLER_SNAPSHOT;
9428                 default:
9429                         printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
9430                         return NSPACE_HANDLER_NSPACE;
9431         }
9432 }
9433
9434 static inline int nspace_is_special_process(struct proc *proc)
9435 {
9436         int i;
9437         for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9438                 if (proc == nspace_handlers[i].handler_proc)
9439                         return 1;
9440         }
9441         return 0;
9442 }
9443
9444 void
9445 nspace_handler_init(void)
9446 {
9447         nspace_lock_attr    = lck_attr_alloc_init();
9448         nspace_group_attr   = lck_grp_attr_alloc_init();
9449         nspace_mutex_group  = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
9450         lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
9451         lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
9452         memset(&nspace_items[0], 0, sizeof(nspace_items));
9453 }
9454
9455 void
9456 nspace_proc_exit(struct proc *p)
9457 {
9458         int i, event_mask = 0;
9459
9460         for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9461                 if (p == nspace_handlers[i].handler_proc) {
9462                         event_mask |= nspace_item_flags_for_type(i);
9463                         nspace_handlers[i].handler_tid = 0;
9464                         nspace_handlers[i].handler_proc = NULL;
9465                 }
9466         }
9467
9468         if (event_mask == 0) {
9469                 return;
9470         }
9471
9472         lck_mtx_lock(&nspace_handler_lock);
9473         if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
9474                 // if this process was the snapshot handler, zero snapshot_timeout
9475                 snapshot_timestamp = 0;
9476         }
9477
9478         //
9479         // unblock anyone that's waiting for the handler that died
9480         //
9481         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9482                 if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
9483
9484                         if ( nspace_items[i].flags & event_mask ) {
9485
9486                                 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9487                                         vnode_lock_spin(nspace_items[i].vp);
9488                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9489                                         vnode_unlock(nspace_items[i].vp);
9490                                 }
9491                                 nspace_items[i].vp = NULL;
9492                                 nspace_items[i].vid = 0;
9493                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9494                                 nspace_items[i].token = 0;
9495
9496                                 wakeup((caddr_t)&(nspace_items[i].vp));
9497                         }
9498                 }
9499         }
9500
9501         wakeup((caddr_t)&nspace_item_idx);
9502         lck_mtx_unlock(&nspace_handler_lock);
9503 }
9504
9505
/*
 * Convenience form of resolve_nspace_item_ext() for callers that have no
 * extra argument to pass: forwards vp and op with a NULL arg and returns
 * whatever the extended routine returns.
 */
int
resolve_nspace_item(struct vnode *vp, uint64_t op)
{
        void *no_arg = NULL;

        return resolve_nspace_item_ext(vp, op, no_arg);
}
9511
/*
 * Post a namespace event for vp and sleep until the matching handler
 * finishes with it, cancels it, or the wait times out.
 *
 *   vp  - vnode the event concerns; only VREG, VDIR and VLNK vnodes
 *         generate events.
 *   op  - operation word; nspace_type_for_op() derives the handler kind
 *         from it.
 *   arg - stored in the item (NSPACE_REARM_NO_ARG is stored as NULL).  For
 *         snapshot events a non-NULL arg also sets VNEEDSSNAPSHOT on vp.
 *
 * Returns:
 *   0          no event was needed, no handler is registered for this
 *              kind, or the item was marked NSPACE_ITEM_DONE
 *   EDEADLK    the caller is itself one of the handler processes
 *   ETIMEDOUT  the wait for the handler timed out and the item's timer
 *              had not been reset
 *   the item's token value when the item was marked NSPACE_ITEM_CANCELLED
 *   otherwise the error returned by msleep() (this includes the timeout
 *   error from waiting for a free table slot)
 */
9512 int
9513 resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
9514 {
9515         int i, error, keep_waiting;
9516         struct timespec ts;
9517         nspace_type_t nspace_type = nspace_type_for_op(op);
9518
9519         // only allow namespace events on regular files, directories and symlinks.
9520         if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
9521                 return 0;
9522         }
9523
9524         //
9525         // if this is a snapshot event and the vnode is on a
9526         // disk image just pretend nothing happened since any
9527         // change to the disk image will cause the disk image
9528         // itself to get backed up and this avoids multi-way
9529         // deadlocks between the snapshot handler and the ever
9530         // popular diskimages-helper process.  the variable
9531         // nspace_allow_virtual_devs allows this behavior to
9532         // be overridden (for use by the Mobile TimeMachine
9533         // testing infrastructure which uses disk images)
9534         //
9535         if (   (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
9536             && (vp->v_mount != NULL)
9537             && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
9538             && !nspace_allow_virtual_devs) {
9539
9540                 return 0;
9541         }
9542
9543         // if (thread_tid(current_thread()) == namespace_handler_tid) {
             // no handler registered for this kind of event: nothing to wait for
9544         if (nspace_handlers[nspace_type].handler_proc == NULL) {
9545                 return 0;
9546         }
9547
             // a handler process must not block waiting on a handler
9548         if (nspace_is_special_process(current_proc())) {
9549                 return EDEADLK;
9550         }
9551
9552         lck_mtx_lock(&nspace_handler_lock);
9553
9554 retry:
             // first look for an existing entry for the same vnode and operation
9555         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9556                 if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
9557                         break;
9558                 }
9559         }
9560
             // none found: look for a free slot (flags == 0); otherwise share the existing entry
9561         if (i >= MAX_NSPACE_ITEMS) {
9562                 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9563                         if (nspace_items[i].flags == 0) {
9564                                 break;
9565                         }
9566                 }
9567         } else {
9568                 nspace_items[i].refcount++;
9569         }
9570
             // table is full: sleep on &nspace_token_id until a slot is released or the timeout expires
9571         if (i >= MAX_NSPACE_ITEMS) {
9572                 ts.tv_sec = nspace_handler_timeout;
9573                 ts.tv_nsec = 0;
9574
9575                 error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
9576                 if (error == 0) {
9577                         // an entry got free'd up, go see if we can get a slot
9578                         goto retry;
9579                 } else {
9580                         lck_mtx_unlock(&nspace_handler_lock);
9581                         return error;
9582                 }
9583         }
9584
9585         //
9586         // if it didn't already exist, add it.  if it did exist
9587         // we'll get woken up when someone does a wakeup() on
9588         // the slot in the nspace_items table.
9589         //
9590         if (vp != nspace_items[i].vp) {
9591                 nspace_items[i].vp = vp;
9592                 nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg;  // arg is {NULL, true, uio *} - only pass uio thru to the user
9593                 nspace_items[i].op = op;
9594                 nspace_items[i].vid = vnode_vid(vp);
9595                 nspace_items[i].flags = NSPACE_ITEM_NEW;
9596                 nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
9597                 if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
9598                         if (arg) {
9599                                 vnode_lock_spin(vp);
9600                                 vp->v_flag |= VNEEDSSNAPSHOT;
9601                                 vnode_unlock(vp);
9602                         }
9603                 }
9604
9605                 nspace_items[i].token = 0;
9606                 nspace_items[i].refcount = 1;
9607
                     // wake a handler sleeping in wait_for_namespace_event() so it sees the new item
9608                 wakeup((caddr_t)&nspace_item_idx);
9609         }
9610
9611         //
9612         // Now go to sleep until the handler does a wakeup on this
9613         // slot in the nspace_items table (or we timeout).
9614         //
9615         keep_waiting = 1;
9616         while(keep_waiting) {
9617                 ts.tv_sec = nspace_handler_timeout;
9618                 ts.tv_nsec = 0;
9619                 error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);
9620
                     // the item's state takes precedence over whatever msleep() returned
9621                 if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
9622                         error = 0;
9623                 } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
9624                         error = nspace_items[i].token;
9625                 } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
                             // timed out: wait another period only if the timer was reset in the meantime
9626                         if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
9627                                 nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
9628                                 continue;
9629                         } else {
9630                                 error = ETIMEDOUT;
9631                         }
9632                 } else if (error == 0) {
9633                         // hmmm, why did we get woken up?
9634                         printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
9635                                nspace_items[i].token);
9636                 }
9637
                     // the last thread using the entry releases the slot
9638                 if (--nspace_items[i].refcount == 0) {
9639                         nspace_items[i].vp = NULL;     // clear this so that no one will match on it again
9640                         nspace_items[i].arg = NULL;
9641                         nspace_items[i].token = 0;     // clear this so that the handler will not find it anymore
9642                         nspace_items[i].flags = 0;     // this clears it for re-use
9643                 }
                     // wake threads sleeping on &nspace_token_id for a free slot
9644                 wakeup(&nspace_token_id);
9645                 keep_waiting = 0;
9646         }
9647
9648         lck_mtx_unlock(&nspace_handler_lock);
9649
9650         return error;
9651 }
9652
9653 int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
9654 {
9655         int snapshot_error = 0;
9656
9657         if (vp == NULL) {
9658                 return 0;
9659         }
9660
9661         /* Swap files are special; skip them */
9662         if (vnode_isswap(vp)) {
9663                 return 0;
9664         }
9665
9666         if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
9667                 // the change time is within this epoch
9668                 int error;
9669
9670                 error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
9671                 if (error == EDEADLK) {
9672                         snapshot_error = 0;
9673                 } else if (error) {
9674                         if (error == EAGAIN) {
9675                                 printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
9676                         } else if (error == EINTR) {
9677                                 // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
9678                                 snapshot_error = EINTR;
9679                         }
9680                 }
9681         }
9682
9683         return snapshot_error;
9684 }
9685
9686 int
9687 get_nspace_item_status(struct vnode *vp, int32_t *status)
9688 {
9689         int i;
9690
9691         lck_mtx_lock(&nspace_handler_lock);
9692         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9693                 if (nspace_items[i].vp == vp) {
9694                         break;
9695                 }
9696         }
9697
9698         if (i >= MAX_NSPACE_ITEMS) {
9699                 lck_mtx_unlock(&nspace_handler_lock);
9700                 return ENOENT;
9701         }
9702
9703         *status = nspace_items[i].flags;
9704         lck_mtx_unlock(&nspace_handler_lock);
9705         return 0;
9706 }
9707
9708
9709 #if 0
/*
 * Format a "/.vol/<fsid>/<fileid>" path for vp into the caller's buffer.
 *
 *   vp   - vnode whose va_fsid / va_fileid attributes are queried
 *   path - output buffer
 *   len  - in: size passed to snprintf(); out: snprintf()'s return value
 *          plus one
 *
 * Returns 0 on success.  If vnode_getattr() fails a placeholder path is
 * written instead and -1 is returned.
 *
 * NOTE: this function is enclosed in an "#if 0" region and is therefore
 * not compiled.
 */
9710 static int
9711 build_volfs_path(struct vnode *vp, char *path, int *len)
9712 {
9713         struct vnode_attr va;
9714         int ret;
9715
9716         VATTR_INIT(&va);
9717         VATTR_WANTED(&va, va_fsid);
9718         VATTR_WANTED(&va, va_fileid);
9719
9720         if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
9721                 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
9722                 ret = -1;
9723         } else {
9724                 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
9725                 ret = 0;
9726         }
9727
9728         return ret;
9729 }
9730 #endif
9731
9732 //
9733 // Note: this function does NOT check permissions on all of the
9734 // parent directories leading to this vnode.  It should only be
9735 // called on behalf of a root process.  Otherwise a process may
9736 // get access to a file because the file itself is readable even
9737 // though its parent directories would prevent access.
9738 //
9739 static int
9740 vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
9741 {
9742         int error, action;
9743
9744         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9745                 return error;
9746         }
9747
9748 #if CONFIG_MACF
9749         error = mac_vnode_check_open(ctx, vp, fmode);
9750         if (error)
9751                 return error;
9752 #endif
9753
9754         /* compute action to be authorized */
9755         action = 0;
9756         if (fmode & FREAD) {
9757                 action |= KAUTH_VNODE_READ_DATA;
9758         }
9759         if (fmode & (FWRITE | O_TRUNC)) {
9760                 /*
9761                  * If we are writing, appending, and not truncating,
9762                  * indicate that we are appending so that if the
9763                  * UF_APPEND or SF_APPEND bits are set, we do not deny
9764                  * the open.
9765                  */
9766                 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
9767                         action |= KAUTH_VNODE_APPEND_DATA;
9768                 } else {
9769                         action |= KAUTH_VNODE_WRITE_DATA;
9770                 }
9771         }
9772
9773         if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
9774                 return error;
9775
9776
9777         //
9778         // if the vnode is tagged VOPENEVT and the current process
9779         // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
9780         // flag to the open mode so that this open won't count against
9781         // the vnode when carbon delete() does a vnode_isinuse() to see
9782         // if a file is currently in use.  this allows spotlight
9783         // importers to not interfere with carbon apps that depend on
9784         // the no-delete-if-busy semantics of carbon delete().
9785         //
9786         if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
9787                 fmode |= O_EVTONLY;
9788         }
9789
9790         if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
9791                 return error;
9792         }
9793         if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
9794                 VNOP_CLOSE(vp, fmode, ctx);
9795                 return error;
9796         }
9797
9798         /* Call out to allow 3rd party notification of open.
9799          * Ignore result of kauth_authorize_fileop call.
9800          */
9801 #if CONFIG_MACF
9802         mac_vnode_notify_open(ctx, vp, fmode);
9803 #endif
9804         kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
9805                                (uintptr_t)vp, 0);
9806
9807
9808         return 0;
9809 }
9810
/*
 * Body of the "wait for the next event" request made by a namespace
 * handler.  Sleeps until an item of the requested kind is posted, claims
 * it, opens the item's vnode, allocates a file descriptor for it in the
 * calling process, and copies the item's token, operation word and the
 * descriptor index out to the user addresses held in nhd (plus the
 * optional info and object-id fields when nhd supplies addresses for them).
 *
 *   nhd         - user addresses to fill in (token, flags, fdptr and the
 *                 optional infoptr / objid)
 *   nspace_type - which handler kind the caller is acting as
 *
 * Returns:
 *   0       an item was handed to the caller
 *   EBUSY   another thread is already waiting for this handler kind
 *   EINVAL  snapshot handler only: snapshot_timestamp is 0 or ~0
 *   otherwise the error from msleep(), vnode_getwithvid(),
 *   vn_open_with_vp(), falloc(), copyout() or vnode_getattr()
 *
 * If a step fails after an item has been claimed, the item is marked
 * NSPACE_ITEM_DONE and its waiter is woken.
 */
9811 static int
9812 wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
9813 {
9814         int i;
9815         int error = 0;
9816         int unblock = 0;
9817         task_t curtask;
9818
             /* only one thread at a time may wait on behalf of a given handler kind */
9819         lck_mtx_lock(&nspace_handler_exclusion_lock);
9820         if (nspace_handlers[nspace_type].handler_busy) {
9821                 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9822                 return EBUSY;
9823         }
9824
9825         nspace_handlers[nspace_type].handler_busy = 1;
9826         lck_mtx_unlock(&nspace_handler_exclusion_lock);
9827
9828         /*
9829          * Any process that gets here will be one of the namespace handlers.
9830          * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation
9831          * as we can cause deadlocks to occur, because the namespace handler may prevent
9832          * VNOP_INACTIVE from proceeding.  Mark the current task as a P_DEPENDENCY_CAPABLE
9833          * process.
9834          */
9835         curtask = current_task();
9836         bsd_set_dependency_capable (curtask);
9837
9838         lck_mtx_lock(&nspace_handler_lock);
             /* the first caller for this kind becomes the registered handler */
9839         if (nspace_handlers[nspace_type].handler_proc == NULL) {
9840                 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
9841                 nspace_handlers[nspace_type].handler_proc = current_proc();
9842         }
9843
             /* a snapshot handler has nothing to wait for unless snapshot_timestamp holds a usable value */
9844         if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
9845                         (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9846                 error = EINVAL;
9847         }
9848
             /* loops only while waiting; once an item has been claimed the body ends in "break" */
9849         while (error == 0) {
9850
9851                 /* Try to find matching namespace item */
9852                 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
9853                         if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9854                                 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9855                                         break;
9856                                 }
9857                         }
9858                 }
9859
9860                 if (i >= MAX_NSPACE_ITEMS) {
9861                         /* Nothing is there yet. Wait for wake up and retry */
9862                         error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
9863                         if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9864                                 /* Prevent infinite loop if snapshot handler exited */
9865                                 error = EINVAL;
9866                                 break;
9867                         }
                             /* a non-zero msleep() error ends the loop through the while condition */
9868                         continue;
9869                 }
9870
                     /* claim the item: NEW -> PROCESSING, and give it a fresh token */
9871                 nspace_items[i].flags  &= ~NSPACE_ITEM_NEW;
9872                 nspace_items[i].flags  |= NSPACE_ITEM_PROCESSING;
9873                 nspace_items[i].token  = ++nspace_token_id;
9874
9875                 assert(nspace_items[i].vp);
9876                 struct fileproc *fp;
9877                 int32_t indx;
9878                 int32_t fmode;
9879                 struct proc *p = current_proc();
9880                 vfs_context_t ctx = vfs_context_current();
9881                 struct vnode_attr va;
                     /* these record how far setup got so the cleanup code undoes only what was done */
9882                 bool vn_get_succsessful = false;
9883                 bool vn_open_successful = false;
9884                 bool fp_alloc_successful = false;
9885
9886                 /*
9887                  * Use vnode pointer to acquire a file descriptor for
9888                  * hand-off to userland
9889                  */
9890                 fmode = nspace_open_flags_for_type(nspace_type);
9891                 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
9892                 if (error) goto cleanup;
9893                 vn_get_succsessful = true;
9894
9895                 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
9896                 if (error) goto cleanup;
9897                 vn_open_successful = true;
9898
9899                 error = falloc(p, &fp, &indx, ctx);
9900                 if (error) goto cleanup;
9901                 fp_alloc_successful = true;
9902
                     /* make the new file structure refer to the opened vnode */
9903                 fp->f_fglob->fg_flag = fmode;
9904                 fp->f_fglob->fg_ops = &vnops;
9905                 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
9906
                     /* NOTE(review): presumably this is what makes descriptor indx usable by the process - confirm against falloc()/procfdtbl_releasefd() */
9907                 proc_fdlock(p);
9908                 procfdtbl_releasefd(p, indx, NULL);
9909                 fp_drop(p, indx, fp, 1);
9910                 proc_fdunlock(p);
9911
9912                 /*
9913                  * All variants of the namespace handler struct support these three fields:
9914                  * token, flags, and the FD pointer
9915                  */
9916                 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
9917                 if (error) goto cleanup;
9918                 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
9919                 if (error) goto cleanup;
9920                 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
9921                 if (error) goto cleanup;
9922
9923                 /*
9924                  * Handle optional fields:
9925                  * extended version support an info ptr (offset, length), and the
9926                  *
9927                  * namedata version supports a unique per-link object ID
9928                  *
9929                  */
9930                 if (nhd->infoptr) {
9931                         uio_t uio = (uio_t)nspace_items[i].arg;
9932                         uint64_t u_offset, u_length;
9933
                             /* offset/length come from the requester's uio when one was supplied, else both are 0 */
9934                         if (uio) {
9935                                 u_offset = uio_offset(uio);
9936                                 u_length = uio_resid(uio);
9937                         } else {
9938                                 u_offset = 0;
9939                                 u_length = 0;
9940                         }
9941                         error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
9942                         if (error) goto cleanup;
9943                         error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
9944                         if (error) goto cleanup;
9945                 }
9946
9947                 if (nhd->objid) {
9948                         VATTR_INIT(&va);
9949                         VATTR_WANTED(&va, va_linkid);
9950                         error = vnode_getattr(nspace_items[i].vp, &va, ctx);
9951                         if (error) goto cleanup;
9952
                             /* 0 is reported when the filesystem does not supply va_linkid */
9953                         uint64_t linkid = 0;
9954                         if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
9955                                 linkid = (uint64_t)va.va_linkid;
9956                         }
9957                         error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
9958                 }
9959 cleanup:
                     /* on failure release the descriptor and the open, and ask for the waiter to be unblocked below */
9960                 if (error) {
9961                         if (fp_alloc_successful) fp_free(p, indx, fp);
9962                         if (vn_open_successful) vn_close(nspace_items[i].vp, fmode, ctx);
9963                         unblock = 1;
9964                 }
9965
                     /* the reference taken by vnode_getwithvid() is dropped on success and failure alike */
9966                 if (vn_get_succsessful) vnode_put(nspace_items[i].vp);
9967
                     /* one item per call */
9968                 break;
9969         }
9970
             /* the claimed item could not be delivered: mark it done and wake its waiter */
9971         if (unblock) {
9972                 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9973                         vnode_lock_spin(nspace_items[i].vp);
9974                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9975                         vnode_unlock(nspace_items[i].vp);
9976                 }
9977                 nspace_items[i].vp = NULL;
9978                 nspace_items[i].vid = 0;
9979                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9980                 nspace_items[i].token = 0;
9981
9982                 wakeup((caddr_t)&(nspace_items[i].vp));
9983         }
9984
9985         if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
9986                 // just go through every snapshot event and unblock it immediately.
9987                 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9988                         for(i = 0; i < MAX_NSPACE_ITEMS; i++) {
9989                                 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9990                                         if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9991                                                 nspace_items[i].vp = NULL;
9992                                                 nspace_items[i].vid = 0;
9993                                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
9994                                                 nspace_items[i].token = 0;
9995
9996                                                 wakeup((caddr_t)&(nspace_items[i].vp));
9997                                         }
9998                                 }
9999                         }
10000                 }
10001         }
10002
10003         lck_mtx_unlock(&nspace_handler_lock);
10004
              /* allow the next wait request for this handler kind */
10005         lck_mtx_lock(&nspace_handler_exclusion_lock);
10006         nspace_handlers[nspace_type].handler_busy = 0;
10007         lck_mtx_unlock(&nspace_handler_exclusion_lock);
10008
10009         return error;
10010 }
10011
10012 static inline int validate_namespace_args (int is64bit, int size) {
10013
10014         if (is64bit) {
10015                 /* Must be one of these */
10016                 if (size == sizeof(user64_namespace_handler_info)) {
10017                         goto sizeok;
10018                 }
10019                 if (size == sizeof(user64_namespace_handler_info_ext)) {
10020                         goto sizeok;
10021                 }
10022                 if (size == sizeof(user64_namespace_handler_data)) {
10023                         goto sizeok;
10024                 }
10025                 return EINVAL;
10026         }
10027         else {
10028                 /* 32 bit -- must be one of these */
10029                 if (size == sizeof(user32_namespace_handler_info)) {
10030                         goto sizeok;
10031                 }
10032                 if (size == sizeof(user32_namespace_handler_info_ext)) {
10033                         goto sizeok;
10034                 }
10035                 if (size == sizeof(user32_namespace_handler_data)) {
10036                         goto sizeok;
10037                 }
10038                 return EINVAL;
10039         }
10040
10041 sizeok:
10042
10043         return 0;
10044
10045 }
10046
10047 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
10048 {
10049         int error = 0;
10050         namespace_handler_data nhd;
10051
10052         bzero (&nhd, sizeof(namespace_handler_data));
10053
10054         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10055                 return error;
10056         }
10057
10058         error = validate_namespace_args (is64bit, size);
10059         if (error) {
10060                 return error;
10061         }
10062
10063         /* Copy in the userland pointers into our kernel-only struct */
10064
10065         if (is64bit) {
10066                 /* 64 bit userland structures */
10067                 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
10068                 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
10069                 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
10070
10071                 /* If the size is greater than the standard info struct, add in extra fields */
10072                 if (size > (sizeof(user64_namespace_handler_info))) {
10073                         if (size >= (sizeof(user64_namespace_handler_info_ext))) {
10074                                 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
10075                         }
10076                         if (size == (sizeof(user64_namespace_handler_data))) {
10077                                 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
10078                         }
10079                         /* Otherwise the fields were pre-zeroed when we did the bzero above. */
10080                 }
10081         }
10082         else {
10083                 /* 32 bit userland structures */
10084                 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
10085                 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
10086                 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
10087
10088                 if (size > (sizeof(user32_namespace_handler_info))) {
10089                         if (size >= (sizeof(user32_namespace_handler_info_ext))) {
10090                                 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
10091                         }
10092                         if (size == (sizeof(user32_namespace_handler_data))) {
10093                                 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
10094                         }
10095                         /* Otherwise the fields were pre-zeroed when we did the bzero above. */
10096                 }
10097         }
10098
10099         return wait_for_namespace_event(&nhd, nspace_type);
10100 }
10101
10102 static unsigned long
10103 fsctl_bogus_command_compat(unsigned long cmd)
10104 {
10105
10106         switch (cmd) {
10107         case IOCBASECMD(FSIOC_SYNC_VOLUME):
10108                 return (FSIOC_SYNC_VOLUME);
10109         case IOCBASECMD(FSIOC_ROUTEFS_SETROUTEID):
10110                 return (FSIOC_ROUTEFS_SETROUTEID);
10111         case IOCBASECMD(FSIOC_SET_PACKAGE_EXTS):
10112                 return (FSIOC_SET_PACKAGE_EXTS);
10113         case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET):
10114                 return (FSIOC_NAMESPACE_HANDLER_GET);
10115         case IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET):
10116                 return (FSIOC_OLD_SNAPSHOT_HANDLER_GET);
10117         case IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT):
10118                 return (FSIOC_SNAPSHOT_HANDLER_GET_EXT);
10119         case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE):
10120                 return (FSIOC_NAMESPACE_HANDLER_UPDATE);
10121         case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK):
10122                 return (FSIOC_NAMESPACE_HANDLER_UNBLOCK);
10123         case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL):
10124                 return (FSIOC_NAMESPACE_HANDLER_CANCEL);
10125         case IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME):
10126                 return (FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME);
10127         case IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS):
10128                 return (FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS);
10129         case IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE):
10130                 return (FSIOC_SET_FSTYPENAME_OVERRIDE);
10131         case IOCBASECMD(DISK_CONDITIONER_IOC_GET):
10132                 return (DISK_CONDITIONER_IOC_GET);
10133         case IOCBASECMD(DISK_CONDITIONER_IOC_SET):
10134                 return (DISK_CONDITIONER_IOC_SET);
10135         case IOCBASECMD(FSIOC_FIOSEEKHOLE):
10136                 return (FSIOC_FIOSEEKHOLE);
10137         case IOCBASECMD(FSIOC_FIOSEEKDATA):
10138                 return (FSIOC_FIOSEEKDATA);
10139         case IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME):
10140                 return (SPOTLIGHT_IOC_GET_MOUNT_TIME);
10141         case IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME):
10142                 return (SPOTLIGHT_IOC_GET_LAST_MTIME);
10143         }
10144
10145         return (cmd);
10146 }
10147
10148 /*
10149  * Make a filesystem-specific control call:
10150  */
10151 /* ARGSUSED */
10152 static int
10153 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
10154 {
10155         int error=0;
10156         boolean_t is64bit;
10157         u_int size;
10158 #define STK_PARAMS 128
10159         char stkbuf[STK_PARAMS] = {0};
10160         caddr_t data, memp;
10161         vnode_t vp = *arg_vp;
10162
10163         cmd = fsctl_bogus_command_compat(cmd);
10164
10165         size = IOCPARM_LEN(cmd);
10166         if (size > IOCPARM_MAX) return (EINVAL);
10167
10168         is64bit = proc_is64bit(p);
10169
10170         memp = NULL;
10171
10172         if (size > sizeof (stkbuf)) {
10173                 if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
10174                 data = memp;
10175         } else {
10176                 data = &stkbuf[0];
10177         };
10178
10179         if (cmd & IOC_IN) {
10180                 if (size) {
10181                         error = copyin(udata, data, size);
10182                         if (error) {
10183                                 if (memp) {
10184                                         kfree (memp, size);
10185                                 }
10186                                 return error;
10187                         }
10188                 } else {
10189                         if (is64bit) {
10190                                 *(user_addr_t *)data = udata;
10191                         }
10192                         else {
10193                                 *(uint32_t *)data = (uint32_t)udata;
10194                         }
10195                 };
10196         } else if ((cmd & IOC_OUT) && size) {
10197                 /*
10198                  * Zero the buffer so the user always
10199                  * gets back something deterministic.
10200                  */
10201                 bzero(data, size);
10202         } else if (cmd & IOC_VOID) {
10203                 if (is64bit) {
10204                         *(user_addr_t *)data = udata;
10205                 }
10206                 else {
10207                         *(uint32_t *)data = (uint32_t)udata;
10208                 }
10209         }
10210
10211         /* Check to see if it's a generic command */
10212         switch (cmd) {
10213
10214                 case FSIOC_SYNC_VOLUME: {
10215                         mount_t mp = vp->v_mount;
10216                         int arg = *(uint32_t*)data;
10217
10218                         /* record vid of vp so we can drop it below. */
10219                         uint32_t vvid = vp->v_id;
10220
10221                         /*
10222                          * Then grab mount_iterref so that we can release the vnode.
10223                          * Without this, a thread may call vnode_iterate_prepare then
10224                          * get into a deadlock because we've never released the root vp
10225                          */
10226                         error = mount_iterref (mp, 0);
10227                         if (error)  {
10228                                 break;
10229                         }
10230                         vnode_put(vp);
10231
10232                         /* issue the sync for this volume */
10233                         (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
10234
10235                         /*
10236                          * Then release the mount_iterref once we're done syncing; it's not
10237                          * needed for the VNOP_IOCTL below
10238                          */
10239                         mount_iterdrop(mp);
10240
10241                         if (arg & FSCTL_SYNC_FULLSYNC) {
10242                                 /* re-obtain vnode iocount on the root vp, if possible */
10243                                 error = vnode_getwithvid (vp, vvid);
10244                                 if (error == 0) {
10245                                         error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
10246                                         vnode_put (vp);
10247                                 }
10248                         }
10249                         /* mark the argument VP as having been released */
10250                         *arg_vp = NULL;
10251                 }
10252                 break;
10253
10254                 case FSIOC_ROUTEFS_SETROUTEID: {
10255 #if ROUTEFS
10256                         char routepath[MAXPATHLEN];
10257                         size_t len = 0;
10258
10259                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10260                                 break;
10261                         }
10262                         bzero(routepath, MAXPATHLEN);
10263                         error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
10264                         if (error) {
10265                                 break;
10266                         }
10267                         error = routefs_kernel_mount(routepath);
10268                         if (error) {
10269                                 break;
10270                         }
10271 #endif
10272                 }
10273                 break;
10274
10275                 case FSIOC_SET_PACKAGE_EXTS: {
10276                         user_addr_t ext_strings;
10277                         uint32_t    num_entries;
10278                         uint32_t    max_width;
10279
10280                         if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0)))
10281                                 break;
10282
10283                         if (   (is64bit && size != sizeof(user64_package_ext_info))
10284                                         || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
10285
10286                                 // either you're 64-bit and passed a 64-bit struct or
10287                                 // you're 32-bit and passed a 32-bit struct.  otherwise
10288                                 // it's not ok.
10289                                 error = EINVAL;
10290                                 break;
10291                         }
10292
10293                         if (is64bit) {
10294                                 ext_strings = ((user64_package_ext_info *)data)->strings;
10295                                 num_entries = ((user64_package_ext_info *)data)->num_entries;
10296                                 max_width   = ((user64_package_ext_info *)data)->max_width;
10297                         } else {
10298                                 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
10299                                 num_entries = ((user32_package_ext_info *)data)->num_entries;
10300                                 max_width   = ((user32_package_ext_info *)data)->max_width;
10301                         }
10302                         error = set_package_extensions_table(ext_strings, num_entries, max_width);
10303                 }
10304                 break;
10305
10306                 /* namespace handlers */
10307                 case FSIOC_NAMESPACE_HANDLER_GET: {
10308                         error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
10309                 }
10310                 break;
10311
10312                 /* Snapshot handlers */
10313                 case FSIOC_OLD_SNAPSHOT_HANDLER_GET: {
10314                         error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10315                 }
10316                 break;
10317
10318                 case FSIOC_SNAPSHOT_HANDLER_GET_EXT: {
10319                         error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10320                 }
10321                 break;
10322
10323                 case FSIOC_NAMESPACE_HANDLER_UPDATE: {
10324                         uint32_t token, val;
10325                         int i;
10326
10327                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10328                                 break;
10329                         }
10330
10331                         if (!nspace_is_special_process(p)) {
10332                                 error = EINVAL;
10333                                 break;
10334                         }
10335
10336                         token = ((uint32_t *)data)[0];
10337                         val   = ((uint32_t *)data)[1];
10338
10339                         lck_mtx_lock(&nspace_handler_lock);
10340
10341                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10342                                 if (nspace_items[i].token == token) {
10343                                         break;  /* exit for loop, not case stmt */
10344                                 }
10345                         }
10346
10347                         if (i >= MAX_NSPACE_ITEMS) {
10348                                 error = ENOENT;
10349                         } else {
10350                                 //
10351                                 // if this bit is set, when resolve_nspace_item() times out
10352                                 // it will loop and go back to sleep.
10353                                 //
10354                                 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
10355                         }
10356
10357                         lck_mtx_unlock(&nspace_handler_lock);
10358
10359                         if (error) {
10360                                 printf("nspace-handler-update: did not find token %u\n", token);
10361                         }
10362                 }
10363                 break;
10364
10365                 case FSIOC_NAMESPACE_HANDLER_UNBLOCK: {
10366                         uint32_t token, val;
10367                         int i;
10368
10369                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10370                                 break;
10371                         }
10372
10373                         if (!nspace_is_special_process(p)) {
10374                                 error = EINVAL;
10375                                 break;
10376                         }
10377
10378                         token = ((uint32_t *)data)[0];
10379                         val   = ((uint32_t *)data)[1];
10380
10381                         lck_mtx_lock(&nspace_handler_lock);
10382
10383                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10384                                 if (nspace_items[i].token == token) {
10385                                         break; /* exit for loop, not case statement */
10386                                 }
10387                         }
10388
10389                         if (i >= MAX_NSPACE_ITEMS) {
10390                                 printf("nspace-handler-unblock: did not find token %u\n", token);
10391                                 error = ENOENT;
10392                         } else {
10393                                 if (val == 0 && nspace_items[i].vp) {
10394                                         vnode_lock_spin(nspace_items[i].vp);
10395                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10396                                         vnode_unlock(nspace_items[i].vp);
10397                                 }
10398
10399                                 nspace_items[i].vp = NULL;
10400                                 nspace_items[i].arg = NULL;
10401                                 nspace_items[i].op = 0;
10402                                 nspace_items[i].vid = 0;
10403                                 nspace_items[i].flags = NSPACE_ITEM_DONE;
10404                                 nspace_items[i].token = 0;
10405
10406                                 wakeup((caddr_t)&(nspace_items[i].vp));
10407                         }
10408
10409                         lck_mtx_unlock(&nspace_handler_lock);
10410                 }
10411                 break;
10412
10413                 case FSIOC_NAMESPACE_HANDLER_CANCEL: {
10414                         uint32_t token, val;
10415                         int i;
10416
10417                         if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10418                                 break;
10419                         }
10420
10421                         if (!nspace_is_special_process(p)) {
10422                                 error = EINVAL;
10423                                 break;
10424                         }
10425
10426                         token = ((uint32_t *)data)[0];
10427                         val   = ((uint32_t *)data)[1];
10428
10429                         lck_mtx_lock(&nspace_handler_lock);
10430
10431                         for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10432                                 if (nspace_items[i].token == token) {
10433                                         break;  /* exit for loop, not case stmt */
10434                                 }
10435                         }
10436
10437                         if (i >= MAX_NSPACE_ITEMS) {
10438                                 printf("nspace-handler-cancel: did not find token %u\n", token);
10439                                 error = ENOENT;
10440                         } else {
10441                                 if (nspace_items[i].vp) {
10442                                         vnode_lock_spin(nspace_items[i].vp);
10443                                         nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10444                                         vnode_unlock(nspace_items[i].vp);
10445                                 }
10446
10447                                 nspace_items[i].vp = NULL;
10448                                 nspace_items[i].arg = NULL;
10449                                 nspace_items[i].vid = 0;
10450                                 nspace_items[i].token = val;
10451                                 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
10452                                 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
10453
10454                                 wakeup((caddr_t)&(nspace_items[i].vp));
10455                         }
10456
10457                         lck_mtx_unlock(&nspace_handler_lock);
10458                 }
10459                 break;
10460
10461                 case FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
10462                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10463                                 break;
10464                         }
10465
10466                         // we explicitly do not do the namespace_handler_proc check here
10467
10468                         lck_mtx_lock(&nspace_handler_lock);
10469                         snapshot_timestamp = ((uint32_t *)data)[0];
10470                         wakeup(&nspace_item_idx);
10471                         lck_mtx_unlock(&nspace_handler_lock);
10472                         printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
10473
10474                 }
10475                 break;
10476
10477                 case FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
10478                 {
10479                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10480                                 break;
10481                         }
10482
10483                         lck_mtx_lock(&nspace_handler_lock);
10484                         nspace_allow_virtual_devs = ((uint32_t *)data)[0];
10485                         lck_mtx_unlock(&nspace_handler_lock);
10486                         printf("nspace-snapshot-handler will%s allow events on disk-images\n",
10487                                         nspace_allow_virtual_devs ? "" : " NOT");
10488                         error = 0;
10489
10490                 }
10491                 break;
10492
10493                 case FSIOC_SET_FSTYPENAME_OVERRIDE:
10494                 {
10495                         if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10496                                 break;
10497                         }
10498                         if (vp->v_mount) {
10499                                 mount_lock(vp->v_mount);
10500                                 if (data[0] != 0) {
10501                                         strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
10502                                         vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
10503                                         if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10504                                                 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
10505                                                 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
10506                                         }
10507                                 } else {
10508                                         if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10509                                                 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
10510                                         }
10511                                         vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
10512                                         vp->v_mount->fstypename_override[0] = '\0';
10513                                 }
10514                                 mount_unlock(vp->v_mount);
10515                         }
10516                 }
10517                 break;
10518
10519                 case DISK_CONDITIONER_IOC_GET: {
10520                   error = disk_conditioner_get_info(vp->v_mount, (disk_conditioner_info *)data);
10521                 }
10522                 break;
10523
10524                 case DISK_CONDITIONER_IOC_SET: {
10525                   error = disk_conditioner_set_info(vp->v_mount, (disk_conditioner_info *)data);
10526                 }
10527                 break;
10528
10529                 default: {
10530                         /* other, known commands shouldn't be passed down here */
10531                         switch (cmd) {
10532                                 case F_PUNCHHOLE:
10533                                 case F_TRIM_ACTIVE_FILE:
10534                                 case F_RDADVISE:
10535                                 case F_TRANSCODEKEY:
10536                                 case F_GETPROTECTIONLEVEL:
10537                                 case F_GETDEFAULTPROTLEVEL:
10538                                 case F_MAKECOMPRESSED:
10539                                 case F_SET_GREEDY_MODE:
10540                                 case F_SETSTATICCONTENT:
10541                                 case F_SETIOTYPE:
10542                                 case F_SETBACKINGSTORE:
10543                                 case F_GETPATH_MTMINFO:
10544                                 case APFSIOC_REVERT_TO_SNAPSHOT:
10545                                 case FSIOC_FIOSEEKHOLE:
10546                                 case FSIOC_FIOSEEKDATA:
10547                                 case HFS_GET_BOOT_INFO:
10548                                 case HFS_SET_BOOT_INFO:
10549                                 case FIOPINSWAP:
10550                                 case F_CHKCLEAN:
10551                                 case F_FULLFSYNC:
10552                                 case F_BARRIERFSYNC:
10553                                 case F_FREEZE_FS:
10554                                 case F_THAW_FS:
10555                                         error = EINVAL;
10556                                         goto outdrop;
10557                         }
10558                         /* Invoke the filesystem-specific code */
10559                         error = VNOP_IOCTL(vp, cmd, data, options, ctx);
10560                 }
10561
10562         } /* end switch stmt */
10563
10564         /*
10565          * if no errors, copy any data to user. Size was
10566          * already set and checked above.
10567          */
10568         if (error == 0 && (cmd & IOC_OUT) && size)
10569                 error = copyout(data, udata, size);
10570
10571 outdrop:
10572         if (memp) {
10573                 kfree(memp, size);
10574         }
10575
10576         return error;
10577 }
10578
10579 /* ARGSUSED */
10580 int
10581 fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
10582 {
10583         int error;
10584         struct nameidata nd;
10585         u_long nameiflags;
10586         vnode_t vp = NULL;
10587         vfs_context_t ctx = vfs_context_current();
10588
10589         AUDIT_ARG(cmd, uap->cmd);
10590         AUDIT_ARG(value32, uap->options);
10591         /* Get the vnode for the file we are getting info on:  */
10592         nameiflags = 0;
10593         if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
10594         NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
10595                UIO_USERSPACE, uap->path, ctx);
10596         if ((error = namei(&nd))) goto done;
10597         vp = nd.ni_vp;
10598         nameidone(&nd);
10599
10600 #if CONFIG_MACF
10601         error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
10602         if (error) {
10603                 goto done;
10604         }
10605 #endif
10606
10607         error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10608
10609 done:
10610         if (vp)
10611                 vnode_put(vp);
10612         return error;
10613 }
10614 /* ARGSUSED */
10615 int
10616 ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
10617 {
10618         int error;
10619         vnode_t vp = NULL;
10620         vfs_context_t ctx = vfs_context_current();
10621         int fd = -1;
10622
10623         AUDIT_ARG(fd, uap->fd);
10624         AUDIT_ARG(cmd, uap->cmd);
10625         AUDIT_ARG(value32, uap->options);
10626
10627         /* Get the vnode for the file we are getting info on:  */
10628         if ((error = file_vnode(uap->fd, &vp)))
10629                 return error;
10630         fd = uap->fd;
10631         if ((error = vnode_getwithref(vp))) {
10632                 file_drop(fd);
10633                 return error;
10634         }
10635
10636 #if CONFIG_MACF
10637         if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
10638                 file_drop(fd);
10639                 vnode_put(vp);
10640                 return error;
10641         }
10642 #endif
10643
10644         error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10645
10646         file_drop(fd);
10647
10648         /*validate vp; fsctl_internal() can drop iocount and reset vp to NULL*/
10649         if (vp) {
10650                 vnode_put(vp);
10651         }
10652
10653         return error;
10654 }
10655 /* end of fsctl system call */
10656
10657 /*
10658  *  Retrieve the data of an extended attribute.
10659  */
10660 int
10661 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
10662 {
10663         vnode_t vp;
10664         struct nameidata nd;
10665         char attrname[XATTR_MAXNAMELEN+1];
10666         vfs_context_t ctx = vfs_context_current();
10667         uio_t auio = NULL;
10668         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10669         size_t attrsize = 0;
10670         size_t namelen;
10671         u_int32_t nameiflags;
10672         int error;
10673         char uio_buf[ UIO_SIZEOF(1) ];
10674
10675         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10676                 return (EINVAL);
10677
10678         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10679         NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
10680         if ((error = namei(&nd))) {
10681                 return (error);
10682         }
10683         vp = nd.ni_vp;
10684         nameidone(&nd);
10685
10686         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10687         if (error != 0) {
10688                 goto out;
10689         }
10690         if (xattr_protected(attrname)) {
10691                 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
10692                         error = EPERM;
10693                         goto out;
10694                 }
10695         }
10696         /*
10697          * the specific check for 0xffffffff is a hack to preserve
10698          * binaray compatibilty in K64 with applications that discovered
10699          * that passing in a buf pointer and a size of -1 resulted in
10700          * just the size of the indicated extended attribute being returned.
10701          * this isn't part of the documented behavior, but because of the
10702          * original implemtation's check for "uap->size > 0", this behavior
10703          * was allowed. In K32 that check turned into a signed comparison
10704          * even though uap->size is unsigned...  in K64, we blow by that
10705          * check because uap->size is unsigned and doesn't get sign smeared
10706          * in the munger for a 32 bit user app.  we also need to add a
10707          * check to limit the maximum size of the buffer being passed in...
10708          * unfortunately, the underlying fileystems seem to just malloc
10709          * the requested size even if the actual extended attribute is tiny.
10710          * because that malloc is for kernel wired memory, we have to put a
10711          * sane limit on it.
10712          *
10713          * U32 running on K64 will yield 0x00000000ffffffff for uap->size
10714          * U64 running on K64 will yield -1 (64 bits wide)
10715          * U32/U64 running on K32 will yield -1 (32 bits wide)
10716          */
10717         if (uap->size == 0xffffffff || uap->size == (size_t)-1)
10718                 goto no_uio;
10719
10720         if (uap->value) {
10721                 if (uap->size > (size_t)XATTR_MAXSIZE)
10722                         uap->size = XATTR_MAXSIZE;
10723
10724                 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10725                                             &uio_buf[0], sizeof(uio_buf));
10726                 uio_addiov(auio, uap->value, uap->size);
10727         }
10728 no_uio:
10729         error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
10730 out:
10731         vnode_put(vp);
10732
10733         if (auio) {
10734                 *retval = uap->size - uio_resid(auio);
10735         } else {
10736                 *retval = (user_ssize_t)attrsize;
10737         }
10738
10739         return (error);
10740 }
10741
10742 /*
10743  * Retrieve the data of an extended attribute.
10744  */
10745 int
10746 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
10747 {
10748         vnode_t vp;
10749         char attrname[XATTR_MAXNAMELEN+1];
10750         uio_t auio = NULL;
10751         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10752         size_t attrsize = 0;
10753         size_t namelen;
10754         int error;
10755         char uio_buf[ UIO_SIZEOF(1) ];
10756
10757         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10758                 return (EINVAL);
10759
10760         if ( (error = file_vnode(uap->fd, &vp)) ) {
10761                 return (error);
10762         }
10763         if ( (error = vnode_getwithref(vp)) ) {
10764                 file_drop(uap->fd);
10765                 return(error);
10766         }
10767         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10768         if (error != 0) {
10769                 goto out;
10770         }
10771         if (xattr_protected(attrname)) {
10772                 error = EPERM;
10773                 goto out;
10774         }
10775         if (uap->value && uap->size > 0) {
10776                 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10777                                             &uio_buf[0], sizeof(uio_buf));
10778                 uio_addiov(auio, uap->value, uap->size);
10779         }
10780
10781         error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
10782 out:
10783         (void)vnode_put(vp);
10784         file_drop(uap->fd);
10785
10786         if (auio) {
10787                 *retval = uap->size - uio_resid(auio);
10788         } else {
10789                 *retval = (user_ssize_t)attrsize;
10790         }
10791         return (error);
10792 }
10793
10794 /*
10795  * Set the data of an extended attribute.
10796  */
10797 int
10798 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
10799 {
10800         vnode_t vp;
10801         struct nameidata nd;
10802         char attrname[XATTR_MAXNAMELEN+1];
10803         vfs_context_t ctx = vfs_context_current();
10804         uio_t auio = NULL;
10805         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10806         size_t namelen;
10807         u_int32_t nameiflags;
10808         int error;
10809         char uio_buf[ UIO_SIZEOF(1) ];
10810
10811         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10812                 return (EINVAL);
10813
10814         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10815         if (error != 0) {
10816                 if (error == EPERM) {
10817                         /* if the string won't fit in attrname, copyinstr emits EPERM */
10818                         return (ENAMETOOLONG);
10819                 }
10820                 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10821                 return error;
10822         }
10823         if (xattr_protected(attrname))
10824                 return(EPERM);
10825         if (uap->size != 0 && uap->value == 0) {
10826                 return (EINVAL);
10827         }
10828
10829         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10830         NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
10831         if ((error = namei(&nd))) {
10832                 return (error);
10833         }
10834         vp = nd.ni_vp;
10835         nameidone(&nd);
10836
10837         auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10838                                     &uio_buf[0], sizeof(uio_buf));
10839         uio_addiov(auio, uap->value, uap->size);
10840
10841         error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
10842 #if CONFIG_FSE
10843         if (error == 0) {
10844                 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10845                     FSE_ARG_VNODE, vp,
10846                     FSE_ARG_DONE);
10847         }
10848 #endif
10849         vnode_put(vp);
10850         *retval = 0;
10851         return (error);
10852 }
10853
10854 /*
10855  * Set the data of an extended attribute.
10856  */
10857 int
10858 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
10859 {
10860         vnode_t vp;
10861         char attrname[XATTR_MAXNAMELEN+1];
10862         uio_t auio = NULL;
10863         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10864         size_t namelen;
10865         int error;
10866         char uio_buf[ UIO_SIZEOF(1) ];
10867 #if CONFIG_FSE
10868         vfs_context_t ctx = vfs_context_current();
10869 #endif
10870
10871         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10872                 return (EINVAL);
10873
10874         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10875         if (error != 0) {
10876                 if (error == EPERM) {
10877                         /* if the string won't fit in attrname, copyinstr emits EPERM */
10878                         return (ENAMETOOLONG);
10879                 }
10880                 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10881                 return error;
10882         }
10883         if (xattr_protected(attrname))
10884                 return(EPERM);
10885         if (uap->size != 0 && uap->value == 0) {
10886                 return (EINVAL);
10887         }
10888         if ( (error = file_vnode(uap->fd, &vp)) ) {
10889                 return (error);
10890         }
10891         if ( (error = vnode_getwithref(vp)) ) {
10892                 file_drop(uap->fd);
10893                 return(error);
10894         }
10895         auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10896                                     &uio_buf[0], sizeof(uio_buf));
10897         uio_addiov(auio, uap->value, uap->size);
10898
10899         error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
10900 #if CONFIG_FSE
10901         if (error == 0) {
10902                 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10903                     FSE_ARG_VNODE, vp,
10904                     FSE_ARG_DONE);
10905         }
10906 #endif
10907         vnode_put(vp);
10908         file_drop(uap->fd);
10909         *retval = 0;
10910         return (error);
10911 }
10912
10913 /*
10914  * Remove an extended attribute.
10915  * XXX Code duplication here.
10916  */
10917 int
10918 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
10919 {
10920         vnode_t vp;
10921         struct nameidata nd;
10922         char attrname[XATTR_MAXNAMELEN+1];
10923         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10924         vfs_context_t ctx = vfs_context_current();
10925         size_t namelen;
10926         u_int32_t nameiflags;
10927         int error;
10928
10929         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10930                 return (EINVAL);
10931
10932         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10933         if (error != 0) {
10934                 return (error);
10935         }
10936         if (xattr_protected(attrname))
10937                 return(EPERM);
10938         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10939         NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
10940         if ((error = namei(&nd))) {
10941                 return (error);
10942         }
10943         vp = nd.ni_vp;
10944         nameidone(&nd);
10945
10946         error = vn_removexattr(vp, attrname, uap->options, ctx);
10947 #if CONFIG_FSE
10948         if (error == 0) {
10949                 add_fsevent(FSE_XATTR_REMOVED, ctx,
10950                     FSE_ARG_VNODE, vp,
10951                     FSE_ARG_DONE);
10952         }
10953 #endif
10954         vnode_put(vp);
10955         *retval = 0;
10956         return (error);
10957 }
10958
10959 /*
10960  * Remove an extended attribute.
10961  * XXX Code duplication here.
10962  */
10963 int
10964 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
10965 {
10966         vnode_t vp;
10967         char attrname[XATTR_MAXNAMELEN+1];
10968         size_t namelen;
10969         int error;
10970 #if CONFIG_FSE
10971         vfs_context_t ctx = vfs_context_current();
10972 #endif
10973
10974         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10975                 return (EINVAL);
10976
10977         error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10978         if (error != 0) {
10979                 return (error);
10980         }
10981         if (xattr_protected(attrname))
10982                 return(EPERM);
10983         if ( (error = file_vnode(uap->fd, &vp)) ) {
10984                 return (error);
10985         }
10986         if ( (error = vnode_getwithref(vp)) ) {
10987                 file_drop(uap->fd);
10988                 return(error);
10989         }
10990
10991         error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
10992 #if CONFIG_FSE
10993         if (error == 0) {
10994                 add_fsevent(FSE_XATTR_REMOVED, ctx,
10995                     FSE_ARG_VNODE, vp,
10996                     FSE_ARG_DONE);
10997         }
10998 #endif
10999         vnode_put(vp);
11000         file_drop(uap->fd);
11001         *retval = 0;
11002         return (error);
11003 }
11004
11005 /*
11006  * Retrieve the list of extended attribute names.
11007  * XXX Code duplication here.
11008  */
11009 int
11010 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
11011 {
11012         vnode_t vp;
11013         struct nameidata nd;
11014         vfs_context_t ctx = vfs_context_current();
11015         uio_t auio = NULL;
11016         int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11017         size_t attrsize = 0;
11018         u_int32_t nameiflags;
11019         int error;
11020         char uio_buf[ UIO_SIZEOF(1) ];
11021
11022         if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
11023                 return (EINVAL);
11024
11025         nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
11026         NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
11027         if ((error = namei(&nd))) {
11028                 return (error);
11029         }
11030         vp = nd.ni_vp;
11031         nameidone(&nd);
11032         if (uap->namebuf != 0 && uap->bufsize > 0) {
11033                 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
11034                                             &uio_buf[0], sizeof(uio_buf));
11035                 uio_addiov(auio, uap->namebuf, uap->bufsize);
11036         }
11037
11038         error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
11039
11040         vnode_put(vp);
11041         if (auio) {
11042                 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11043         } else {
11044                 *retval = (user_ssize_t)attrsize;
11045         }
11046         return (error);
11047 }
11048
11049 /*
11050  * Retrieve the list of extended attribute names.
11051  * XXX Code duplication here.
11052  */
11053 int
11054 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
11055 {
11056         vnode_t vp;
11057         uio_t auio = NULL;
11058         int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
11059         size_t attrsize = 0;
11060         int error;
11061         char uio_buf[ UIO_SIZEOF(1) ];
11062
11063         if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
11064                 return (EINVAL);
11065
11066         if ( (error = file_vnode(uap->fd, &vp)) ) {
11067                 return (error);
11068         }
11069         if ( (error = vnode_getwithref(vp)) ) {
11070                 file_drop(uap->fd);
11071                 return(error);
11072         }
11073         if (uap->namebuf != 0 && uap->bufsize > 0) {
11074                 auio = uio_createwithbuffer(1, 0, spacetype,
11075                                                                           UIO_READ, &uio_buf[0], sizeof(uio_buf));
11076                 uio_addiov(auio, uap->namebuf, uap->bufsize);
11077         }
11078
11079         error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
11080
11081         vnode_put(vp);
11082         file_drop(uap->fd);
11083         if (auio) {
11084                 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
11085         } else {
11086                 *retval = (user_ssize_t)attrsize;
11087         }
11088         return (error);
11089 }
11090
11091 static int fsgetpath_internal(
11092         vfs_context_t ctx, int volfs_id, uint64_t objid,
11093         vm_size_t bufsize, caddr_t buf, int *pathlen)
11094 {
11095         int error;
11096         struct mount *mp = NULL;
11097         vnode_t vp;
11098         int length;
11099         int bpflags;
11100         /* maximum number of times to retry build_path */
11101         unsigned int retries = 0x10;
11102
11103         if (bufsize > PAGE_SIZE) {
11104                 return (EINVAL);
11105         }
11106
11107         if (buf == NULL) {
11108                 return (ENOMEM);
11109         }
11110
11111 retry:
11112         if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
11113                 error = ENOTSUP;  /* unexpected failure */
11114                 return ENOTSUP;
11115         }
11116
11117 unionget:
11118         if (objid == 2) {
11119                 error = VFS_ROOT(mp, &vp, ctx);
11120         } else {
11121                 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
11122         }
11123
11124         if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
11125                 /*
11126                  * If the fileid isn't found and we're in a union
11127                  * mount volume, then see if the fileid is in the
11128                  * mounted-on volume.
11129                  */
11130                 struct mount *tmp = mp;
11131                 mp = vnode_mount(tmp->mnt_vnodecovered);
11132                 vfs_unbusy(tmp);
11133                 if (vfs_busy(mp, LK_NOWAIT) == 0)
11134                         goto unionget;
11135         } else {
11136                 vfs_unbusy(mp);
11137         }
11138
11139         if (error) {
11140                 return error;
11141         }
11142
11143 #if CONFIG_MACF
11144         error = mac_vnode_check_fsgetpath(ctx, vp);
11145         if (error) {
11146                 vnode_put(vp);
11147                 return error;
11148         }
11149 #endif
11150
11151         /* Obtain the absolute path to this vnode. */
11152         bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
11153         bpflags |= BUILDPATH_CHECK_MOVED;
11154         error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
11155         vnode_put(vp);
11156
11157         if (error) {
11158                 /* there was a race building the path, try a few more times */
11159                 if (error == EAGAIN) {
11160                         --retries;
11161                         if (retries > 0)
11162                                 goto retry;
11163
11164                         error = ENOENT;
11165                 }
11166                 goto out;
11167         }
11168
11169         AUDIT_ARG(text, buf);
11170
11171         if (kdebug_enable) {
11172                 long dbg_parms[NUMPARMS];
11173                 int  dbg_namelen;
11174
11175                 dbg_namelen = (int)sizeof(dbg_parms);
11176
11177         if (length < dbg_namelen) {
11178                         memcpy((char *)dbg_parms, buf, length);
11179                         memset((char *)dbg_parms + length, 0, dbg_namelen - length);
11180
11181                         dbg_namelen = length;
11182                 } else {
11183                         memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
11184                 }
11185
11186                 kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
11187                                 KDBG_VFS_LOOKUP_FLAG_LOOKUP);
11188         }
11189
11190         *pathlen = (user_ssize_t)length; /* may be superseded by error */
11191
11192 out:
11193         return (error);
11194 }
11195
11196 /*
11197  * Obtain the full pathname of a file system object by id.
11198  */
11199 int
11200 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
11201 {
11202         vfs_context_t ctx = vfs_context_current();
11203         fsid_t fsid;
11204         char *realpath;
11205         int length;
11206         int error;
11207
11208         if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
11209                 return (error);
11210         }
11211         AUDIT_ARG(value32, fsid.val[0]);
11212         AUDIT_ARG(value64, uap->objid);
11213         /* Restrict output buffer size for now. */
11214
11215         if (uap->bufsize > PAGE_SIZE) {
11216                 return (EINVAL);
11217         }
11218         MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK | M_ZERO);
11219         if (realpath == NULL) {
11220                 return (ENOMEM);
11221         }
11222
11223         error = fsgetpath_internal(
11224                 ctx, fsid.val[0], uap->objid,
11225                 uap->bufsize, realpath, &length);
11226
11227         if (error) {
11228                 goto out;
11229         }
11230
11231         error = copyout((caddr_t)realpath, uap->buf, length);
11232
11233         *retval = (user_ssize_t)length; /* may be superseded by error */
11234 out:
11235         if (realpath) {
11236                 FREE(realpath, M_TEMP);
11237         }
11238         return (error);
11239 }
11240
11241 /*
11242  * Common routine to handle various flavors of statfs data heading out
11243  *      to user space.
11244  *
11245  * Returns:     0                       Success
11246  *              EFAULT
11247  */
11248 static int
11249 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
11250     user_addr_t bufp, int *sizep, boolean_t is_64_bit,
11251     boolean_t partial_copy)
11252 {
11253         int             error;
11254         int             my_size, copy_size;
11255
11256         if (is_64_bit) {
11257                 struct user64_statfs sfs;
11258                 my_size = copy_size = sizeof(sfs);
11259                 bzero(&sfs, my_size);
11260                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11261                 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11262                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11263                 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
11264                 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
11265                 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
11266                 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
11267                 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
11268                 sfs.f_files = (user64_long_t)sfsp->f_files;
11269                 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
11270                 sfs.f_fsid = sfsp->f_fsid;
11271                 sfs.f_owner = sfsp->f_owner;
11272                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11273                         strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11274                 } else {
11275                         strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11276                 }
11277                 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11278                 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11279
11280                 if (partial_copy) {
11281                         copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11282                 }
11283                 error = copyout((caddr_t)&sfs, bufp, copy_size);
11284         }
11285         else {
11286                 struct user32_statfs sfs;
11287
11288                 my_size = copy_size = sizeof(sfs);
11289                 bzero(&sfs, my_size);
11290
11291                 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
11292                 sfs.f_type = mp->mnt_vtable->vfc_typenum;
11293                 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
11294
11295                 /*
11296                  * It's possible for there to be more than 2^^31 blocks in the filesystem, so we
11297                  * have to fudge the numbers here in that case.   We inflate the blocksize in order
11298                  * to reflect the filesystem size as best we can.
11299                  */
11300                 if ((sfsp->f_blocks > INT_MAX)
11301                         /* Hack for 4061702 . I think the real fix is for Carbon to
11302                          * look for some volume capability and not depend on hidden
11303                          * semantics agreed between a FS and carbon.
11304                          * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
11305                          * for Carbon to set bNoVolumeSizes volume attribute.
11306                          * Without this the webdavfs files cannot be copied onto
11307                          * disk as they look huge. This change should not affect
11308                          * XSAN as they should not setting these to -1..
11309                          */
11310                          && (sfsp->f_blocks != 0xffffffffffffffffULL)
11311                          && (sfsp->f_bfree != 0xffffffffffffffffULL)
11312                          && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
11313                         int             shift;
11314
11315                         /*
11316                          * Work out how far we have to shift the block count down to make it fit.
11317                          * Note that it's possible to have to shift so far that the resulting
11318                          * blocksize would be unreportably large.  At that point, we will clip
11319                          * any values that don't fit.
11320                          *
11321                          * For safety's sake, we also ensure that f_iosize is never reported as
11322                          * being smaller than f_bsize.
11323                          */
11324                         for (shift = 0; shift < 32; shift++) {
11325                                 if ((sfsp->f_blocks >> shift) <= INT_MAX)
11326                                         break;
11327                                 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
11328                                         break;
11329                         }
11330 #define __SHIFT_OR_CLIP(x, s)   ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
11331                         sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
11332                         sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
11333                         sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
11334 #undef __SHIFT_OR_CLIP
11335                         sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
11336                         sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
11337                 } else {
11338                         /* filesystem is small enough to be reported honestly */
11339                         sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
11340                         sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
11341                         sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
11342                         sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
11343                         sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
11344                 }
11345                 sfs.f_files = (user32_long_t)sfsp->f_files;
11346                 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
11347                 sfs.f_fsid = sfsp->f_fsid;
11348                 sfs.f_owner = sfsp->f_owner;
11349                 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11350                         strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11351                 } else {
11352                         strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11353                 }
11354                 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11355                 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11356
11357                 if (partial_copy) {
11358                         copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11359                 }
11360                 error = copyout((caddr_t)&sfs, bufp, copy_size);
11361         }
11362
11363         if (sizep != NULL) {
11364                 *sizep = my_size;
11365         }
11366         return(error);
11367 }
11368
11369 /*
11370  * copy stat structure into user_stat structure.
11371  */
11372 void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
11373 {
11374         bzero(usbp, sizeof(*usbp));
11375
11376         usbp->st_dev = sbp->st_dev;
11377         usbp->st_ino = sbp->st_ino;
11378         usbp->st_mode = sbp->st_mode;
11379         usbp->st_nlink = sbp->st_nlink;
11380         usbp->st_uid = sbp->st_uid;
11381         usbp->st_gid = sbp->st_gid;
11382         usbp->st_rdev = sbp->st_rdev;
11383 #ifndef _POSIX_C_SOURCE
11384         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11385         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11386         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11387         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11388         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11389         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11390 #else
11391         usbp->st_atime = sbp->st_atime;
11392         usbp->st_atimensec = sbp->st_atimensec;
11393         usbp->st_mtime = sbp->st_mtime;
11394         usbp->st_mtimensec = sbp->st_mtimensec;
11395         usbp->st_ctime = sbp->st_ctime;
11396         usbp->st_ctimensec = sbp->st_ctimensec;
11397 #endif
11398         usbp->st_size = sbp->st_size;
11399         usbp->st_blocks = sbp->st_blocks;
11400         usbp->st_blksize = sbp->st_blksize;
11401         usbp->st_flags = sbp->st_flags;
11402         usbp->st_gen = sbp->st_gen;
11403         usbp->st_lspare = sbp->st_lspare;
11404         usbp->st_qspare[0] = sbp->st_qspare[0];
11405         usbp->st_qspare[1] = sbp->st_qspare[1];
11406 }
11407
11408 void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
11409 {
11410         bzero(usbp, sizeof(*usbp));
11411
11412         usbp->st_dev = sbp->st_dev;
11413         usbp->st_ino = sbp->st_ino;
11414         usbp->st_mode = sbp->st_mode;
11415         usbp->st_nlink = sbp->st_nlink;
11416         usbp->st_uid = sbp->st_uid;
11417         usbp->st_gid = sbp->st_gid;
11418         usbp->st_rdev = sbp->st_rdev;
11419 #ifndef _POSIX_C_SOURCE
11420         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11421         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11422         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11423         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11424         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11425         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11426 #else
11427         usbp->st_atime = sbp->st_atime;
11428         usbp->st_atimensec = sbp->st_atimensec;
11429         usbp->st_mtime = sbp->st_mtime;
11430         usbp->st_mtimensec = sbp->st_mtimensec;
11431         usbp->st_ctime = sbp->st_ctime;
11432         usbp->st_ctimensec = sbp->st_ctimensec;
11433 #endif
11434         usbp->st_size = sbp->st_size;
11435         usbp->st_blocks = sbp->st_blocks;
11436         usbp->st_blksize = sbp->st_blksize;
11437         usbp->st_flags = sbp->st_flags;
11438         usbp->st_gen = sbp->st_gen;
11439         usbp->st_lspare = sbp->st_lspare;
11440         usbp->st_qspare[0] = sbp->st_qspare[0];
11441         usbp->st_qspare[1] = sbp->st_qspare[1];
11442 }
11443
11444 /*
11445  * copy stat64 structure into user_stat64 structure.
11446  */
11447 void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
11448 {
11449         bzero(usbp, sizeof(*usbp));
11450
11451         usbp->st_dev = sbp->st_dev;
11452         usbp->st_ino = sbp->st_ino;
11453         usbp->st_mode = sbp->st_mode;
11454         usbp->st_nlink = sbp->st_nlink;
11455         usbp->st_uid = sbp->st_uid;
11456         usbp->st_gid = sbp->st_gid;
11457         usbp->st_rdev = sbp->st_rdev;
11458 #ifndef _POSIX_C_SOURCE
11459         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11460         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11461         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11462         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11463         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11464         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11465         usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11466         usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11467 #else
11468         usbp->st_atime = sbp->st_atime;
11469         usbp->st_atimensec = sbp->st_atimensec;
11470         usbp->st_mtime = sbp->st_mtime;
11471         usbp->st_mtimensec = sbp->st_mtimensec;
11472         usbp->st_ctime = sbp->st_ctime;
11473         usbp->st_ctimensec = sbp->st_ctimensec;
11474         usbp->st_birthtime = sbp->st_birthtime;
11475         usbp->st_birthtimensec = sbp->st_birthtimensec;
11476 #endif
11477         usbp->st_size = sbp->st_size;
11478         usbp->st_blocks = sbp->st_blocks;
11479         usbp->st_blksize = sbp->st_blksize;
11480         usbp->st_flags = sbp->st_flags;
11481         usbp->st_gen = sbp->st_gen;
11482         usbp->st_lspare = sbp->st_lspare;
11483         usbp->st_qspare[0] = sbp->st_qspare[0];
11484         usbp->st_qspare[1] = sbp->st_qspare[1];
11485 }
11486
11487 void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
11488 {
11489         bzero(usbp, sizeof(*usbp));
11490
11491         usbp->st_dev = sbp->st_dev;
11492         usbp->st_ino = sbp->st_ino;
11493         usbp->st_mode = sbp->st_mode;
11494         usbp->st_nlink = sbp->st_nlink;
11495         usbp->st_uid = sbp->st_uid;
11496         usbp->st_gid = sbp->st_gid;
11497         usbp->st_rdev = sbp->st_rdev;
11498 #ifndef _POSIX_C_SOURCE
11499         usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11500         usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11501         usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11502         usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11503         usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11504         usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11505         usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11506         usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11507 #else
11508         usbp->st_atime = sbp->st_atime;
11509         usbp->st_atimensec = sbp->st_atimensec;
11510         usbp->st_mtime = sbp->st_mtime;
11511         usbp->st_mtimensec = sbp->st_mtimensec;
11512         usbp->st_ctime = sbp->st_ctime;
11513         usbp->st_ctimensec = sbp->st_ctimensec;
11514         usbp->st_birthtime = sbp->st_birthtime;
11515         usbp->st_birthtimensec = sbp->st_birthtimensec;
11516 #endif
11517         usbp->st_size = sbp->st_size;
11518         usbp->st_blocks = sbp->st_blocks;
11519         usbp->st_blksize = sbp->st_blksize;
11520         usbp->st_flags = sbp->st_flags;
11521         usbp->st_gen = sbp->st_gen;
11522         usbp->st_lspare = sbp->st_lspare;
11523         usbp->st_qspare[0] = sbp->st_qspare[0];
11524         usbp->st_qspare[1] = sbp->st_qspare[1];
11525 }
11526
11527 /*
11528  * Purge buffer cache for simulating cold starts
11529  */
11530 static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
11531 {
11532         ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
11533
11534         return VNODE_RETURNED;
11535 }
11536
11537 static int vfs_purge_callback(mount_t mp, __unused void * arg)
11538 {
11539         vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
11540
11541         return VFS_RETURNED;
11542 }
11543
11544 int
11545 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
11546 {
11547         if (!kauth_cred_issuser(kauth_cred_get()))
11548                 return EPERM;
11549
11550         vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
11551
11552         return 0;
11553 }
11554
11555 /*
11556  * gets the vnode associated with the (unnamed) snapshot directory
11557  * for a Filesystem. The snapshot directory vnode is returned with
11558  * an iocount on it.
11559  */
11560 int
11561 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
11562 {
11563         return (VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx));
11564 }
11565
11566 /*
11567  * Get the snapshot vnode.
11568  *
11569  * If successful, the call returns with an iocount on *rvpp ,*sdvpp and
11570  * needs nameidone() on ndp.
11571  *
11572  * If the snapshot vnode exists it is returned in ndp->ni_vp.
11573  *
11574  * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
11575  * not needed.
11576  */
11577 static int
11578 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
11579     user_addr_t name, struct nameidata *ndp, int32_t op,
11580 #if !CONFIG_TRIGGERS
11581     __unused
11582 #endif
11583     enum path_operation pathop,
11584     vfs_context_t ctx)
11585 {
11586         int error, i;
11587         caddr_t name_buf;
11588         size_t name_len;
11589         struct vfs_attr vfa;
11590
11591         *sdvpp = NULLVP;
11592         *rvpp = NULLVP;
11593
11594         error = vnode_getfromfd(ctx, dirfd, rvpp);
11595         if (error)
11596                 return (error);
11597
11598         if (!vnode_isvroot(*rvpp)) {
11599                 error = EINVAL;
11600                 goto out;
11601         }
11602
11603         /* Make sure the filesystem supports snapshots */
11604         VFSATTR_INIT(&vfa);
11605         VFSATTR_WANTED(&vfa, f_capabilities);
11606         if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
11607             !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
11608             !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
11609             VOL_CAP_INT_SNAPSHOT)) ||
11610             !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
11611             VOL_CAP_INT_SNAPSHOT))) {
11612                 error = ENOTSUP;
11613                 goto out;
11614         }
11615
11616         error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
11617         if (error)
11618                 goto out;
11619
11620         MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11621         error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11622         if (error)
11623                 goto out1;
11624
11625         /*
11626          * Some sanity checks- name can't be empty, "." or ".." or have slashes.
11627          * (the length returned by copyinstr includes the terminating NUL)
11628          */
11629         if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
11630             (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
11631                 error = EINVAL;
11632                 goto out1;
11633         }
11634         for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++);
11635         if (i < (int)name_len) {
11636                 error = EINVAL;
11637                 goto out1;
11638         }
11639
11640 #if CONFIG_MACF
11641         if (op == CREATE) {
11642                 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
11643                     name_buf);
11644         } else if (op == DELETE) {
11645                 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
11646                     name_buf);
11647         }
11648         if (error)
11649                 goto out1;
11650 #endif
11651
11652         /* Check if the snapshot already exists ... */
11653         NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
11654             UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
11655         ndp->ni_dvp = *sdvpp;
11656
11657         error = namei(ndp);
11658 out1:
11659         FREE(name_buf, M_TEMP);
11660 out:
11661         if (error) {
11662                 if (*sdvpp) {
11663                         vnode_put(*sdvpp);
11664                         *sdvpp = NULLVP;
11665                 }
11666                 if (*rvpp) {
11667                         vnode_put(*rvpp);
11668                         *rvpp = NULLVP;
11669                 }
11670         }
11671         return (error);
11672 }
11673
11674 /*
11675  * create a filesystem snapshot (for supporting filesystems)
11676  *
11677  * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
11678  * We get to the (unnamed) snapshot directory vnode and create the vnode
11679  * for the snapshot in it.
11680  *
11681  * Restrictions:
11682  *
11683  *    a) Passed in name for snapshot cannot have slashes.
11684  *    b) name can't be "." or ".."
11685  *
11686  * Since this requires superuser privileges, vnode_authorize calls are not
11687  * made.
11688  */
11689 static int
11690 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
11691     vfs_context_t ctx)
11692 {
11693         vnode_t rvp, snapdvp;
11694         int error;
11695         struct nameidata namend;
11696
11697         error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
11698             OP_LINK, ctx);
11699         if (error)
11700                 return (error);
11701
11702         if (namend.ni_vp) {
11703                 vnode_put(namend.ni_vp);
11704                 error = EEXIST;
11705         } else {
11706                 struct vnode_attr va;
11707                 vnode_t vp = NULLVP;
11708
11709                 VATTR_INIT(&va);
11710                 VATTR_SET(&va, va_type, VREG);
11711                 VATTR_SET(&va, va_mode, 0);
11712
11713                 error = vn_create(snapdvp, &vp, &namend, &va,
11714                     VN_CREATE_NOAUTH  | VN_CREATE_NOINHERIT, 0, NULL, ctx);
11715                 if (!error && vp)
11716                         vnode_put(vp);
11717         }
11718
11719         nameidone(&namend);
11720         vnode_put(snapdvp);
11721         vnode_put(rvp);
11722         return (error);
11723 }
11724
11725 /*
11726  * Delete a Filesystem snapshot
11727  *
11728  * get the vnode for the unnamed snapshot directory and the snapshot and
11729  * delete the snapshot.
11730  */
11731 static int
11732 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
11733     vfs_context_t ctx)
11734 {
11735         vnode_t rvp, snapdvp;
11736         int error;
11737         struct nameidata namend;
11738
11739         error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
11740             OP_UNLINK, ctx);
11741         if (error)
11742                 goto out;
11743
11744         error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
11745             VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
11746
11747         vnode_put(namend.ni_vp);
11748         nameidone(&namend);
11749         vnode_put(snapdvp);
11750         vnode_put(rvp);
11751 out:
11752         return (error);
11753 }
11754
11755 /*
11756  * Revert a filesystem to a snapshot
11757  *
11758  * Marks the filesystem to revert to the given snapshot on next mount.
11759  */
11760 static int
11761 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
11762                 vfs_context_t ctx)
11763 {
11764     int error;
11765     vnode_t rvp;
11766     mount_t mp;
11767     struct fs_snapshot_revert_args revert_data;
11768     struct componentname cnp;
11769     caddr_t name_buf;
11770     size_t name_len;
11771
11772     error = vnode_getfromfd(ctx, dirfd, &rvp);
11773     if (error) {
11774         return (error);
11775     }
11776     mp = vnode_mount(rvp);
11777
11778     MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11779     error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11780     if (error) {
11781         FREE(name_buf, M_TEMP);
11782         vnode_put(rvp);
11783         return (error);
11784     }
11785
11786 #if CONFIG_MACF
11787     error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
11788     if (error) {
11789         FREE(name_buf, M_TEMP);
11790         vnode_put(rvp);
11791         return (error);
11792     }
11793 #endif
11794
11795     /*
11796      * Grab mount_iterref so that we can release the vnode,
11797      * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
11798      */
11799     error = mount_iterref (mp, 0);
11800     vnode_put(rvp);
11801     if (error) {
11802         FREE(name_buf, M_TEMP);
11803         return (error);
11804     }
11805
11806     memset(&cnp, 0, sizeof(cnp));
11807     cnp.cn_pnbuf = (char *)name_buf;
11808     cnp.cn_nameiop = LOOKUP;
11809     cnp.cn_flags = ISLASTCN | HASBUF;
11810     cnp.cn_pnlen = MAXPATHLEN;
11811     cnp.cn_nameptr = cnp.cn_pnbuf;
11812     cnp.cn_namelen = (int)name_len;
11813     revert_data.sr_cnp = &cnp;
11814
11815     error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
11816     mount_iterdrop(mp);
11817     FREE(name_buf, M_TEMP);
11818
11819     if (error) {
11820         /* If there was any error, try again using VNOP_IOCTL */
11821
11822         vnode_t snapdvp;
11823         struct nameidata namend;
11824
11825         error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
11826                                    OP_LOOKUP, ctx);
11827         if (error) {
11828             return (error);
11829         }
11830
11831
11832         error = VNOP_IOCTL(namend.ni_vp, APFSIOC_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
11833                            0, ctx);
11834
11835         vnode_put(namend.ni_vp);
11836         nameidone(&namend);
11837         vnode_put(snapdvp);
11838         vnode_put(rvp);
11839     }
11840
11841         return (error);
11842 }
11843
11844 /*
11845  * rename a Filesystem snapshot
11846  *
11847  * get the vnode for the unnamed snapshot directory and the snapshot and
11848  * rename the snapshot. This is a very specialised (and simple) case of
11849  * rename(2) (which has to deal with a lot more complications). It differs
11850  * slightly from rename(2) in that EEXIST is returned if the new name exists.
11851  */
11852 static int
11853 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
11854     __unused uint32_t flags, vfs_context_t ctx)
11855 {
11856         vnode_t rvp, snapdvp;
11857         int error, i;
11858         caddr_t newname_buf;
11859         size_t name_len;
11860         vnode_t fvp;
11861         struct nameidata *fromnd, *tond;
11862         /* carving out a chunk for structs that are too big to be on stack. */
11863         struct {
11864                 struct nameidata from_node;
11865                 struct nameidata to_node;
11866         } * __rename_data;
11867
11868         MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
11869         fromnd = &__rename_data->from_node;
11870         tond = &__rename_data->to_node;
11871
11872         error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
11873             OP_UNLINK, ctx);
11874         if (error)
11875                 goto out;
11876         fvp  = fromnd->ni_vp;
11877
11878         MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11879         error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
11880         if (error)
11881                 goto out1;
11882
11883         /*
11884          * Some sanity checks- new name can't be empty, "." or ".." or have
11885          * slashes.
11886          * (the length returned by copyinstr includes the terminating NUL)
11887          *
11888          * The FS rename VNOP is suppossed to handle this but we'll pick it
11889          * off here itself.
11890          */
11891         if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
11892             (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
11893                 error = EINVAL;
11894                 goto out1;
11895         }
11896         for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++);
11897         if (i < (int)name_len) {
11898                 error = EINVAL;
11899                 goto out1;
11900         }
11901
11902 #if CONFIG_MACF
11903         error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
11904             newname_buf);
11905         if (error)
11906                 goto out1;
11907 #endif
11908
11909         NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
11910             UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
11911         tond->ni_dvp = snapdvp;
11912
11913         error = namei(tond);
11914         if (error) {
11915                 goto out2;
11916         } else if (tond->ni_vp) {
11917                 /*
11918                  * snapshot rename behaves differently than rename(2) - if the
11919                  * new name exists, EEXIST is returned.
11920                  */
11921                 vnode_put(tond->ni_vp);
11922                 error = EEXIST;
11923                 goto out2;
11924         }
11925
11926         error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
11927             &tond->ni_cnd, ctx);
11928
11929 out2:
11930         nameidone(tond);
11931 out1:
11932         FREE(newname_buf, M_TEMP);
11933         vnode_put(fvp);
11934         vnode_put(snapdvp);
11935         vnode_put(rvp);
11936         nameidone(fromnd);
11937 out:
11938         FREE(__rename_data, M_TEMP);
11939         return (error);
11940 }
11941
11942 /*
11943  * Mount a Filesystem snapshot
11944  *
11945  * get the vnode for the unnamed snapshot directory and the snapshot and
11946  * mount the snapshot.
11947  */
11948 static int
11949 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
11950     __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
11951 {
11952         vnode_t rvp, snapdvp, snapvp, vp, pvp;
11953         int error;
11954         struct nameidata *snapndp, *dirndp;
11955         /* carving out a chunk for structs that are too big to be on stack. */
11956         struct {
11957                 struct nameidata snapnd;
11958                 struct nameidata dirnd;
11959         } * __snapshot_mount_data;
11960
11961         MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
11962             M_TEMP, M_WAITOK);
11963         snapndp = &__snapshot_mount_data->snapnd;
11964         dirndp = &__snapshot_mount_data->dirnd;
11965
11966         error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
11967             OP_LOOKUP, ctx);
11968         if (error)
11969                 goto out;
11970
11971         snapvp  = snapndp->ni_vp;
11972         if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
11973                 error = EIO;
11974                 goto out1;
11975         }
11976
11977         /* Get the vnode to be covered */
11978         NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
11979             UIO_USERSPACE, directory, ctx);
11980         error = namei(dirndp);
11981         if (error)
11982                 goto out1;
11983
11984         vp = dirndp->ni_vp;
11985         pvp = dirndp->ni_dvp;
11986
11987         if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
11988                 error = EINVAL;
11989         } else {
11990                 mount_t mp = vnode_mount(rvp);
11991                 struct fs_snapshot_mount_args smnt_data;
11992
11993                 smnt_data.sm_mp  = mp;
11994                 smnt_data.sm_cnp = &snapndp->ni_cnd;
11995                 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
11996                    &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), flags & MNT_DONTBROWSE,
11997                    KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
11998         }
11999
12000         vnode_put(vp);
12001         vnode_put(pvp);
12002         nameidone(dirndp);
12003 out1:
12004         vnode_put(snapvp);
12005         vnode_put(snapdvp);
12006         vnode_put(rvp);
12007         nameidone(snapndp);
12008 out:
12009         FREE(__snapshot_mount_data, M_TEMP);
12010         return (error);
12011 }
12012
12013 /*
12014  * Root from a snapshot of the filesystem
12015  *
12016  * Marks the filesystem to root from the given snapshot on next boot.
12017  */
12018 static int
12019 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
12020                 vfs_context_t ctx)
12021 {
12022     int error;
12023     vnode_t rvp;
12024     mount_t mp;
12025     struct fs_snapshot_root_args root_data;
12026     struct componentname cnp;
12027     caddr_t name_buf;
12028     size_t name_len;
12029
12030     error = vnode_getfromfd(ctx, dirfd, &rvp);
12031     if (error) {
12032         return (error);
12033     }
12034     mp = vnode_mount(rvp);
12035
12036     MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
12037     error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
12038     if (error) {
12039         FREE(name_buf, M_TEMP);
12040         vnode_put(rvp);
12041         return (error);
12042     }
12043
12044     // XXX MAC checks ?
12045
12046     /*
12047      * Grab mount_iterref so that we can release the vnode,
12048      * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
12049      */
12050     error = mount_iterref (mp, 0);
12051     vnode_put(rvp);
12052     if (error) {
12053         FREE(name_buf, M_TEMP);
12054         return (error);
12055     }
12056
12057     memset(&cnp, 0, sizeof(cnp));
12058     cnp.cn_pnbuf = (char *)name_buf;
12059     cnp.cn_nameiop = LOOKUP;
12060     cnp.cn_flags = ISLASTCN | HASBUF;
12061     cnp.cn_pnlen = MAXPATHLEN;
12062     cnp.cn_nameptr = cnp.cn_pnbuf;
12063     cnp.cn_namelen = (int)name_len;
12064     root_data.sr_cnp = &cnp;
12065
12066     error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
12067
12068     mount_iterdrop(mp);
12069     FREE(name_buf, M_TEMP);
12070
12071     return (error);
12072 }
12073
12074 /*
12075  * FS snapshot operations dispatcher
12076  */
12077 int
12078 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
12079     __unused int32_t *retval)
12080 {
12081         int error;
12082         vfs_context_t ctx = vfs_context_current();
12083
12084         AUDIT_ARG(fd, uap->dirfd);
12085         AUDIT_ARG(value32, uap->op);
12086
12087         error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
12088         if (error)
12089                 return (error);
12090
12091         switch (uap->op) {
12092         case SNAPSHOT_OP_CREATE:
12093                 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
12094                 break;
12095         case SNAPSHOT_OP_DELETE:
12096                 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
12097                 break;
12098         case SNAPSHOT_OP_RENAME:
12099                 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
12100                     uap->flags, ctx);
12101                 break;
12102         case SNAPSHOT_OP_MOUNT:
12103                 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
12104                     uap->data, uap->flags, ctx);
12105                 break;
12106     case SNAPSHOT_OP_REVERT:
12107         error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
12108         break;
12109 #if CONFIG_MNT_ROOTSNAP
12110         case SNAPSHOT_OP_ROOT:
12111                 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
12112                 break;
12113 #endif /* CONFIG_MNT_ROOTSNAP */
12114         default:
12115                 error = ENOSYS;
12116         }
12117
12118         return (error);
12119 }