apple/xnu.git: bsd/vfs/vfs_syscalls.c (blob 70f798e860bb3effc1b16050ffef63e3159e89de)
1 /*
2 * Copyright (c) 1995-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
66 */
67 /*
68 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
69 * support for mandatory and extensible security protections. This notice
70 * is included in support of clause 2.2 (b) of the Apple Public License,
71 * Version 2.0.
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/namei.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/stat.h>
81 #include <sys/vnode_internal.h>
82 #include <sys/mount_internal.h>
83 #include <sys/proc_internal.h>
84 #include <sys/kauth.h>
85 #include <sys/uio_internal.h>
86 #include <sys/malloc.h>
87 #include <sys/mman.h>
88 #include <sys/dirent.h>
89 #include <sys/attr.h>
90 #include <sys/sysctl.h>
91 #include <sys/ubc.h>
92 #include <sys/quota.h>
93 #include <sys/kdebug.h>
94 #include <sys/fsevents.h>
95 #include <sys/imgsrc.h>
96 #include <sys/sysproto.h>
97 #include <sys/xattr.h>
98 #include <sys/fcntl.h>
99 #include <sys/fsctl.h>
100 #include <sys/ubc_internal.h>
101 #include <sys/disk.h>
102 #include <sys/content_protection.h>
103 #include <sys/clonefile.h>
104 #include <sys/snapshot.h>
105 #include <sys/priv.h>
106 #include <machine/cons.h>
107 #include <machine/limits.h>
108 #include <miscfs/specfs/specdev.h>
109
110 #include <security/audit/audit.h>
111 #include <bsm/audit_kevents.h>
112
113 #include <mach/mach_types.h>
114 #include <kern/kern_types.h>
115 #include <kern/kalloc.h>
116 #include <kern/task.h>
117
118 #include <vm/vm_pageout.h>
119 #include <vm/vm_protos.h>
120
121 #include <libkern/OSAtomic.h>
122 #include <pexpert/pexpert.h>
123 #include <IOKit/IOBSD.h>
124
125 #if ROUTEFS
126 #include <miscfs/routefs/routefs.h>
127 #endif /* ROUTEFS */
128
129 #if CONFIG_MACF
130 #include <security/mac.h>
131 #include <security/mac_framework.h>
132 #endif
133
134 #if CONFIG_FSE
135 #define GET_PATH(x) \
136 (x) = get_pathbuff();
137 #define RELEASE_PATH(x) \
138 release_pathbuff(x);
139 #else
140 #define GET_PATH(x) \
141 MALLOC_ZONE((x), char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
142 #define RELEASE_PATH(x) \
143 FREE_ZONE((x), MAXPATHLEN, M_NAMEI);
144 #endif /* CONFIG_FSE */
145
146 /* struct for checkdirs iteration */
147 struct cdirargs {
148 vnode_t olddp;
149 vnode_t newdp;
150 };
151 /* callback for checkdirs iteration */
152 static int checkdirs_callback(proc_t p, void * arg);
153
154 static int change_dir(struct nameidata *ndp, vfs_context_t ctx);
155 static int checkdirs(vnode_t olddp, vfs_context_t ctx);
156 void enablequotas(struct mount *mp, vfs_context_t ctx);
157 static int getfsstat_callback(mount_t mp, void * arg);
158 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
159 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
160 static int sync_callback(mount_t, void *);
161 static void sync_thread(void *, __unused wait_result_t);
162 static int sync_async(int);
163 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
164 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
165 boolean_t partial_copy);
166 static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
167 user_addr_t bufp);
168 static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
169 static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
170 struct componentname *cnp, user_addr_t fsmountargs,
171 int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount,
172 vfs_context_t ctx);
173 void vfs_notify_mount(vnode_t pdvp);
174
175 int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth);
176
177 struct fd_vn_data * fg_vn_data_alloc(void);
178
179 /*
180 * Max retries for ENOENT returns from vn_authorize_{rmdir, unlink, rename}
181 * Concurrent lookups (or lookups by ids) on hard links can cause the
182 * vn_getpath (which does not re-enter the filesystem as vn_getpath_fsenter
183 * does) to return ENOENT as the path cannot be returned from the name cache
184 * alone. We have no option but to retry and hope to get one namei->reverse path
185 * generation done without an intervening lookup (or lookup by id) on the hard link
186 * item. This is only an issue for MAC hooks that cannot re-enter the filesystem,
187 * which currently are the MAC hooks for rename, unlink, and rmdir.
188 */
189 #define MAX_AUTHORIZE_ENOENT_RETRIES 1024
190
191 static int rmdirat_internal(vfs_context_t, int, user_addr_t, enum uio_seg);
192
193 static int fsgetpath_internal(vfs_context_t, int, uint64_t, vm_size_t, caddr_t, int *);
194
195 #ifdef CONFIG_IMGSRC_ACCESS
196 static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
197 static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
198 static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
199 static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
200 static void mount_end_update(mount_t mp);
201 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
202 #endif /* CONFIG_IMGSRC_ACCESS */
203
204 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
205
206 __private_extern__
207 int sync_internal(void);
208
209 __private_extern__
210 int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
211
212 extern lck_grp_t *fd_vn_lck_grp;
213 extern lck_grp_attr_t *fd_vn_lck_grp_attr;
214 extern lck_attr_t *fd_vn_lck_attr;
215
216 /*
217 * incremented each time a mount or unmount operation occurs;
218 * used to invalidate the cached value of the rootvp in the
219 * mount structure utilized by cache_lookup_path
220 */
221 uint32_t mount_generation = 0;
222
223 /* counts number of mount and unmount operations */
224 unsigned int vfs_nummntops=0;
225
226 extern const struct fileops vnops;
227 #if CONFIG_APPLEDOUBLE
228 extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
229 #endif /* CONFIG_APPLEDOUBLE */
230
231 /*
232 * Virtual File System System Calls
233 */
234
235 #if NFSCLIENT || DEVFS || ROUTEFS
236 /*
237 * Private in-kernel mounting SPI (for NFS, devfs, and routefs; not exported)
238 */
239 __private_extern__
240 boolean_t
241 vfs_iskernelmount(mount_t mp)
242 {
243 return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? TRUE : FALSE);
244 }
245
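/*
 * In-kernel mount entry point: if no covered vnode is supplied, resolve
 * 'path' with namei() to find it (and its parent), then hand off to
 * mount_common() with kernelmount == TRUE.
 */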
246 __private_extern__
247 int
248 kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path,
249 void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx)
250 {
251 struct nameidata nd;
252 boolean_t did_namei;
253 int error;
254
255 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
256 UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx);
257
258 /*
259 * Get the vnode to be covered if it's not supplied
260 */
261 if (vp == NULLVP) {
262 error = namei(&nd);
263 if (error)
264 return (error);
265 vp = nd.ni_vp;
266 pvp = nd.ni_dvp;
267 did_namei = TRUE;
268 } else {
269 char *pnbuf = CAST_DOWN(char *, path);
270
271 nd.ni_cnd.cn_pnbuf = pnbuf;
272 nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1;
273 did_namei = FALSE;
274 }
275
276 error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data),
277 syscall_flags, kern_flags, NULL, TRUE, ctx);
278
279 if (did_namei) {
280 vnode_put(vp);
281 vnode_put(pvp);
282 nameidone(&nd);
283 }
284
285 return (error);
286 }
287 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
288
289 /*
290 * Mount a file system.
291 */
292 /* ARGSUSED */
293 int
294 mount(proc_t p, struct mount_args *uap, __unused int32_t *retval)
295 {
296 struct __mac_mount_args muap;
297
298 muap.type = uap->type;
299 muap.path = uap->path;
300 muap.flags = uap->flags;
301 muap.data = uap->data;
302 muap.mac_p = USER_ADDR_NULL;
303 return (__mac_mount(p, &muap, retval));
304 }
305
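/*
 * Notify interested parties of a new mount: post a VQ_MOUNT vfs event
 * and a NOTE_WRITE knote on the parent of the covered vnode.
 */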
306 void
307 vfs_notify_mount(vnode_t pdvp)
308 {
309 vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL);
310 lock_vnode_and_post(pdvp, NOTE_WRITE);
311 }
312
313 /*
314 * __mac_mount:
315 * Mount a file system taking into account MAC label behavior.
316 * See mount(2) man page for more information
317 *
318 * Parameters: p Process requesting the mount
319 * uap User argument descriptor (see below)
320 * retval (ignored)
321 *
322 * Indirect: uap->type Filesystem type
323 * uap->path Path to mount
324 * uap->data Mount arguments
325 * uap->mac_p MAC info
326 * uap->flags Mount flags
327 *
328 *
329 * Returns: 0 Success
330 * !0 Not success
331 */
332 boolean_t root_fs_upgrade_try = FALSE;
333
334 int
335 __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval)
336 {
337 vnode_t pvp = NULL;
338 vnode_t vp = NULL;
339 int need_nameidone = 0;
340 vfs_context_t ctx = vfs_context_current();
341 char fstypename[MFSNAMELEN];
342 struct nameidata nd;
343 size_t dummy=0;
344 char *labelstr = NULL;
345 int flags = uap->flags;
346 int error;
347 #if CONFIG_IMGSRC_ACCESS || CONFIG_MACF
348 boolean_t is_64bit = IS_64BIT_PROCESS(p);
349 #else
350 #pragma unused(p)
351 #endif
352 /*
353 * Get the fs type name from user space
354 */
355 error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy);
356 if (error)
357 return (error);
358
359 /*
360 * Get the vnode to be covered
361 */
362 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
363 UIO_USERSPACE, uap->path, ctx);
364 error = namei(&nd);
365 if (error) {
366 goto out;
367 }
368 need_nameidone = 1;
369 vp = nd.ni_vp;
370 pvp = nd.ni_dvp;
371
372 #ifdef CONFIG_IMGSRC_ACCESS
373 /* Mounting image source cannot be batched with other operations */
374 if (flags == MNT_IMGSRC_BY_INDEX) {
375 error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename,
376 ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX));
377 goto out;
378 }
379 #endif /* CONFIG_IMGSRC_ACCESS */
380
381 #if CONFIG_MACF
382 /*
383 * Get the label string (if any) from user space
384 */
385 if (uap->mac_p != USER_ADDR_NULL) {
386 struct user_mac mac;
387 size_t ulen = 0;
388
389 if (is_64bit) {
390 struct user64_mac mac64;
391 error = copyin(uap->mac_p, &mac64, sizeof(mac64));
392 mac.m_buflen = mac64.m_buflen;
393 mac.m_string = mac64.m_string;
394 } else {
395 struct user32_mac mac32;
396 error = copyin(uap->mac_p, &mac32, sizeof(mac32));
397 mac.m_buflen = mac32.m_buflen;
398 mac.m_string = mac32.m_string;
399 }
400 if (error)
401 goto out;
402 if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) ||
403 (mac.m_buflen < 2)) {
404 error = EINVAL;
405 goto out;
406 }
407 MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK);
408 error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen);
409 if (error) {
410 goto out;
411 }
412 AUDIT_ARG(mac_string, labelstr);
413 }
414 #endif /* CONFIG_MACF */
415
416 AUDIT_ARG(fflags, flags);
417
418 #if SECURE_KERNEL
419 if (flags & MNT_UNION) {
420 /* No union mounts on release kernels */
421 error = EPERM;
422 goto out;
423 }
424 #endif
425
426 if ((vp->v_flag & VROOT) &&
427 (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
428 if (!(flags & MNT_UNION)) {
429 flags |= MNT_UPDATE;
430 }
431 else {
432 /*
433 * For a union mount on '/', treat it as a fresh
434 * mount instead of an update.
435 * Otherwise, union mounting on '/' used to panic the
436 * system, since mnt_vnodecovered was found to
437 * be NULL for '/', which unionlookup requires
438 * after it gets ENOENT on the union mount.
439 */
440 flags = (flags & ~(MNT_UPDATE));
441 }
442
443 #if SECURE_KERNEL
444 if ((flags & MNT_RDONLY) == 0) {
445 /* Release kernels are not allowed to mount "/" as rw */
446 error = EPERM;
447 goto out;
448 }
449 #endif
450 /*
451 * See 7392553 for more details on why this check exists.
452 * Suffice to say: If this check is ON and something tries
453 * to mount the rootFS RW, we'll turn off the codesign
454 * bitmap optimization.
455 */
456 #if CHECK_CS_VALIDATION_BITMAP
457 if ((flags & MNT_RDONLY) == 0 ) {
458 root_fs_upgrade_try = TRUE;
459 }
460 #endif
461 }
462
463 error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0,
464 labelstr, FALSE, ctx);
465
466 out:
467
468 #if CONFIG_MACF
469 if (labelstr)
470 FREE(labelstr, M_MACTEMP);
471 #endif /* CONFIG_MACF */
472
473 if (vp) {
474 vnode_put(vp);
475 }
476 if (pvp) {
477 vnode_put(pvp);
478 }
479 if (need_nameidone) {
480 nameidone(&nd);
481 }
482
483 return (error);
484 }
485
486 /*
487 * common mount implementation (final stage of mounting)
488 *
489 * Arguments:
490 * fstypename file system type (i.e., its VFS name)
491 * pvp parent of covered vnode
492 * vp covered vnode
493 * cnp component name (i.e., path) of covered vnode
494 * flags generic mount flags
495 * fsmountargs file system specific data
496 * labelstr optional MAC label
497 * kernelmount TRUE for mounts initiated from inside the kernel
498 * ctx caller's context
499 */
500 static int
501 mount_common(char *fstypename, vnode_t pvp, vnode_t vp,
502 struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags,
503 char *labelstr, boolean_t kernelmount, vfs_context_t ctx)
504 {
505 #if !CONFIG_MACF
506 #pragma unused(labelstr)
507 #endif
508 struct vnode *devvp = NULLVP;
509 struct vnode *device_vnode = NULLVP;
510 #if CONFIG_MACF
511 struct vnode *rvp;
512 #endif
513 struct mount *mp;
514 struct vfstable *vfsp = (struct vfstable *)0;
515 struct proc *p = vfs_context_proc(ctx);
516 int error, flag = 0;
517 user_addr_t devpath = USER_ADDR_NULL;
518 int ronly = 0;
519 int mntalloc = 0;
520 boolean_t vfsp_ref = FALSE;
521 boolean_t is_rwlock_locked = FALSE;
522 boolean_t did_rele = FALSE;
523 boolean_t have_usecount = FALSE;
524
525 /*
526 * Process an update for an existing mount
527 */
528 if (flags & MNT_UPDATE) {
529 if ((vp->v_flag & VROOT) == 0) {
530 error = EINVAL;
531 goto out1;
532 }
533 mp = vp->v_mount;
534
535 /* if an unmount is in progress, return an error */
536 mount_lock_spin(mp);
537 if (mp->mnt_lflag & MNT_LUNMOUNT) {
538 mount_unlock(mp);
539 error = EBUSY;
540 goto out1;
541 }
542 mount_unlock(mp);
543 lck_rw_lock_exclusive(&mp->mnt_rwlock);
544 is_rwlock_locked = TRUE;
545 /*
546 * We only allow the filesystem to be reloaded if it
547 * is currently mounted read-only.
548 */
549 if ((flags & MNT_RELOAD) &&
550 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
551 error = ENOTSUP;
552 goto out1;
553 }
554
555 /*
556 * If content protection is enabled, update mounts are not
557 * allowed to turn it off.
558 */
559 if ((mp->mnt_flag & MNT_CPROTECT) &&
560 ((flags & MNT_CPROTECT) == 0)) {
561 error = EINVAL;
562 goto out1;
563 }
564
565 #ifdef CONFIG_IMGSRC_ACCESS
566 /* Can't downgrade the backer of the root FS */
567 if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
568 (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) {
569 error = ENOTSUP;
570 goto out1;
571 }
572 #endif /* CONFIG_IMGSRC_ACCESS */
573
574 /*
575 * Only root, or the user that did the original mount is
576 * permitted to update it.
577 */
578 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
579 (error = suser(vfs_context_ucred(ctx), &p->p_acflag))) {
580 goto out1;
581 }
582 #if CONFIG_MACF
583 error = mac_mount_check_remount(ctx, mp);
584 if (error != 0) {
585 goto out1;
586 }
587 #endif
588 /*
589 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV,
590 * and MNT_NOEXEC if mount point is already MNT_NOEXEC.
591 */
592 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
593 flags |= MNT_NOSUID | MNT_NODEV;
594 if (mp->mnt_flag & MNT_NOEXEC)
595 flags |= MNT_NOEXEC;
596 }
597 flag = mp->mnt_flag;
598
599
600
601 mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
602
603 vfsp = mp->mnt_vtable;
604 goto update;
605 }
606 /*
607 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and
608 * MNT_NOEXEC if mount point is already MNT_NOEXEC.
609 */
610 if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) {
611 flags |= MNT_NOSUID | MNT_NODEV;
612 if (vp->v_mount->mnt_flag & MNT_NOEXEC)
613 flags |= MNT_NOEXEC;
614 }
615
616 /* XXXAUDIT: Should we capture the type on the error path as well? */
617 AUDIT_ARG(text, fstypename);
618 mount_list_lock();
619 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
620 if (!strncmp(vfsp->vfc_name, fstypename, MFSNAMELEN)) {
621 vfsp->vfc_refcount++;
622 vfsp_ref = TRUE;
623 break;
624 }
625 mount_list_unlock();
626 if (vfsp == NULL) {
627 error = ENODEV;
628 goto out1;
629 }
630
631 /*
632 * VFC_VFSLOCALARGS is not currently supported for kernel mounts
633 */
634 if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) {
635 error = EINVAL; /* unsupported request */
636 goto out1;
637 }
638
639 error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0));
640 if (error != 0) {
641 goto out1;
642 }
643
644 /*
645 * Allocate and initialize the filesystem (mount_t)
646 */
647 MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount),
648 M_MOUNT, M_WAITOK);
649 bzero((char *)mp, (u_int32_t)sizeof(struct mount));
650 mntalloc = 1;
651
652 /* Initialize the default IO constraints */
653 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
654 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32;
655 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
656 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
657 mp->mnt_devblocksize = DEV_BSIZE;
658 mp->mnt_alignmentmask = PAGE_MASK;
659 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
660 mp->mnt_ioscale = 1;
661 mp->mnt_ioflags = 0;
662 mp->mnt_realrootvp = NULLVP;
663 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
664
665 TAILQ_INIT(&mp->mnt_vnodelist);
666 TAILQ_INIT(&mp->mnt_workerqueue);
667 TAILQ_INIT(&mp->mnt_newvnodes);
668 mount_lock_init(mp);
669 lck_rw_lock_exclusive(&mp->mnt_rwlock);
670 is_rwlock_locked = TRUE;
671 mp->mnt_op = vfsp->vfc_vfsops;
672 mp->mnt_vtable = vfsp;
673 //mp->mnt_stat.f_type = vfsp->vfc_typenum;
674 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
675 strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
676 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
677 mp->mnt_vnodecovered = vp;
678 mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx));
679 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1;
680 mp->mnt_devbsdunit = 0;
681
682 /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */
683 vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE);
684
685 #if NFSCLIENT || DEVFS || ROUTEFS
686 if (kernelmount)
687 mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT;
688 if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0)
689 mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT;
690 #endif /* NFSCLIENT || DEVFS || ROUTEFS */
691
692 update:
693 /*
694 * Set the mount level flags.
695 */
696 if (flags & MNT_RDONLY)
697 mp->mnt_flag |= MNT_RDONLY;
698 else if (mp->mnt_flag & MNT_RDONLY) {
699 // disallow read/write upgrades of file systems that
700 // had the TYPENAME_OVERRIDE feature set.
701 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
702 error = EPERM;
703 goto out1;
704 }
705 mp->mnt_kern_flag |= MNTK_WANTRDWR;
706 }
707 mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
708 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
709 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
710 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
711 MNT_QUARANTINE | MNT_CPROTECT);
712
713 #if SECURE_KERNEL
714 #if !CONFIG_MNT_SUID
715 /*
716 * On release builds of iOS based platforms, always enforce NOSUID and NODEV on
717 * all mounts. We do this here because we can catch update mounts as well as
718 * non-update mounts in this case.
719 */
720 mp->mnt_flag |= (MNT_NOSUID);
721 #endif
722 #endif
723
724 mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
725 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC |
726 MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE |
727 MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME |
728 MNT_QUARANTINE | MNT_CPROTECT);
729
730 #if CONFIG_MACF
731 if (flags & MNT_MULTILABEL) {
732 if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) {
733 error = EINVAL;
734 goto out1;
735 }
736 mp->mnt_flag |= MNT_MULTILABEL;
737 }
738 #endif
739 /*
740 * Process device path for local file systems if requested
741 */
742 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS &&
743 !(internal_flags & KERNEL_MOUNT_SNAPSHOT)) {
744 if (vfs_context_is64bit(ctx)) {
745 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
746 goto out1;
747 fsmountargs += sizeof(devpath);
748 } else {
749 user32_addr_t tmp;
750 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
751 goto out1;
752 /* munge into LP64 addr */
753 devpath = CAST_USER_ADDR_T(tmp);
754 fsmountargs += sizeof(tmp);
755 }
756
757 /* Lookup device and authorize access to it */
758 if ((devpath)) {
759 struct nameidata nd;
760
761 NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx);
762 if ( (error = namei(&nd)) )
763 goto out1;
764
765 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
766 devvp = nd.ni_vp;
767
768 nameidone(&nd);
769
770 if (devvp->v_type != VBLK) {
771 error = ENOTBLK;
772 goto out2;
773 }
774 if (major(devvp->v_rdev) >= nblkdev) {
775 error = ENXIO;
776 goto out2;
777 }
778 /*
779 * If mount by non-root, then verify that user has necessary
780 * permissions on the device.
781 */
782 if (suser(vfs_context_ucred(ctx), NULL) != 0) {
783 mode_t accessmode = KAUTH_VNODE_READ_DATA;
784
785 if ((mp->mnt_flag & MNT_RDONLY) == 0)
786 accessmode |= KAUTH_VNODE_WRITE_DATA;
787 if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0)
788 goto out2;
789 }
790 }
791 /* On first mount, preflight and open device */
792 if (devpath && ((flags & MNT_UPDATE) == 0)) {
793 if ( (error = vnode_ref(devvp)) )
794 goto out2;
795 /*
796 * Disallow multiple mounts of the same device.
797 * Disallow mounting of a device that is currently in use
798 * (except for root, which might share swap device for miniroot).
799 * Flush out any old buffers remaining from a previous use.
800 */
801 if ( (error = vfs_mountedon(devvp)) )
802 goto out3;
803
804 if (vcount(devvp) > 1 && !(vfs_flags(mp) & MNT_ROOTFS)) {
805 error = EBUSY;
806 goto out3;
807 }
808 if ( (error = VNOP_FSYNC(devvp, MNT_WAIT, ctx)) ) {
809 error = ENOTBLK;
810 goto out3;
811 }
812 if ( (error = buf_invalidateblks(devvp, BUF_WRITE_DATA, 0, 0)) )
813 goto out3;
814
815 ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
816 #if CONFIG_MACF
817 error = mac_vnode_check_open(ctx,
818 devvp,
819 ronly ? FREAD : FREAD|FWRITE);
820 if (error)
821 goto out3;
822 #endif /* MAC */
823 if ( (error = VNOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, ctx)) )
824 goto out3;
825
826 mp->mnt_devvp = devvp;
827 device_vnode = devvp;
828
829 } else if ((mp->mnt_flag & MNT_RDONLY) &&
830 (mp->mnt_kern_flag & MNTK_WANTRDWR) &&
831 (device_vnode = mp->mnt_devvp)) {
832 dev_t dev;
833 int maj;
834 /*
835 * If upgrade to read-write by non-root, then verify
836 * that user has necessary permissions on the device.
837 */
838 vnode_getalways(device_vnode);
839
840 if (suser(vfs_context_ucred(ctx), NULL) &&
841 (error = vnode_authorize(device_vnode, NULL,
842 KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA,
843 ctx)) != 0) {
844 vnode_put(device_vnode);
845 goto out2;
846 }
847
848 /* Tell the device that we're upgrading */
849 dev = (dev_t)device_vnode->v_rdev;
850 maj = major(dev);
851
852 if ((u_int)maj >= (u_int)nblkdev)
853 panic("Volume mounted on a device with invalid major number.");
854
855 error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p);
856 vnode_put(device_vnode);
857 device_vnode = NULLVP;
858 if (error != 0) {
859 goto out2;
860 }
861 }
862 }
863 #if CONFIG_MACF
864 if ((flags & MNT_UPDATE) == 0) {
865 mac_mount_label_init(mp);
866 mac_mount_label_associate(ctx, mp);
867 }
868 if (labelstr) {
869 if ((flags & MNT_UPDATE) != 0) {
870 error = mac_mount_check_label_update(ctx, mp);
871 if (error != 0)
872 goto out3;
873 }
874 }
875 #endif
876 /*
877 * Mount the filesystem.
878 */
879 if (internal_flags & KERNEL_MOUNT_SNAPSHOT) {
880 error = VFS_IOCTL(mp, VFSIOC_MOUNT_SNAPSHOT,
881 (caddr_t)fsmountargs, 0, ctx);
882 } else {
883 error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx);
884 }
885
886 if (flags & MNT_UPDATE) {
887 if (mp->mnt_kern_flag & MNTK_WANTRDWR)
888 mp->mnt_flag &= ~MNT_RDONLY;
889 mp->mnt_flag &=~
890 (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
891 mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
892 if (error)
893 mp->mnt_flag = flag; /* restore flag value */
894 vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL);
895 lck_rw_done(&mp->mnt_rwlock);
896 is_rwlock_locked = FALSE;
897 if (!error)
898 enablequotas(mp, ctx);
899 goto exit;
900 }
901
902 /*
903 * Put the new filesystem on the mount list after root.
904 */
905 if (error == 0) {
906 struct vfs_attr vfsattr;
907 #if CONFIG_MACF
908 if (vfs_flags(mp) & MNT_MULTILABEL) {
909 error = VFS_ROOT(mp, &rvp, ctx);
910 if (error) {
911 printf("%s() VFS_ROOT returned %d\n", __func__, error);
912 goto out3;
913 }
914 error = vnode_label(mp, NULL, rvp, NULL, 0, ctx);
915 /*
916 * drop reference provided by VFS_ROOT
917 */
918 vnode_put(rvp);
919
920 if (error)
921 goto out3;
922 }
923 #endif /* MAC */
924
925 vnode_lock_spin(vp);
926 CLR(vp->v_flag, VMOUNT);
927 vp->v_mountedhere = mp;
928 vnode_unlock(vp);
929
930 /*
931 * taking the name_cache_lock exclusively will
932 * ensure that everyone is out of the fast path who
933 * might be trying to use a now stale copy of
934 * vp->v_mountedhere->mnt_realrootvp
935 * bumping mount_generation causes the cached values
936 * to be invalidated
937 */
938 name_cache_lock();
939 mount_generation++;
940 name_cache_unlock();
941
942 error = vnode_ref(vp);
943 if (error != 0) {
944 goto out4;
945 }
946
947 have_usecount = TRUE;
948
949 error = checkdirs(vp, ctx);
950 if (error != 0) {
951 /* Unmount the filesystem as cdir/rdirs cannot be updated */
952 goto out4;
953 }
954 /*
955 * there is no cleanup code here, so the return value is ignored (cast to void);
956 * we need to revisit this
957 */
958 (void)VFS_START(mp, 0, ctx);
959
960 if (mount_list_add(mp) != 0) {
961 /*
962 * The system is shutting down trying to umount
963 * everything, so fail with a plausible errno.
964 */
965 error = EBUSY;
966 goto out4;
967 }
968 lck_rw_done(&mp->mnt_rwlock);
969 is_rwlock_locked = FALSE;
970
971 /* Check if this mounted file system supports EAs or named streams. */
972 /* Skip WebDAV file systems for now since they hang in VFS_GETATTR here. */
973 VFSATTR_INIT(&vfsattr);
974 VFSATTR_WANTED(&vfsattr, f_capabilities);
975 if (strncmp(mp->mnt_vfsstat.f_fstypename, "webdav", sizeof("webdav")) != 0 &&
976 vfs_getattr(mp, &vfsattr, ctx) == 0 &&
977 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
978 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
979 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
980 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
981 }
982 #if NAMEDSTREAMS
983 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
984 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
985 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
986 }
987 #endif
988 /* Check if this file system supports path from id lookups. */
989 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
990 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
991 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
992 } else if (mp->mnt_flag & MNT_DOVOLFS) {
993 /* Legacy MNT_DOVOLFS flag also implies path from id lookups. */
994 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID;
995 }
996
997 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
998 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
999 mp->mnt_kern_flag |= MNTK_DIR_HARDLINKS;
1000 }
1001 }
1002 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1003 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS;
1004 }
1005 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1006 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT;
1007 }
1008 /* increment the operations count */
1009 OSAddAtomic(1, &vfs_nummntops);
1010 enablequotas(mp, ctx);
1011
1012 if (device_vnode) {
1013 device_vnode->v_specflags |= SI_MOUNTEDON;
1014
1015 /*
1016 * cache the IO attributes for the underlying physical media...
1017 * an error return indicates the underlying driver doesn't
1018 * support all the queries necessary... however, reasonable
1019 * defaults will have been set, so no reason to bail or care
1020 */
1021 vfs_init_io_attributes(device_vnode, mp);
1022 }
1023
1024 /* Now that mount is setup, notify the listeners */
1025 vfs_notify_mount(pvp);
1026 IOBSDMountChange(mp, kIOMountChangeMount);
1027
1028 } else {
1029 /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */
1030 if (mp->mnt_vnodelist.tqh_first != NULL) {
1031 panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.",
1032 mp->mnt_vtable->vfc_name, error);
1033 }
1034
1035 vnode_lock_spin(vp);
1036 CLR(vp->v_flag, VMOUNT);
1037 vnode_unlock(vp);
1038 mount_list_lock();
1039 mp->mnt_vtable->vfc_refcount--;
1040 mount_list_unlock();
1041
1042 if (device_vnode ) {
1043 vnode_rele(device_vnode);
1044 VNOP_CLOSE(device_vnode, ronly ? FREAD : FREAD|FWRITE, ctx);
1045 }
1046 lck_rw_done(&mp->mnt_rwlock);
1047 is_rwlock_locked = FALSE;
1048
1049 /*
1050 * if we get here, we have a mount structure that needs to be freed,
1051 * but since the coveredvp hasn't yet been updated to point at it,
1052 * no need to worry about other threads holding a crossref on this mp,
1053 * so it's OK to just free it
1054 */
1055 mount_lock_destroy(mp);
1056 #if CONFIG_MACF
1057 mac_mount_label_destroy(mp);
1058 #endif
1059 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1060 }
1061 exit:
1062 /*
1063 * drop I/O count on the device vp if there was one
1064 */
1065 if (devpath && devvp)
1066 vnode_put(devvp);
1067
1068 return(error);
1069
1070 /* Error condition exits */
1071 out4:
1072 (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx);
1073
1074 /*
1075 * If the mount has been placed on the covered vp,
1076 * it may have been discovered by now, so we have
1077 * to treat this just like an unmount
1078 */
1079 mount_lock_spin(mp);
1080 mp->mnt_lflag |= MNT_LDEAD;
1081 mount_unlock(mp);
1082
1083 if (device_vnode != NULLVP) {
1084 vnode_rele(device_vnode);
1085 VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
1086 ctx);
1087 did_rele = TRUE;
1088 }
1089
1090 vnode_lock_spin(vp);
1091
1092 mp->mnt_crossref++;
1093 vp->v_mountedhere = (mount_t) 0;
1094
1095 vnode_unlock(vp);
1096
1097 if (have_usecount) {
1098 vnode_rele(vp);
1099 }
1100 out3:
1101 if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele))
1102 vnode_rele(devvp);
1103 out2:
1104 if (devpath && devvp)
1105 vnode_put(devvp);
1106 out1:
1107 /* Release mnt_rwlock only when it was taken */
1108 if (is_rwlock_locked == TRUE) {
1109 lck_rw_done(&mp->mnt_rwlock);
1110 }
1111
1112 if (mntalloc) {
1113 if (mp->mnt_crossref)
1114 mount_dropcrossref(mp, vp, 0);
1115 else {
1116 mount_lock_destroy(mp);
1117 #if CONFIG_MACF
1118 mac_mount_label_destroy(mp);
1119 #endif
1120 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
1121 }
1122 }
1123 if (vfsp_ref) {
1124 mount_list_lock();
1125 vfsp->vfc_refcount--;
1126 mount_list_unlock();
1127 }
1128
1129 return(error);
1130 }
1131
1132 /*
1133 * Flush in-core data, check for competing mount attempts,
1134 * and set VMOUNT
1135 */
1136 int
1137 prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth)
1138 {
1139 #if !CONFIG_MACF
1140 #pragma unused(cnp,fsname)
1141 #endif
1142 struct vnode_attr va;
1143 int error;
1144
1145 if (!skip_auth) {
1146 /*
1147 * If the user is not root, ensure that they own the directory
1148 * onto which we are attempting to mount.
1149 */
1150 VATTR_INIT(&va);
1151 VATTR_WANTED(&va, va_uid);
1152 if ((error = vnode_getattr(vp, &va, ctx)) ||
1153 (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1154 (!vfs_context_issuser(ctx)))) {
1155 error = EPERM;
1156 goto out;
1157 }
1158 }
1159
1160 if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
1161 goto out;
1162
1163 if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
1164 goto out;
1165
1166 if (vp->v_type != VDIR) {
1167 error = ENOTDIR;
1168 goto out;
1169 }
1170
1171 if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
1172 error = EBUSY;
1173 goto out;
1174 }
1175
1176 #if CONFIG_MACF
1177 error = mac_mount_check_mount(ctx, vp,
1178 cnp, fsname);
1179 if (error != 0)
1180 goto out;
1181 #endif
1182
1183 vnode_lock_spin(vp);
1184 SET(vp->v_flag, VMOUNT);
1185 vnode_unlock(vp);
1186
1187 out:
1188 return error;
1189 }
1190
1191 #if CONFIG_IMGSRC_ACCESS
1192
1193 #if DEBUG
1194 #define IMGSRC_DEBUG(args...) printf(args)
1195 #else
1196 #define IMGSRC_DEBUG(args...) do { } while(0)
1197 #endif
1198
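/*
 * Look up 'devpath', verify it is a block device matching the dev_t that
 * already backs the mount, check the (non-root) caller's access to it, and
 * copy its path into f_mntfromname.  On success *devvpp is returned with
 * an iocount held; the caller is responsible for dropping it.
 */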
1199 static int
1200 authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
1201 {
1202 struct nameidata nd;
1203 vnode_t vp, realdevvp;
1204 mode_t accessmode;
1205 int error;
1206
1207 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
1208 if ( (error = namei(&nd)) ) {
1209 IMGSRC_DEBUG("namei() failed with %d\n", error);
1210 return error;
1211 }
1212
1213 vp = nd.ni_vp;
1214
1215 if (!vnode_isblk(vp)) {
1216 IMGSRC_DEBUG("Not block device.\n");
1217 error = ENOTBLK;
1218 goto out;
1219 }
1220
1221 realdevvp = mp->mnt_devvp;
1222 if (realdevvp == NULLVP) {
1223 IMGSRC_DEBUG("No device backs the mount.\n");
1224 error = ENXIO;
1225 goto out;
1226 }
1227
1228 error = vnode_getwithref(realdevvp);
1229 if (error != 0) {
1230 IMGSRC_DEBUG("Couldn't get iocount on device.\n");
1231 goto out;
1232 }
1233
1234 if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) {
1235 IMGSRC_DEBUG("Wrong dev_t.\n");
1236 error = ENXIO;
1237 goto out1;
1238 }
1239
1240 strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
1241
1242 /*
1243 * If mount by non-root, then verify that user has necessary
1244 * permissions on the device.
1245 */
1246 if (!vfs_context_issuser(ctx)) {
1247 accessmode = KAUTH_VNODE_READ_DATA;
1248 if ((mp->mnt_flag & MNT_RDONLY) == 0)
1249 accessmode |= KAUTH_VNODE_WRITE_DATA;
1250 if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) {
1251 IMGSRC_DEBUG("Access denied.\n");
1252 goto out1;
1253 }
1254 }
1255
1256 *devvpp = vp;
1257
1258 out1:
1259 vnode_put(realdevvp);
1260 out:
1261 nameidone(&nd);
1262 if (error) {
1263 vnode_put(vp);
1264 }
1265
1266 return error;
1267 }
1268
1269 /*
1270 * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
1271 * and call checkdirs()
1272 */
1273 static int
1274 place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
1275 {
1276 int error;
1277
1278 mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
1279
1280 vnode_lock_spin(vp);
1281 CLR(vp->v_flag, VMOUNT);
1282 vp->v_mountedhere = mp;
1283 vnode_unlock(vp);
1284
1285 /*
1286 * taking the name_cache_lock exclusively will
1287 * ensure that everyone is out of the fast path who
1288 * might be trying to use a now stale copy of
1289 * vp->v_mountedhere->mnt_realrootvp
1290 * bumping mount_generation causes the cached values
1291 * to be invalidated
1292 */
1293 name_cache_lock();
1294 mount_generation++;
1295 name_cache_unlock();
1296
1297 error = vnode_ref(vp);
1298 if (error != 0) {
1299 goto out;
1300 }
1301
1302 error = checkdirs(vp, ctx);
1303 if (error != 0) {
1304 /* Unmount the filesystem as cdir/rdirs cannot be updated */
1305 vnode_rele(vp);
1306 goto out;
1307 }
1308
1309 out:
1310 if (error != 0) {
1311 mp->mnt_vnodecovered = NULLVP;
1312 }
1313 return error;
1314 }
1315
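/*
 * Undo place_mount_and_checkdirs(): drop the usecount taken on the covered
 * vnode, clear its v_mountedhere, and detach mnt_vnodecovered.
 */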
1316 static void
1317 undo_place_on_covered_vp(mount_t mp, vnode_t vp)
1318 {
1319 vnode_rele(vp);
1320 vnode_lock_spin(vp);
1321 vp->v_mountedhere = (mount_t)NULL;
1322 vnode_unlock(vp);
1323
1324 mp->mnt_vnodecovered = NULLVP;
1325 }
1326
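/*
 * Take the mount's rwlock exclusive and authorize an update-style remount.
 * Fails if an unmount is in progress, if MNT_RELOAD is requested on a
 * read-write mount, or if the caller is neither root nor the original
 * mounter; on error the rwlock is not left held.
 */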
1327 static int
1328 mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
1329 {
1330 int error;
1331
1332 /* if an unmount is in progress, return an error */
1333 mount_lock_spin(mp);
1334 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1335 mount_unlock(mp);
1336 return EBUSY;
1337 }
1338 mount_unlock(mp);
1339 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1340
1341 /*
1342 * We only allow the filesystem to be reloaded if it
1343 * is currently mounted read-only.
1344 */
1345 if ((flags & MNT_RELOAD) &&
1346 ((mp->mnt_flag & MNT_RDONLY) == 0)) {
1347 error = ENOTSUP;
1348 goto out;
1349 }
1350
1351 /*
1352 * Only root, or the user that did the original mount is
1353 * permitted to update it.
1354 */
1355 if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
1356 (!vfs_context_issuser(ctx))) {
1357 error = EPERM;
1358 goto out;
1359 }
1360 #if CONFIG_MACF
1361 error = mac_mount_check_remount(ctx, mp);
1362 if (error != 0) {
1363 goto out;
1364 }
1365 #endif
1366
1367 out:
1368 if (error) {
1369 lck_rw_done(&mp->mnt_rwlock);
1370 }
1371
1372 return error;
1373 }
1374
1375 static void
1376 mount_end_update(mount_t mp)
1377 {
1378 lck_rw_done(&mp->mnt_rwlock);
1379 }
1380
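/*
 * Return the saved imageboot root vnode at the given nesting height with an
 * iocount held.  EINVAL if the height is out of range, ENOENT if no usable
 * root vnode is recorded at that height.
 */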
1381 static int
1382 get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp)
1383 {
1384 vnode_t vp;
1385
1386 if (height >= MAX_IMAGEBOOT_NESTING) {
1387 return EINVAL;
1388 }
1389
1390 vp = imgsrc_rootvnodes[height];
1391 if ((vp != NULLVP) && (vnode_get(vp) == 0)) {
1392 *rvpp = vp;
1393 return 0;
1394 } else {
1395 return ENOENT;
1396 }
1397 }
1398
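/*
 * Relocate a file system mounted at imageboot time onto the covered vnode
 * 'vp': revalidate the backing device (for local filesystems), place the
 * mount on 'vp', update the mount-on name, and add it to the mount list.
 * Only root may do this, and a given mount can be moved only once.
 */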
1399 static int
1400 relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp,
1401 const char *fsname, vfs_context_t ctx,
1402 boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index)
1403 {
1404 int error;
1405 mount_t mp;
1406 boolean_t placed = FALSE;
1407 vnode_t devvp = NULLVP;
1408 struct vfstable *vfsp;
1409 user_addr_t devpath;
1410 char *old_mntonname;
1411 vnode_t rvp;
1412 uint32_t height;
1413 uint32_t flags;
1414
1415 /* If we didn't imageboot, nothing to move */
1416 if (imgsrc_rootvnodes[0] == NULLVP) {
1417 return EINVAL;
1418 }
1419
1420 /* Only root can do this */
1421 if (!vfs_context_issuser(ctx)) {
1422 return EPERM;
1423 }
1424
1425 IMGSRC_DEBUG("looking for root vnode.\n");
1426
1427 /*
1428 * Get root vnode of filesystem we're moving.
1429 */
1430 if (by_index) {
1431 if (is64bit) {
1432 struct user64_mnt_imgsrc_args mia64;
1433 error = copyin(fsmountargs, &mia64, sizeof(mia64));
1434 if (error != 0) {
1435 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1436 return error;
1437 }
1438
1439 height = mia64.mi_height;
1440 flags = mia64.mi_flags;
1441 devpath = mia64.mi_devpath;
1442 } else {
1443 struct user32_mnt_imgsrc_args mia32;
1444 error = copyin(fsmountargs, &mia32, sizeof(mia32));
1445 if (error != 0) {
1446 IMGSRC_DEBUG("Failed to copy in arguments.\n");
1447 return error;
1448 }
1449
1450 height = mia32.mi_height;
1451 flags = mia32.mi_flags;
1452 devpath = mia32.mi_devpath;
1453 }
1454 } else {
1455 /*
1456 * For binary compatibility--assumes one level of nesting.
1457 */
1458 if (is64bit) {
1459 if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
1460 return error;
1461 } else {
1462 user32_addr_t tmp;
1463 if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
1464 return error;
1465
1466 /* munge into LP64 addr */
1467 devpath = CAST_USER_ADDR_T(tmp);
1468 }
1469
1470 height = 0;
1471 flags = 0;
1472 }
1473
1474 if (flags != 0) {
1475 IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__);
1476 return EINVAL;
1477 }
1478
1479 error = get_imgsrc_rootvnode(height, &rvp);
1480 if (error != 0) {
1481 IMGSRC_DEBUG("getting root vnode failed with %d\n", error);
1482 return error;
1483 }
1484
1485 IMGSRC_DEBUG("got root vnode.\n");
1486
1487 MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
1488
1489 /* Can only move once */
1490 mp = vnode_mount(rvp);
1491 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1492 IMGSRC_DEBUG("Already moved.\n");
1493 error = EBUSY;
1494 goto out0;
1495 }
1496
1497 IMGSRC_DEBUG("Starting update.\n");
1498
1499 /* Get exclusive rwlock on mount, authorize update on mp */
1500 error = mount_begin_update(mp, ctx, 0);
1501 if (error != 0) {
1502 IMGSRC_DEBUG("Starting update failed with %d\n", error);
1503 goto out0;
1504 }
1505
1506 /*
1507 * It can only be moved once. Flag is set under the rwlock,
1508 * so we're now safe to proceed.
1509 */
1510 if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
1511 IMGSRC_DEBUG("Already moved [2]\n");
1512 goto out1;
1513 }
1514
1515
1516 IMGSRC_DEBUG("Preparing coveredvp.\n");
1517
1518 /* Mark covered vnode as mount in progress, authorize placing mount on top */
1519 error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE);
1520 if (error != 0) {
1521 IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error);
1522 goto out1;
1523 }
1524
1525 IMGSRC_DEBUG("Covered vp OK.\n");
1526
1527 /* Sanity-check the name the caller has provided */
1528 vfsp = mp->mnt_vtable;
1529 if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
1530 IMGSRC_DEBUG("Wrong fs name.\n");
1531 error = EINVAL;
1532 goto out2;
1533 }
1534
1535 /* Check the device vnode and update mount-from name, for local filesystems */
1536 if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
1537 IMGSRC_DEBUG("Local, doing device validation.\n");
1538
1539 if (devpath != USER_ADDR_NULL) {
1540 error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
1541 if (error) {
1542 IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n");
1543 goto out2;
1544 }
1545
1546 vnode_put(devvp);
1547 }
1548 }
1549
1550 /*
1551 * Place mp on top of vnode, ref the vnode, call checkdirs(),
1552 * and increment the name cache's mount generation
1553 */
1554
1555 IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n");
1556 error = place_mount_and_checkdirs(mp, vp, ctx);
1557 if (error != 0) {
1558 goto out2;
1559 }
1560
1561 placed = TRUE;
1562
1563 strlcpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
1564 strlcpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
1565
1566 /* Forbid future moves */
1567 mount_lock(mp);
1568 mp->mnt_kern_flag |= MNTK_HAS_MOVED;
1569 mount_unlock(mp);
1570
1571 /* Finally, add to mount list, completely ready to go */
1572 if (mount_list_add(mp) != 0) {
1573 /*
1574 * The system is shutting down trying to umount
1575 * everything, so fail with a plausible errno.
1576 */
1577 error = EBUSY;
1578 goto out3;
1579 }
1580
1581 mount_end_update(mp);
1582 vnode_put(rvp);
1583 FREE(old_mntonname, M_TEMP);
1584
1585 vfs_notify_mount(pvp);
1586
1587 return 0;
1588 out3:
1589 strlcpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
1590
1591 mount_lock(mp);
1592 mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
1593 mount_unlock(mp);
1594
1595 out2:
1596 /*
1597 * Placing the mp on the vnode clears VMOUNT,
1598 * so cleanup is different after that point
1599 */
1600 if (placed) {
1601 /* Rele the vp, clear VMOUNT and v_mountedhere */
1602 undo_place_on_covered_vp(mp, vp);
1603 } else {
1604 vnode_lock_spin(vp);
1605 CLR(vp->v_flag, VMOUNT);
1606 vnode_unlock(vp);
1607 }
1608 out1:
1609 mount_end_update(mp);
1610
1611 out0:
1612 vnode_put(rvp);
1613 FREE(old_mntonname, M_TEMP);
1614 return error;
1615 }
1616
1617 #endif /* CONFIG_IMGSRC_ACCESS */
1618
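/*
 * Turn on disk quotas for an HFS mount after a (re)mount: for each quota
 * type whose quota-ops trigger file exists at the mount root, issue a
 * Q_QUOTAON quotactl for the corresponding quota file.  Errors are ignored.
 */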
1619 void
1620 enablequotas(struct mount *mp, vfs_context_t ctx)
1621 {
1622 struct nameidata qnd;
1623 int type;
1624 char qfpath[MAXPATHLEN];
1625 const char *qfname = QUOTAFILENAME;
1626 const char *qfopsname = QUOTAOPSNAME;
1627 const char *qfextension[] = INITQFNAMES;
1628
1629 /* XXX Should be an MNTK_ flag, instead of strncmp()'s */
1630 if (strncmp(mp->mnt_vfsstat.f_fstypename, "hfs", sizeof("hfs")) != 0 ) {
1631 return;
1632 }
1633 /*
1634 * Enable filesystem disk quotas if necessary.
1635 * We ignore errors, as this should not interfere with the final mount.
1636 */
1637 for (type=0; type < MAXQUOTAS; type++) {
1638 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]);
1639 NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE,
1640 CAST_USER_ADDR_T(qfpath), ctx);
1641 if (namei(&qnd) != 0)
1642 continue; /* option file to trigger quotas is not present */
1643 vnode_put(qnd.ni_vp);
1644 nameidone(&qnd);
1645 snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfname, qfextension[type]);
1646
1647 (void) VFS_QUOTACTL(mp, QCMD(Q_QUOTAON, type), 0, qfpath, ctx);
1648 }
1649 return;
1650 }
1651
1652
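/*
 * Per-process callback for checkdirs(): if the process's current or root
 * directory is the newly covered vnode (olddp), retarget it to the root of
 * the new mount (newdp), taking a reference on newdp and releasing the old
 * vnode's reference.
 */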
1653 static int
1654 checkdirs_callback(proc_t p, void * arg)
1655 {
1656 struct cdirargs * cdrp = (struct cdirargs * )arg;
1657 vnode_t olddp = cdrp->olddp;
1658 vnode_t newdp = cdrp->newdp;
1659 struct filedesc *fdp;
1660 vnode_t tvp;
1661 vnode_t fdp_cvp;
1662 vnode_t fdp_rvp;
1663 int cdir_changed = 0;
1664 int rdir_changed = 0;
1665
1666 /*
1667 * XXX Also needs to iterate each thread in the process to see if it
1668 * XXX is using a per-thread current working directory, and, if so,
1669 * XXX update that as well.
1670 */
1671
1672 proc_fdlock(p);
1673 fdp = p->p_fd;
1674 if (fdp == (struct filedesc *)0) {
1675 proc_fdunlock(p);
1676 return(PROC_RETURNED);
1677 }
1678 fdp_cvp = fdp->fd_cdir;
1679 fdp_rvp = fdp->fd_rdir;
1680 proc_fdunlock(p);
1681
1682 if (fdp_cvp == olddp) {
1683 vnode_ref(newdp);
1684 tvp = fdp->fd_cdir;
1685 fdp_cvp = newdp;
1686 cdir_changed = 1;
1687 vnode_rele(tvp);
1688 }
1689 if (fdp_rvp == olddp) {
1690 vnode_ref(newdp);
1691 tvp = fdp->fd_rdir;
1692 fdp_rvp = newdp;
1693 rdir_changed = 1;
1694 vnode_rele(tvp);
1695 }
1696 if (cdir_changed || rdir_changed) {
1697 proc_fdlock(p);
1698 fdp->fd_cdir = fdp_cvp;
1699 fdp->fd_rdir = fdp_rvp;
1700 proc_fdunlock(p);
1701 }
1702 return(PROC_RETURNED);
1703 }
1704
1705
1706
1707 /*
1708 * Scan all active processes to see if any of them have a current
1709 * or root directory onto which the new filesystem has just been
1710 * mounted. If so, replace them with the new mount point.
1711 */
1712 static int
1713 checkdirs(vnode_t olddp, vfs_context_t ctx)
1714 {
1715 vnode_t newdp;
1716 vnode_t tvp;
1717 int err;
1718 struct cdirargs cdr;
1719
1720 if (olddp->v_usecount == 1)
1721 return(0);
1722 err = VFS_ROOT(olddp->v_mountedhere, &newdp, ctx);
1723
1724 if (err != 0) {
1725 #if DIAGNOSTIC
1726 panic("mount: lost mount: error %d", err);
1727 #endif
1728 return(err);
1729 }
1730
1731 cdr.olddp = olddp;
1732 cdr.newdp = newdp;
1733 /* do not block for exec/fork trans as the vps for cwd & rootdir are not changing */
1734 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, checkdirs_callback, (void *)&cdr, NULL, NULL);
1735
1736 if (rootvnode == olddp) {
1737 vnode_ref(newdp);
1738 tvp = rootvnode;
1739 rootvnode = newdp;
1740 vnode_rele(tvp);
1741 }
1742
1743 vnode_put(newdp);
1744 return(0);
1745 }
1746
1747 /*
1748 * Unmount a file system.
1749 *
1750 * Note: unmount takes a path to the vnode mounted on as argument,
1751 * not the special file (as before).
1752 */
1753 /* ARGSUSED */
1754 int
1755 unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval)
1756 {
1757 vnode_t vp;
1758 struct mount *mp;
1759 int error;
1760 struct nameidata nd;
1761 vfs_context_t ctx = vfs_context_current();
1762
1763 NDINIT(&nd, LOOKUP, OP_UNMOUNT, FOLLOW | AUDITVNPATH1,
1764 UIO_USERSPACE, uap->path, ctx);
1765 error = namei(&nd);
1766 if (error)
1767 return (error);
1768 vp = nd.ni_vp;
1769 mp = vp->v_mount;
1770 nameidone(&nd);
1771
1772 #if CONFIG_MACF
1773 error = mac_mount_check_umount(ctx, mp);
1774 if (error != 0) {
1775 vnode_put(vp);
1776 return (error);
1777 }
1778 #endif
1779 /*
1780 * Must be the root of the filesystem
1781 */
1782 if ((vp->v_flag & VROOT) == 0) {
1783 vnode_put(vp);
1784 return (EINVAL);
1785 }
1786 mount_ref(mp, 0);
1787 vnode_put(vp);
1788 /* safedounmount consumes the mount ref */
1789 return (safedounmount(mp, uap->flags, ctx));
1790 }
1791
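/*
 * Unmount the file system identified by 'fsid': look the mount up by fsid,
 * take a mount ref, and pass it to safedounmount(), which consumes the ref.
 */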
1792 int
1793 vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx)
1794 {
1795 mount_t mp;
1796
1797 mp = mount_list_lookupby_fsid(fsid, 0, 1);
1798 if (mp == (mount_t)0) {
1799 return(ENOENT);
1800 }
1801 mount_ref(mp, 0);
1802 mount_iterdrop(mp);
1803 /* safedounmount consumes the mount ref */
1804 return(safedounmount(mp, flags, ctx));
1805 }
1806
1807
1808 /*
1809 * The mount struct comes with a mount ref which will be consumed.
1810 * Do the actual file system unmount, preventing some common foot-shooting.
1811 */
1812 int
1813 safedounmount(struct mount *mp, int flags, vfs_context_t ctx)
1814 {
1815 int error;
1816 proc_t p = vfs_context_proc(ctx);
1817
1818 /*
1819 * If the file system is not responding and MNT_NOBLOCK
1820 * is set and not a forced unmount then return EBUSY.
1821 */
1822 if ((mp->mnt_kern_flag & MNT_LNOTRESP) &&
1823 (flags & MNT_NOBLOCK) && ((flags & MNT_FORCE) == 0)) {
1824 error = EBUSY;
1825 goto out;
1826 }
1827
1828 /*
1829 * Skip authorization if the mount is tagged as permissive and
1830 * this is not a forced-unmount attempt.
1831 */
1832 if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) {
1833 /*
1834 * Only root, or the user that did the original mount is
1835 * permitted to unmount this filesystem.
1836 */
1837 if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) &&
1838 (error = suser(kauth_cred_get(), &p->p_acflag)))
1839 goto out;
1840 }
1841 /*
1842 * Don't allow unmounting the root file system.
1843 */
1844 if (mp->mnt_flag & MNT_ROOTFS) {
1845 error = EBUSY; /* the root is always busy */
1846 goto out;
1847 }
1848
1849 #ifdef CONFIG_IMGSRC_ACCESS
1850 if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
1851 error = EBUSY;
1852 goto out;
1853 }
1854 #endif /* CONFIG_IMGSRC_ACCESS */
1855
1856 return (dounmount(mp, flags, 1, ctx));
1857
1858 out:
1859 mount_drop(mp, 0);
1860 return(error);
1861 }
1862
1863 /*
1864 * Do the actual file system unmount.
1865 */
1866 int
1867 dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx)
1868 {
1869 vnode_t coveredvp = (vnode_t)0;
1870 int error;
1871 int needwakeup = 0;
1872 int forcedunmount = 0;
1873 int lflags = 0;
1874 struct vnode *devvp = NULLVP;
1875 #if CONFIG_TRIGGERS
1876 proc_t p = vfs_context_proc(ctx);
1877 int did_vflush = 0;
1878 int pflags_save = 0;
1879 #endif /* CONFIG_TRIGGERS */
1880
1881 #if CONFIG_FSE
1882 if (!(flags & MNT_FORCE)) {
1883 fsevent_unmount(mp, ctx); /* has to come first! */
1884 }
1885 #endif
1886
1887 mount_lock(mp);
1888
1889 /*
1890 * If an unmount is already in progress, just return EBUSY.
1891 * Even a forced unmount cannot override.
1892 */
1893 if (mp->mnt_lflag & MNT_LUNMOUNT) {
1894 if (withref != 0)
1895 mount_drop(mp, 1);
1896 mount_unlock(mp);
1897 return (EBUSY);
1898 }
1899
1900 if (flags & MNT_FORCE) {
1901 forcedunmount = 1;
1902 mp->mnt_lflag |= MNT_LFORCE;
1903 }
1904
1905 #if CONFIG_TRIGGERS
1906 if (flags & MNT_NOBLOCK && p != kernproc)
1907 pflags_save = OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
1908 #endif
1909
1910 mp->mnt_kern_flag |= MNTK_UNMOUNT;
1911 mp->mnt_lflag |= MNT_LUNMOUNT;
1912 mp->mnt_flag &=~ MNT_ASYNC;
1913 /*
1914 * anyone currently in the fast path that
1915 * trips over the cached rootvp will be
1916 * dumped out and forced into the slow path
1917 * to regenerate a new cached value
1918 */
1919 mp->mnt_realrootvp = NULLVP;
1920 mount_unlock(mp);
1921
1922 if (forcedunmount && (flags & MNT_LNOSUB) == 0) {
1923 /*
1924 * Force unmount any mounts in this filesystem.
1925 * If any unmounts fail - just leave them dangling.
1926 * Avoids recursion.
1927 */
1928 (void) dounmount_submounts(mp, flags | MNT_LNOSUB, ctx);
1929 }
1930
1931 /*
1932 * taking the name_cache_lock exclusively will
1933 * ensure that everyone is out of the fast path who
1934 * might be trying to use a now stale copy of
1935 * vp->v_mountedhere->mnt_realrootvp
1936 * bumping mount_generation causes the cached values
1937 * to be invalidated
1938 */
1939 name_cache_lock();
1940 mount_generation++;
1941 name_cache_unlock();
1942
1943
1944 lck_rw_lock_exclusive(&mp->mnt_rwlock);
1945 if (withref != 0)
1946 mount_drop(mp, 0);
1947 error = 0;
1948 if (forcedunmount == 0) {
1949 ubc_umount(mp); /* release cached vnodes */
1950 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1951 error = VFS_SYNC(mp, MNT_WAIT, ctx);
1952 if (error) {
1953 mount_lock(mp);
1954 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1955 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1956 mp->mnt_lflag &= ~MNT_LFORCE;
1957 goto out;
1958 }
1959 }
1960 }
1961
1962 IOBSDMountChange(mp, kIOMountChangeUnmount);
1963
1964 #if CONFIG_TRIGGERS
1965 vfs_nested_trigger_unmounts(mp, flags, ctx);
1966 did_vflush = 1;
1967 #endif
1968 if (forcedunmount)
1969 lflags |= FORCECLOSE;
1970 error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags);
1971 if ((forcedunmount == 0) && error) {
1972 mount_lock(mp);
1973 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1974 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1975 mp->mnt_lflag &= ~MNT_LFORCE;
1976 goto out;
1977 }
1978
1979 /* make sure no one is in the mount iterations or lookups */
1980 mount_iterdrain(mp);
1981
1982 error = VFS_UNMOUNT(mp, flags, ctx);
1983 if (error) {
1984 mount_iterreset(mp);
1985 mount_lock(mp);
1986 mp->mnt_kern_flag &= ~MNTK_UNMOUNT;
1987 mp->mnt_lflag &= ~MNT_LUNMOUNT;
1988 mp->mnt_lflag &= ~MNT_LFORCE;
1989 goto out;
1990 }
1991
1992 /* increment the operations count */
1993 if (!error)
1994 OSAddAtomic(1, &vfs_nummntops);
1995
1996 if ( mp->mnt_devvp && mp->mnt_vtable->vfc_vfsflags & VFC_VFSLOCALARGS) {
1997 /* hold an io reference and drop the usecount before close */
1998 devvp = mp->mnt_devvp;
1999 vnode_getalways(devvp);
2000 vnode_rele(devvp);
2001 VNOP_CLOSE(devvp, mp->mnt_flag & MNT_RDONLY ? FREAD : FREAD|FWRITE,
2002 ctx);
2003 vnode_clearmountedon(devvp);
2004 vnode_put(devvp);
2005 }
2006 lck_rw_done(&mp->mnt_rwlock);
2007 mount_list_remove(mp);
2008 lck_rw_lock_exclusive(&mp->mnt_rwlock);
2009
2010 /* mark the mount point hook in the vp but do not drop the ref yet */
2011 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
2012 /*
2013 * The covered vnode needs special handling. Trying to get an
2014 * iocount must not block here as this may lead to deadlocks
2015 * if the Filesystem to which the covered vnode belongs is
2016 * undergoing forced unmounts. Since we hold a usecount, the
2017 * vnode cannot be reused (it can, however, still be terminated)
2018 */
2019 vnode_getalways(coveredvp);
2020 vnode_lock_spin(coveredvp);
2021
2022 mp->mnt_crossref++;
2023 coveredvp->v_mountedhere = (struct mount *)0;
2024 CLR(coveredvp->v_flag, VMOUNT);
2025
2026 vnode_unlock(coveredvp);
2027 vnode_put(coveredvp);
2028 }
2029
2030 mount_list_lock();
2031 mp->mnt_vtable->vfc_refcount--;
2032 mount_list_unlock();
2033
2034 cache_purgevfs(mp); /* remove cache entries for this file sys */
2035 vfs_event_signal(NULL, VQ_UNMOUNT, (intptr_t)NULL);
2036 mount_lock(mp);
2037 mp->mnt_lflag |= MNT_LDEAD;
2038
2039 if (mp->mnt_lflag & MNT_LWAIT) {
2040 /*
2041 * do the wakeup here
2042 * in case we block in mount_refdrain
2043 * which will drop the mount lock
2044 * and allow anyone blocked in vfs_busy
2045 * to wakeup and see the LDEAD state
2046 */
2047 mp->mnt_lflag &= ~MNT_LWAIT;
2048 wakeup((caddr_t)mp);
2049 }
2050 mount_refdrain(mp);
2051 out:
2052 if (mp->mnt_lflag & MNT_LWAIT) {
2053 mp->mnt_lflag &= ~MNT_LWAIT;
2054 needwakeup = 1;
2055 }
2056
2057 #if CONFIG_TRIGGERS
2058 if (flags & MNT_NOBLOCK && p != kernproc) {
2059 // Restore P_NOREMOTEHANG bit to its previous value
2060 if ((pflags_save & P_NOREMOTEHANG) == 0)
2061 OSBitAndAtomic(~((uint32_t) P_NOREMOTEHANG), &p->p_flag);
2062 }
2063
2064 /*
2065 * Callback and context are set together under the mount lock, and
2066 * never cleared, so we're safe to examine them here, drop the lock,
2067 * and call out.
2068 */
2069 if (mp->mnt_triggercallback != NULL) {
2070 mount_unlock(mp);
2071 if (error == 0) {
2072 mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
2073 } else if (did_vflush) {
2074 mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
2075 }
2076 } else {
2077 mount_unlock(mp);
2078 }
2079 #else
2080 mount_unlock(mp);
2081 #endif /* CONFIG_TRIGGERS */
2082
2083 lck_rw_done(&mp->mnt_rwlock);
2084
2085 if (needwakeup)
2086 wakeup((caddr_t)mp);
2087
2088 if (!error) {
2089 if ((coveredvp != NULLVP)) {
2090 vnode_t pvp = NULLVP;
2091
2092 /*
2093 * The covered vnode needs special handling. Trying to
2094 * get an iocount must not block here as this may lead
2095 * to deadlocks if the Filesystem to which the covered
2096 * vnode belongs is undergoing forced unmounts. Since we
2097 * hold a usecount, the vnode cannot be reused
2098 * (it can, however, still be terminated).
2099 */
2100 vnode_getalways(coveredvp);
2101
2102 mount_dropcrossref(mp, coveredvp, 0);
2103 /*
2104 * We'll _try_ to detect if this really needs to be
2105 * done. The coveredvp can only be in termination (or
2106 * terminated) if the coveredvp's mount point is in a
2107 * forced unmount (or has been) since we still hold the
2108 * ref.
2109 */
2110 if (!vnode_isrecycled(coveredvp)) {
2111 pvp = vnode_getparent(coveredvp);
2112 #if CONFIG_TRIGGERS
2113 if (coveredvp->v_resolve) {
2114 vnode_trigger_rearm(coveredvp, ctx);
2115 }
2116 #endif
2117 }
2118
2119 vnode_rele(coveredvp);
2120 vnode_put(coveredvp);
2121 coveredvp = NULLVP;
2122
2123 if (pvp) {
2124 lock_vnode_and_post(pvp, NOTE_WRITE);
2125 vnode_put(pvp);
2126 }
2127 } else if (mp->mnt_flag & MNT_ROOTFS) {
2128 mount_lock_destroy(mp);
2129 #if CONFIG_MACF
2130 mac_mount_label_destroy(mp);
2131 #endif
2132 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2133 } else
2134 panic("dounmount: no coveredvp");
2135 }
2136 return (error);
2137 }
2138
2139 /*
2140 * Unmount any mounts in this filesystem.
2141 */
2142 void
2143 dounmount_submounts(struct mount *mp, int flags, vfs_context_t ctx)
2144 {
2145 mount_t smp;
2146 fsid_t *fsids, fsid;
2147 int fsids_sz;
2148 int count = 0, i, m = 0;
2149 vnode_t vp;
2150
2151 mount_list_lock();
2152
2153 // Get an array to hold the submount fsids.
2154 TAILQ_FOREACH(smp, &mountlist, mnt_list)
2155 count++;
2156 fsids_sz = count * sizeof(fsid_t);
2157 MALLOC(fsids, fsid_t *, fsids_sz, M_TEMP, M_NOWAIT);
2158 if (fsids == NULL) {
2159 mount_list_unlock();
2160 goto out;
2161 }
2162 fsids[0] = mp->mnt_vfsstat.f_fsid; // Prime the pump
2163
2164 /*
2165 * Fill the array with submount fsids.
2166 * Since mounts are always added to the tail of the mount list, the
2167 * list is always in mount order.
2168 * For each mount check if the mounted-on vnode belongs to a
2169 * mount that's already added to our array of mounts to be unmounted.
2170 */
2171 for (smp = TAILQ_NEXT(mp, mnt_list); smp; smp = TAILQ_NEXT(smp, mnt_list)) {
2172 vp = smp->mnt_vnodecovered;
2173 if (vp == NULL)
2174 continue;
2175 fsid = vnode_mount(vp)->mnt_vfsstat.f_fsid; // Underlying fsid
2176 for (i = 0; i <= m; i++) {
2177 if (fsids[i].val[0] == fsid.val[0] &&
2178 fsids[i].val[1] == fsid.val[1]) {
2179 fsids[++m] = smp->mnt_vfsstat.f_fsid;
2180 break;
2181 }
2182 }
2183 }
2184 mount_list_unlock();
2185
2186 // Unmount the submounts in reverse order. Ignore errors.
2187 for (i = m; i > 0; i--) {
2188 smp = mount_list_lookupby_fsid(&fsids[i], 0, 1);
2189 if (smp) {
2190 mount_ref(smp, 0);
2191 mount_iterdrop(smp);
2192 (void) dounmount(smp, flags, 1, ctx);
2193 }
2194 }
2195 out:
2196 if (fsids)
2197 FREE(fsids, M_TEMP);
2198 }
2199
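/*
 * Drop one covered-vnode cross reference on a mount. When the count
 * reaches zero and the vnode no longer points back at this mount, the
 * mount structure itself is torn down and freed.
 */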
2200 void
2201 mount_dropcrossref(mount_t mp, vnode_t dp, int need_put)
2202 {
2203 vnode_lock(dp);
2204 mp->mnt_crossref--;
2205
2206 if (mp->mnt_crossref < 0)
2207 panic("mount cross refs -ve");
2208
2209 if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) {
2210
2211 if (need_put)
2212 vnode_put_locked(dp);
2213 vnode_unlock(dp);
2214
2215 mount_lock_destroy(mp);
2216 #if CONFIG_MACF
2217 mac_mount_label_destroy(mp);
2218 #endif
2219 FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT);
2220 return;
2221 }
2222 if (need_put)
2223 vnode_put_locked(dp);
2224 vnode_unlock(dp);
2225 }
2226
2227
2228 /*
2229 * Sync each mounted filesystem.
2230 */
2231 #if DIAGNOSTIC
2232 int syncprt = 0;
2233 #endif
2234
2235 int print_vmpage_stat = 0;
2236 int sync_timeout = 60; // Sync time limit (sec)
2237
2238 static int
2239 sync_callback(mount_t mp, __unused void *arg)
2240 {
2241 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
2242 int asyncflag = mp->mnt_flag & MNT_ASYNC;
2243
2244 mp->mnt_flag &= ~MNT_ASYNC;
2245 VFS_SYNC(mp, arg ? MNT_WAIT : MNT_NOWAIT, vfs_context_kernel());
2246 if (asyncflag)
2247 mp->mnt_flag |= MNT_ASYNC;
2248 }
2249
2250 return (VFS_RETURNED);
2251 }
2252
2253 /* ARGSUSED */
2254 int
2255 sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval)
2256 {
2257 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2258
2259 if (print_vmpage_stat) {
2260 vm_countdirtypages();
2261 }
2262
2263 #if DIAGNOSTIC
2264 if (syncprt)
2265 vfs_bufstats();
2266 #endif /* DIAGNOSTIC */
2267 return 0;
2268 }
2269
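/*
 * Body of the helper thread started by sync_async(): sync every mounted
 * filesystem, then wake the caller that is sleeping on the timeout.
 */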
2270 static void
2271 sync_thread(void *arg, __unused wait_result_t wr)
2272 {
2273 int *timeout = (int *) arg;
2274
2275 vfs_iterate(LK_NOWAIT, sync_callback, NULL);
2276
2277 if (timeout)
2278 wakeup((caddr_t) timeout);
2279 if (print_vmpage_stat) {
2280 vm_countdirtypages();
2281 }
2282
2283 #if DIAGNOSTIC
2284 if (syncprt)
2285 vfs_bufstats();
2286 #endif /* DIAGNOSTIC */
2287 }
2288
2289 /*
2290 * Sync in a separate thread so we can time out if it blocks.
2291 */
2292 static int
2293 sync_async(int timeout)
2294 {
2295 thread_t thd;
2296 int error;
2297 struct timespec ts = {timeout, 0};
2298
2299 lck_mtx_lock(sync_mtx_lck);
2300 if (kernel_thread_start(sync_thread, &timeout, &thd) != KERN_SUCCESS) {
2301 printf("sync_thread failed\n");
2302 lck_mtx_unlock(sync_mtx_lck);
2303 return (0);
2304 }
2305
2306 error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "sync_thread", &ts);
2307 if (error) {
2308 printf("sync timed out: %d sec\n", timeout);
2309 }
2310 thread_deallocate(thd);
2311
2312 return (0);
2313 }
2314
2315 /*
2316 * An in-kernel sync for power management to call.
2317 */
2318 __private_extern__ int
2319 sync_internal(void)
2320 {
2321 (void) sync_async(sync_timeout);
2322
2323 return 0;
2324 } /* end of sync_internal call */
2325
2326 /*
2327 * Change filesystem quotas.
2328 */
2329 #if QUOTA
2330 int
2331 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
2332 {
2333 struct mount *mp;
2334 int error, quota_cmd, quota_status;
2335 caddr_t datap;
2336 size_t fnamelen;
2337 struct nameidata nd;
2338 vfs_context_t ctx = vfs_context_current();
2339 struct dqblk my_dqblk;
2340
2341 AUDIT_ARG(uid, uap->uid);
2342 AUDIT_ARG(cmd, uap->cmd);
2343 NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
2344 uap->path, ctx);
2345 error = namei(&nd);
2346 if (error)
2347 return (error);
2348 mp = nd.ni_vp->v_mount;
2349 vnode_put(nd.ni_vp);
2350 nameidone(&nd);
2351
2352 /* copyin any data we will need for downstream code */
2353 quota_cmd = uap->cmd >> SUBCMDSHIFT;
2354
2355 switch (quota_cmd) {
2356 case Q_QUOTAON:
2357 /* uap->arg specifies a file from which to take the quotas */
2358 fnamelen = MAXPATHLEN;
2359 datap = kalloc(MAXPATHLEN);
2360 error = copyinstr(uap->arg, datap, MAXPATHLEN, &fnamelen);
2361 break;
2362 case Q_GETQUOTA:
2363 /* uap->arg is a pointer to a dqblk structure. */
2364 datap = (caddr_t) &my_dqblk;
2365 break;
2366 case Q_SETQUOTA:
2367 case Q_SETUSE:
2368 /* uap->arg is a pointer to a dqblk structure. */
2369 datap = (caddr_t) &my_dqblk;
2370 if (proc_is64bit(p)) {
2371 struct user_dqblk my_dqblk64;
2372 error = copyin(uap->arg, (caddr_t)&my_dqblk64, sizeof (my_dqblk64));
2373 if (error == 0) {
2374 munge_dqblk(&my_dqblk, &my_dqblk64, FALSE);
2375 }
2376 }
2377 else {
2378 error = copyin(uap->arg, (caddr_t)&my_dqblk, sizeof (my_dqblk));
2379 }
2380 break;
2381 case Q_QUOTASTAT:
2382 /* uap->arg is a pointer to an integer */
2383 datap = (caddr_t) &quota_status;
2384 break;
2385 default:
2386 datap = NULL;
2387 break;
2388 } /* switch */
2389
2390 if (error == 0) {
2391 error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, datap, ctx);
2392 }
2393
2394 switch (quota_cmd) {
2395 case Q_QUOTAON:
2396 if (datap != NULL)
2397 kfree(datap, MAXPATHLEN);
2398 break;
2399 case Q_GETQUOTA:
2400 /* uap->arg is a pointer to a dqblk structure we need to copy out to */
2401 if (error == 0) {
2402 if (proc_is64bit(p)) {
2403 struct user_dqblk my_dqblk64 = {.dqb_bhardlimit = 0};
2404 munge_dqblk(&my_dqblk, &my_dqblk64, TRUE);
2405 error = copyout((caddr_t)&my_dqblk64, uap->arg, sizeof (my_dqblk64));
2406 }
2407 else {
2408 error = copyout(datap, uap->arg, sizeof (struct dqblk));
2409 }
2410 }
2411 break;
2412 case Q_QUOTASTAT:
2413 /* uap->arg is a pointer to an integer */
2414 if (error == 0) {
2415 error = copyout(datap, uap->arg, sizeof(quota_status));
2416 }
2417 break;
2418 default:
2419 break;
2420 } /* switch */
2421
2422 return (error);
2423 }
2424 #else
2425 int
2426 quotactl(__unused proc_t p, __unused struct quotactl_args *uap, __unused int32_t *retval)
2427 {
2428 return (EOPNOTSUPP);
2429 }
2430 #endif /* QUOTA */
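/*
 * A minimal userland sketch of a quotactl(2) query handled by the code
 * above, assuming quotas are enabled on the volume mounted at "/".
 * QCMD() packs the command and quota type, mirroring the SUBCMDSHIFT
 * decoding done in the kernel handler.
 */
#if 0	/* illustrative userland sketch; not compiled with the kernel */
#include <sys/types.h>
#include <sys/quota.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct dqblk dq;

	if (quotactl("/", QCMD(Q_GETQUOTA, USRQUOTA), getuid(),
	    (caddr_t)&dq) == -1) {
		perror("quotactl");	/* commonly fails when quotas are off */
		return 1;
	}
	printf("hard limit: %llu bytes\n",
	    (unsigned long long)dq.dqb_bhardlimit);
	return 0;
}
#endif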
2431
2432 /*
2433 * Get filesystem statistics.
2434 *
2435 * Returns: 0 Success
2436 * namei:???
2437 * vfs_update_vfsstat:???
2438 * munge_statfs:EFAULT
2439 */
2440 /* ARGSUSED */
2441 int
2442 statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval)
2443 {
2444 struct mount *mp;
2445 struct vfsstatfs *sp;
2446 int error;
2447 struct nameidata nd;
2448 vfs_context_t ctx = vfs_context_current();
2449 vnode_t vp;
2450
2451 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2452 UIO_USERSPACE, uap->path, ctx);
2453 error = namei(&nd);
2454 if (error != 0)
2455 return (error);
2456 vp = nd.ni_vp;
2457 mp = vp->v_mount;
2458 sp = &mp->mnt_vfsstat;
2459 nameidone(&nd);
2460
2461 #if CONFIG_MACF
2462 error = mac_mount_check_stat(ctx, mp);
2463 if (error != 0)
2464 return (error);
2465 #endif
2466
2467 error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT);
2468 if (error != 0) {
2469 vnode_put(vp);
2470 return (error);
2471 }
2472
2473 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2474 vnode_put(vp);
2475 return (error);
2476 }
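/*
 * A minimal userland sketch of statfs(2), which the handler above
 * implements: look up the path, refresh the per-mount vfsstat cache and
 * copy the result out. The root volume is used as the example path.
 */
#if 0	/* illustrative userland sketch; not compiled with the kernel */
#include <sys/param.h>
#include <sys/mount.h>
#include <stdio.h>

int
main(void)
{
	struct statfs sfs;

	if (statfs("/", &sfs) == -1) {
		perror("statfs");
		return 1;
	}
	printf("%s on %s: %llu of %llu blocks free\n",
	    sfs.f_mntfromname, sfs.f_mntonname,
	    (unsigned long long)sfs.f_bfree,
	    (unsigned long long)sfs.f_blocks);
	return 0;
}
#endif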
2477
2478 /*
2479 * Get filesystem statistics.
2480 */
2481 /* ARGSUSED */
2482 int
2483 fstatfs(__unused proc_t p, struct fstatfs_args *uap, __unused int32_t *retval)
2484 {
2485 vnode_t vp;
2486 struct mount *mp;
2487 struct vfsstatfs *sp;
2488 int error;
2489
2490 AUDIT_ARG(fd, uap->fd);
2491
2492 if ( (error = file_vnode(uap->fd, &vp)) )
2493 return (error);
2494
2495 error = vnode_getwithref(vp);
2496 if (error) {
2497 file_drop(uap->fd);
2498 return (error);
2499 }
2500
2501 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2502
2503 mp = vp->v_mount;
2504 if (!mp) {
2505 error = EBADF;
2506 goto out;
2507 }
2508
2509 #if CONFIG_MACF
2510 error = mac_mount_check_stat(vfs_context_current(), mp);
2511 if (error != 0)
2512 goto out;
2513 #endif
2514
2515 sp = &mp->mnt_vfsstat;
2516 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2517 goto out;
2518 }
2519
2520 error = munge_statfs(mp, sp, uap->buf, NULL, IS_64BIT_PROCESS(p), TRUE);
2521
2522 out:
2523 file_drop(uap->fd);
2524 vnode_put(vp);
2525
2526 return (error);
2527 }
2528
2529 /*
2530 * Common routine to handle copying of statfs64 data to user space
2531 */
2532 static int
2533 statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp)
2534 {
2535 int error;
2536 struct statfs64 sfs;
2537
2538 bzero(&sfs, sizeof(sfs));
2539
2540 sfs.f_bsize = sfsp->f_bsize;
2541 sfs.f_iosize = (int32_t)sfsp->f_iosize;
2542 sfs.f_blocks = sfsp->f_blocks;
2543 sfs.f_bfree = sfsp->f_bfree;
2544 sfs.f_bavail = sfsp->f_bavail;
2545 sfs.f_files = sfsp->f_files;
2546 sfs.f_ffree = sfsp->f_ffree;
2547 sfs.f_fsid = sfsp->f_fsid;
2548 sfs.f_owner = sfsp->f_owner;
2549 sfs.f_type = mp->mnt_vtable->vfc_typenum;
2550 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
2551 sfs.f_fssubtype = sfsp->f_fssubtype;
2552 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
2553 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN);
2554 } else {
2555 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN);
2556 }
2557 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN);
2558 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN);
2559
2560 error = copyout((caddr_t)&sfs, bufp, sizeof(sfs));
2561
2562 return(error);
2563 }
2564
2565 /*
2566 * Get file system statistics in 64-bit mode
2567 */
2568 int
2569 statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *retval)
2570 {
2571 struct mount *mp;
2572 struct vfsstatfs *sp;
2573 int error;
2574 struct nameidata nd;
2575 vfs_context_t ctxp = vfs_context_current();
2576 vnode_t vp;
2577
2578 NDINIT(&nd, LOOKUP, OP_STATFS, FOLLOW | AUDITVNPATH1,
2579 UIO_USERSPACE, uap->path, ctxp);
2580 error = namei(&nd);
2581 if (error != 0)
2582 return (error);
2583 vp = nd.ni_vp;
2584 mp = vp->v_mount;
2585 sp = &mp->mnt_vfsstat;
2586 nameidone(&nd);
2587
2588 #if CONFIG_MACF
2589 error = mac_mount_check_stat(ctxp, mp);
2590 if (error != 0)
2591 return (error);
2592 #endif
2593
2594 error = vfs_update_vfsstat(mp, ctxp, VFS_USER_EVENT);
2595 if (error != 0) {
2596 vnode_put(vp);
2597 return (error);
2598 }
2599
2600 error = statfs64_common(mp, sp, uap->buf);
2601 vnode_put(vp);
2602
2603 return (error);
2604 }
2605
2606 /*
2607 * Get file system statistics in 64-bit mode
2608 */
2609 int
2610 fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t *retval)
2611 {
2612 struct vnode *vp;
2613 struct mount *mp;
2614 struct vfsstatfs *sp;
2615 int error;
2616
2617 AUDIT_ARG(fd, uap->fd);
2618
2619 if ( (error = file_vnode(uap->fd, &vp)) )
2620 return (error);
2621
2622 error = vnode_getwithref(vp);
2623 if (error) {
2624 file_drop(uap->fd);
2625 return (error);
2626 }
2627
2628 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
2629
2630 mp = vp->v_mount;
2631 if (!mp) {
2632 error = EBADF;
2633 goto out;
2634 }
2635
2636 #if CONFIG_MACF
2637 error = mac_mount_check_stat(vfs_context_current(), mp);
2638 if (error != 0)
2639 goto out;
2640 #endif
2641
2642 sp = &mp->mnt_vfsstat;
2643 if ((error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT)) != 0) {
2644 goto out;
2645 }
2646
2647 error = statfs64_common(mp, sp, uap->buf);
2648
2649 out:
2650 file_drop(uap->fd);
2651 vnode_put(vp);
2652
2653 return (error);
2654 }
2655
2656 struct getfsstat_struct {
2657 user_addr_t sfsp;
2658 user_addr_t *mp;
2659 int count;
2660 int maxcount;
2661 int flags;
2662 int error;
2663 };
2664
2665
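/*
 * Per-mount callback for getfsstat(2)/__mac_getfsstat(2): optionally
 * refresh the cached vfsstat, copy one (32- or 64-bit process sized)
 * statfs record out to user space and advance the user buffer pointer.
 */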
2666 static int
2667 getfsstat_callback(mount_t mp, void * arg)
2668 {
2669
2670 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2671 struct vfsstatfs *sp;
2672 int error, my_size;
2673 vfs_context_t ctx = vfs_context_current();
2674
2675 if (fstp->sfsp && fstp->count < fstp->maxcount) {
2676 #if CONFIG_MACF
2677 error = mac_mount_check_stat(ctx, mp);
2678 if (error != 0) {
2679 fstp->error = error;
2680 return(VFS_RETURNED_DONE);
2681 }
2682 #endif
2683 sp = &mp->mnt_vfsstat;
2684 /*
2685 * If MNT_NOWAIT is specified, do not refresh the
2686 * fsstat cache. MNT_WAIT/MNT_DWAIT overrides MNT_NOWAIT.
2687 */
2688 if (((fstp->flags & MNT_NOWAIT) == 0 || (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2689 (error = vfs_update_vfsstat(mp, ctx,
2690 VFS_USER_EVENT))) {
2691 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2692 return(VFS_RETURNED);
2693 }
2694
2695 /*
2696 * Need to handle LP64 version of struct statfs
2697 */
2698 error = munge_statfs(mp, sp, fstp->sfsp, &my_size, IS_64BIT_PROCESS(vfs_context_proc(ctx)), FALSE);
2699 if (error) {
2700 fstp->error = error;
2701 return(VFS_RETURNED_DONE);
2702 }
2703 fstp->sfsp += my_size;
2704
2705 if (fstp->mp) {
2706 #if CONFIG_MACF
2707 error = mac_mount_label_get(mp, *fstp->mp);
2708 if (error) {
2709 fstp->error = error;
2710 return(VFS_RETURNED_DONE);
2711 }
2712 #endif
2713 fstp->mp++;
2714 }
2715 }
2716 fstp->count++;
2717 return(VFS_RETURNED);
2718 }
2719
2720 /*
2721 * Get statistics on all filesystems.
2722 */
2723 int
2724 getfsstat(__unused proc_t p, struct getfsstat_args *uap, int *retval)
2725 {
2726 struct __mac_getfsstat_args muap;
2727
2728 muap.buf = uap->buf;
2729 muap.bufsize = uap->bufsize;
2730 muap.mac = USER_ADDR_NULL;
2731 muap.macsize = 0;
2732 muap.flags = uap->flags;
2733
2734 return (__mac_getfsstat(p, &muap, retval));
2735 }
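/*
 * A minimal userland sketch of getfsstat(2), whose kernel side is the
 * wrapper above plus getfsstat_callback(): a NULL buffer returns only the
 * mount count, and MNT_NOWAIT skips refreshing the per-mount statistics.
 */
#if 0	/* illustrative userland sketch; not compiled with the kernel */
#include <sys/param.h>
#include <sys/mount.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct statfs *buf;
	int i, n;

	n = getfsstat(NULL, 0, MNT_NOWAIT);	/* just count the mounts */
	if (n <= 0)
		return 1;
	buf = calloc((size_t)n, sizeof(*buf));
	if (buf == NULL)
		return 1;
	n = getfsstat(buf, n * (int)sizeof(*buf), MNT_NOWAIT);
	for (i = 0; i < n; i++)
		printf("%-16s %s\n", buf[i].f_fstypename, buf[i].f_mntonname);
	free(buf);
	return 0;
}
#endif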
2736
2737 /*
2738 * __mac_getfsstat: Get MAC-related file system statistics
2739 *
2740 * Parameters: p (ignored)
2741 * uap User argument descriptor (see below)
2742 * retval Count of file system statistics (N stats)
2743 *
2744 * Indirect: uap->bufsize Buffer size
2745 * uap->macsize MAC info size
2746 * uap->buf Buffer where information will be returned
2747 * uap->mac MAC info
2748 * uap->flags File system flags
2749 *
2750 *
2751 * Returns: 0 Success
2752 * !0 Not success
2753 *
2754 */
2755 int
2756 __mac_getfsstat(__unused proc_t p, struct __mac_getfsstat_args *uap, int *retval)
2757 {
2758 user_addr_t sfsp;
2759 user_addr_t *mp;
2760 size_t count, maxcount, bufsize, macsize;
2761 struct getfsstat_struct fst;
2762
2763 bufsize = (size_t) uap->bufsize;
2764 macsize = (size_t) uap->macsize;
2765
2766 if (IS_64BIT_PROCESS(p)) {
2767 maxcount = bufsize / sizeof(struct user64_statfs);
2768 }
2769 else {
2770 maxcount = bufsize / sizeof(struct user32_statfs);
2771 }
2772 sfsp = uap->buf;
2773 count = 0;
2774
2775 mp = NULL;
2776
2777 #if CONFIG_MACF
2778 if (uap->mac != USER_ADDR_NULL) {
2779 u_int32_t *mp0;
2780 int error;
2781 unsigned int i;
2782
2783 count = (macsize / (IS_64BIT_PROCESS(p) ? 8 : 4));
2784 if (count != maxcount)
2785 return (EINVAL);
2786
2787 /* Copy in the array */
2788 MALLOC(mp0, u_int32_t *, macsize, M_MACTEMP, M_WAITOK);
2789 if (mp0 == NULL) {
2790 return (ENOMEM);
2791 }
2792
2793 error = copyin(uap->mac, mp0, macsize);
2794 if (error) {
2795 FREE(mp0, M_MACTEMP);
2796 return (error);
2797 }
2798
2799 /* Normalize to an array of user_addr_t */
2800 MALLOC(mp, user_addr_t *, count * sizeof(user_addr_t), M_MACTEMP, M_WAITOK);
2801 if (mp == NULL) {
2802 FREE(mp0, M_MACTEMP);
2803 return (ENOMEM);
2804 }
2805
2806 for (i = 0; i < count; i++) {
2807 if (IS_64BIT_PROCESS(p))
2808 mp[i] = ((user_addr_t *)mp0)[i];
2809 else
2810 mp[i] = (user_addr_t)mp0[i];
2811 }
2812 FREE(mp0, M_MACTEMP);
2813 }
2814 #endif
2815
2816
2817 fst.sfsp = sfsp;
2818 fst.mp = mp;
2819 fst.flags = uap->flags;
2820 fst.count = 0;
2821 fst.error = 0;
2822 fst.maxcount = maxcount;
2823
2824
2825 vfs_iterate(0, getfsstat_callback, &fst);
2826
2827 if (mp)
2828 FREE(mp, M_MACTEMP);
2829
2830 if (fst.error ) {
2831 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2832 return(fst.error);
2833 }
2834
2835 if (fst.sfsp && fst.count > fst.maxcount)
2836 *retval = fst.maxcount;
2837 else
2838 *retval = fst.count;
2839 return (0);
2840 }
2841
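/*
 * Per-mount callback for getfsstat64(2): same flow as getfsstat_callback()
 * but always copies out a fixed-size struct statfs64.
 */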
2842 static int
2843 getfsstat64_callback(mount_t mp, void * arg)
2844 {
2845 struct getfsstat_struct *fstp = (struct getfsstat_struct *)arg;
2846 struct vfsstatfs *sp;
2847 int error;
2848
2849 if (fstp->sfsp && fstp->count < fstp->maxcount) {
2850 #if CONFIG_MACF
2851 error = mac_mount_check_stat(vfs_context_current(), mp);
2852 if (error != 0) {
2853 fstp->error = error;
2854 return(VFS_RETURNED_DONE);
2855 }
2856 #endif
2857 sp = &mp->mnt_vfsstat;
2858 /*
2859 * If MNT_NOWAIT is specified, do not refresh the fsstat
2860 * cache. MNT_WAIT overrides MNT_NOWAIT.
2861 *
2862 * We treat MNT_DWAIT as MNT_WAIT for all instances of
2863 * getfsstat, since the constants are out of the same
2864 * namespace.
2865 */
2866 if (((fstp->flags & MNT_NOWAIT) == 0 ||
2867 (fstp->flags & (MNT_WAIT | MNT_DWAIT))) &&
2868 (error = vfs_update_vfsstat(mp, vfs_context_current(), VFS_USER_EVENT))) {
2869 KAUTH_DEBUG("vfs_update_vfsstat returned %d", error);
2870 return(VFS_RETURNED);
2871 }
2872
2873 error = statfs64_common(mp, sp, fstp->sfsp);
2874 if (error) {
2875 fstp->error = error;
2876 return(VFS_RETURNED_DONE);
2877 }
2878 fstp->sfsp += sizeof(struct statfs64);
2879 }
2880 fstp->count++;
2881 return(VFS_RETURNED);
2882 }
2883
2884 /*
2885 * Get statistics on all file systems in 64 bit mode.
2886 */
2887 int
2888 getfsstat64(__unused proc_t p, struct getfsstat64_args *uap, int *retval)
2889 {
2890 user_addr_t sfsp;
2891 int count, maxcount;
2892 struct getfsstat_struct fst;
2893
2894 maxcount = uap->bufsize / sizeof(struct statfs64);
2895
2896 sfsp = uap->buf;
2897 count = 0;
2898
2899 fst.sfsp = sfsp;
2900 fst.flags = uap->flags;
2901 fst.count = 0;
2902 fst.error = 0;
2903 fst.maxcount = maxcount;
2904
2905 vfs_iterate(0, getfsstat64_callback, &fst);
2906
2907 if (fst.error ) {
2908 KAUTH_DEBUG("ERROR - %s gets %d", p->p_comm, fst.error);
2909 return(fst.error);
2910 }
2911
2912 if (fst.sfsp && fst.count > fst.maxcount)
2913 *retval = fst.maxcount;
2914 else
2915 *retval = fst.count;
2916
2917 return (0);
2918 }
2919
2920 /*
2921 * Get the vnode associated with the file descriptor passed
2922 * as input.
2923 *
2924 * INPUT
2925 * ctx - vfs context of caller
2926 * fd - file descriptor for which vnode is required.
2927 * vpp - Pointer to pointer to vnode to be returned.
2928 *
2929 * The vnode is returned with an iocount so any vnode obtained
2930 * by this call needs a vnode_put
2931 *
2932 */
2933 int
2934 vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp)
2935 {
2936 int error;
2937 vnode_t vp;
2938 struct fileproc *fp;
2939 proc_t p = vfs_context_proc(ctx);
2940
2941 *vpp = NULLVP;
2942
2943 error = fp_getfvp(p, fd, &fp, &vp);
2944 if (error)
2945 return (error);
2946
2947 error = vnode_getwithref(vp);
2948 if (error) {
2949 (void)fp_drop(p, fd, fp, 0);
2950 return (error);
2951 }
2952
2953 (void)fp_drop(p, fd, fp, 0);
2954 *vpp = vp;
2955 return (error);
2956 }
2957
2958 /*
2959 * Wrapper function around namei to start lookup from a directory
2960 * specified by a file descriptor ni_dirfd.
2961 *
2962 * In addition to all the errors returned by namei, this call can
2963 * return ENOTDIR if the file descriptor does not refer to a directory,
2964 * and EBADF if the file descriptor is not valid.
2965 */
2966 int
2967 nameiat(struct nameidata *ndp, int dirfd)
2968 {
2969 if ((dirfd != AT_FDCWD) &&
2970 !(ndp->ni_flag & NAMEI_CONTLOOKUP) &&
2971 !(ndp->ni_cnd.cn_flags & USEDVP)) {
2972 int error = 0;
2973 char c;
2974
2975 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
2976 error = copyin(ndp->ni_dirp, &c, sizeof(char));
2977 if (error)
2978 return (error);
2979 } else {
2980 c = *((char *)(ndp->ni_dirp));
2981 }
2982
2983 if (c != '/') {
2984 vnode_t dvp_at;
2985
2986 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
2987 &dvp_at);
2988 if (error)
2989 return (error);
2990
2991 if (vnode_vtype(dvp_at) != VDIR) {
2992 vnode_put(dvp_at);
2993 return (ENOTDIR);
2994 }
2995
2996 ndp->ni_dvp = dvp_at;
2997 ndp->ni_cnd.cn_flags |= USEDVP;
2998 error = namei(ndp);
2999 ndp->ni_cnd.cn_flags &= ~USEDVP;
3000 vnode_put(dvp_at);
3001 return (error);
3002 }
3003 }
3004
3005 return (namei(ndp));
3006 }
3007
3008 /*
3009 * Change current working directory to a given file descriptor.
3010 */
3011 /* ARGSUSED */
3012 static int
3013 common_fchdir(proc_t p, struct fchdir_args *uap, int per_thread)
3014 {
3015 struct filedesc *fdp = p->p_fd;
3016 vnode_t vp;
3017 vnode_t tdp;
3018 vnode_t tvp;
3019 struct mount *mp;
3020 int error;
3021 vfs_context_t ctx = vfs_context_current();
3022
3023 AUDIT_ARG(fd, uap->fd);
3024 if (per_thread && uap->fd == -1) {
3025 /*
3026 * Switching back from per-thread to per-process CWD; verify that we
3027 * in fact have one before proceeding. The only success case
3028 * for this code path is to return 0 preemptively after zapping
3029 * the thread structure contents.
3030 */
3031 thread_t th = vfs_context_thread(ctx);
3032 if (th) {
3033 uthread_t uth = get_bsdthread_info(th);
3034 tvp = uth->uu_cdir;
3035 uth->uu_cdir = NULLVP;
3036 if (tvp != NULLVP) {
3037 vnode_rele(tvp);
3038 return (0);
3039 }
3040 }
3041 return (EBADF);
3042 }
3043
3044 if ( (error = file_vnode(uap->fd, &vp)) )
3045 return(error);
3046 if ( (error = vnode_getwithref(vp)) ) {
3047 file_drop(uap->fd);
3048 return(error);
3049 }
3050
3051 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
3052
3053 if (vp->v_type != VDIR) {
3054 error = ENOTDIR;
3055 goto out;
3056 }
3057
3058 #if CONFIG_MACF
3059 error = mac_vnode_check_chdir(ctx, vp);
3060 if (error)
3061 goto out;
3062 #endif
3063 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3064 if (error)
3065 goto out;
3066
3067 while (!error && (mp = vp->v_mountedhere) != NULL) {
3068 if (vfs_busy(mp, LK_NOWAIT)) {
3069 error = EACCES;
3070 goto out;
3071 }
3072 error = VFS_ROOT(mp, &tdp, ctx);
3073 vfs_unbusy(mp);
3074 if (error)
3075 break;
3076 vnode_put(vp);
3077 vp = tdp;
3078 }
3079 if (error)
3080 goto out;
3081 if ( (error = vnode_ref(vp)) )
3082 goto out;
3083 vnode_put(vp);
3084
3085 if (per_thread) {
3086 thread_t th = vfs_context_thread(ctx);
3087 if (th) {
3088 uthread_t uth = get_bsdthread_info(th);
3089 tvp = uth->uu_cdir;
3090 uth->uu_cdir = vp;
3091 OSBitOrAtomic(P_THCWD, &p->p_flag);
3092 } else {
3093 vnode_rele(vp);
3094 return (ENOENT);
3095 }
3096 } else {
3097 proc_fdlock(p);
3098 tvp = fdp->fd_cdir;
3099 fdp->fd_cdir = vp;
3100 proc_fdunlock(p);
3101 }
3102
3103 if (tvp)
3104 vnode_rele(tvp);
3105 file_drop(uap->fd);
3106
3107 return (0);
3108 out:
3109 vnode_put(vp);
3110 file_drop(uap->fd);
3111
3112 return(error);
3113 }
3114
3115 int
3116 fchdir(proc_t p, struct fchdir_args *uap, __unused int32_t *retval)
3117 {
3118 return common_fchdir(p, uap, 0);
3119 }
3120
3121 int
3122 __pthread_fchdir(proc_t p, struct __pthread_fchdir_args *uap, __unused int32_t *retval)
3123 {
3124 return common_fchdir(p, (void *)uap, 1);
3125 }
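/*
 * A minimal userland sketch of fchdir(2), handled by common_fchdir()
 * above: the descriptor must reference a directory, otherwise the call
 * fails with ENOTDIR. "/tmp" is an assumed example directory.
 */
#if 0	/* illustrative userland sketch; not compiled with the kernel */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tmp", O_RDONLY);

	if (fd == -1 || fchdir(fd) == -1) {
		perror("fchdir");
		return 1;
	}
	close(fd);	/* the working directory stays changed */
	return 0;
}
#endif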
3126
3127 /*
3128 * Change current working directory (".").
3129 *
3130 * Returns: 0 Success
3131 * change_dir:ENOTDIR
3132 * change_dir:???
3133 * vnode_ref:ENOENT No such file or directory
3134 */
3135 /* ARGSUSED */
3136 static int
3137 common_chdir(proc_t p, struct chdir_args *uap, int per_thread)
3138 {
3139 struct filedesc *fdp = p->p_fd;
3140 int error;
3141 struct nameidata nd;
3142 vnode_t tvp;
3143 vfs_context_t ctx = vfs_context_current();
3144
3145 NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1,
3146 UIO_USERSPACE, uap->path, ctx);
3147 error = change_dir(&nd, ctx);
3148 if (error)
3149 return (error);
3150 if ( (error = vnode_ref(nd.ni_vp)) ) {
3151 vnode_put(nd.ni_vp);
3152 return (error);
3153 }
3154 /*
3155 * drop the iocount we picked up in change_dir
3156 */
3157 vnode_put(nd.ni_vp);
3158
3159 if (per_thread) {
3160 thread_t th = vfs_context_thread(ctx);
3161 if (th) {
3162 uthread_t uth = get_bsdthread_info(th);
3163 tvp = uth->uu_cdir;
3164 uth->uu_cdir = nd.ni_vp;
3165 OSBitOrAtomic(P_THCWD, &p->p_flag);
3166 } else {
3167 vnode_rele(nd.ni_vp);
3168 return (ENOENT);
3169 }
3170 } else {
3171 proc_fdlock(p);
3172 tvp = fdp->fd_cdir;
3173 fdp->fd_cdir = nd.ni_vp;
3174 proc_fdunlock(p);
3175 }
3176
3177 if (tvp)
3178 vnode_rele(tvp);
3179
3180 return (0);
3181 }
3182
3183
3184 /*
3185 * chdir
3186 *
3187 * Change current working directory (".") for the entire process
3188 *
3189 * Parameters: p Process requesting the call
3190 * uap User argument descriptor (see below)
3191 * retval (ignored)
3192 *
3193 * Indirect parameters: uap->path Directory path
3194 *
3195 * Returns: 0 Success
3196 * common_chdir: ENOTDIR
3197 * common_chdir: ENOENT No such file or directory
3198 * common_chdir: ???
3199 *
3200 */
3201 int
3202 chdir(proc_t p, struct chdir_args *uap, __unused int32_t *retval)
3203 {
3204 return common_chdir(p, (void *)uap, 0);
3205 }
3206
3207 /*
3208 * __pthread_chdir
3209 *
3210 * Change current working directory (".") for a single thread
3211 *
3212 * Parameters: p Process requesting the call
3213 * uap User argument descriptor (see below)
3214 * retval (ignored)
3215 *
3216 * Indirect parameters: uap->path Directory path
3217 *
3218 * Returns: 0 Success
3219 * common_chdir: ENOTDIR
3220 * common_chdir: ENOENT No such file or directory
3221 * common_chdir: ???
3222 *
3223 */
3224 int
3225 __pthread_chdir(proc_t p, struct __pthread_chdir_args *uap, __unused int32_t *retval)
3226 {
3227 return common_chdir(p, (void *)uap, 1);
3228 }
3229
3230
3231 /*
3232 * Change notion of root (``/'') directory.
3233 */
3234 /* ARGSUSED */
3235 int
3236 chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval)
3237 {
3238 struct filedesc *fdp = p->p_fd;
3239 int error;
3240 struct nameidata nd;
3241 vnode_t tvp;
3242 vfs_context_t ctx = vfs_context_current();
3243
3244 if ((error = suser(kauth_cred_get(), &p->p_acflag)))
3245 return (error);
3246
3247 NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1,
3248 UIO_USERSPACE, uap->path, ctx);
3249 error = change_dir(&nd, ctx);
3250 if (error)
3251 return (error);
3252
3253 #if CONFIG_MACF
3254 error = mac_vnode_check_chroot(ctx, nd.ni_vp,
3255 &nd.ni_cnd);
3256 if (error) {
3257 vnode_put(nd.ni_vp);
3258 return (error);
3259 }
3260 #endif
3261
3262 if ( (error = vnode_ref(nd.ni_vp)) ) {
3263 vnode_put(nd.ni_vp);
3264 return (error);
3265 }
3266 vnode_put(nd.ni_vp);
3267
3268 proc_fdlock(p);
3269 tvp = fdp->fd_rdir;
3270 fdp->fd_rdir = nd.ni_vp;
3271 fdp->fd_flags |= FD_CHROOT;
3272 proc_fdunlock(p);
3273
3274 if (tvp != NULL)
3275 vnode_rele(tvp);
3276
3277 return (0);
3278 }
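/*
 * A minimal userland sketch of chroot(2), handled above: it requires
 * superuser privileges and is conventionally followed by chdir("/") so
 * the old working directory cannot be used to escape. The path is an
 * assumed placeholder.
 */
#if 0	/* illustrative userland sketch; not compiled with the kernel */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	if (chroot("/private/var/empty") == -1 || chdir("/") == -1) {
		perror("chroot");	/* EPERM when not run as root */
		return 1;
	}
	/* All further path lookups are relative to the new root. */
	return 0;
}
#endif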
3279
3280 /*
3281 * Common routine for chroot and chdir.
3282 *
3283 * Returns: 0 Success
3284 * ENOTDIR Not a directory
3285 * namei:??? [anything namei can return]
3286 * vnode_authorize:??? [anything vnode_authorize can return]
3287 */
3288 static int
3289 change_dir(struct nameidata *ndp, vfs_context_t ctx)
3290 {
3291 vnode_t vp;
3292 int error;
3293
3294 if ((error = namei(ndp)))
3295 return (error);
3296 nameidone(ndp);
3297 vp = ndp->ni_vp;
3298
3299 if (vp->v_type != VDIR) {
3300 vnode_put(vp);
3301 return (ENOTDIR);
3302 }
3303
3304 #if CONFIG_MACF
3305 error = mac_vnode_check_chdir(ctx, vp);
3306 if (error) {
3307 vnode_put(vp);
3308 return (error);
3309 }
3310 #endif
3311
3312 error = vnode_authorize(vp, NULL, KAUTH_VNODE_SEARCH, ctx);
3313 if (error) {
3314 vnode_put(vp);
3315 return (error);
3316 }
3317
3318 return (error);
3319 }
3320
3321 /*
3322 * Allocate the per-fd vnode data (for directories) associated with the file glob.
3323 */
3324 struct fd_vn_data *
3325 fg_vn_data_alloc(void)
3326 {
3327 struct fd_vn_data *fvdata;
3328
3329 /* Allocate per fd vnode data */
3330 MALLOC(fvdata, struct fd_vn_data *, (sizeof(struct fd_vn_data)),
3331 M_FD_VN_DATA, M_WAITOK | M_ZERO);
3332 lck_mtx_init(&fvdata->fv_lock, fd_vn_lck_grp, fd_vn_lck_attr);
3333 return fvdata;
3334 }
3335
3336 /*
3337 * Free the vnode data (for directories) associated with the file glob.
3338 */
3339 void
3340 fg_vn_data_free(void *fgvndata)
3341 {
3342 struct fd_vn_data *fvdata = (struct fd_vn_data *)fgvndata;
3343
3344 if (fvdata->fv_buf)
3345 FREE(fvdata->fv_buf, M_FD_DIRBUF);
3346 lck_mtx_destroy(&fvdata->fv_lock, fd_vn_lck_grp);
3347 FREE(fvdata, M_FD_VN_DATA);
3348 }
3349
3350 /*
3351 * Check permissions, allocate an open file structure,
3352 * and call the device open routine if any.
3353 *
3354 * Returns: 0 Success
3355 * EINVAL
3356 * EINTR
3357 * falloc:ENFILE
3358 * falloc:EMFILE
3359 * falloc:ENOMEM
3360 * vn_open_auth:???
3361 * dupfdopen:???
3362 * VNOP_ADVLOCK:???
3363 * vnode_setsize:???
3364 *
3365 * XXX Need to implement uid, gid
3366 */
3367 int
3368 open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3369 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra,
3370 int32_t *retval)
3371 {
3372 proc_t p = vfs_context_proc(ctx);
3373 uthread_t uu = get_bsdthread_info(vfs_context_thread(ctx));
3374 struct fileproc *fp;
3375 vnode_t vp;
3376 int flags, oflags;
3377 int type, indx, error;
3378 struct flock lf;
3379 struct vfs_context context;
3380
3381 oflags = uflags;
3382
3383 if ((oflags & O_ACCMODE) == O_ACCMODE)
3384 return(EINVAL);
3385
3386 flags = FFLAGS(uflags);
3387 CLR(flags, FENCRYPTED);
3388 CLR(flags, FUNENCRYPTED);
3389
3390 AUDIT_ARG(fflags, oflags);
3391 AUDIT_ARG(mode, vap->va_mode);
3392
3393 if ((error = falloc_withalloc(p,
3394 &fp, &indx, ctx, fp_zalloc, cra)) != 0) {
3395 return (error);
3396 }
3397 uu->uu_dupfd = -indx - 1;
3398
3399 if ((error = vn_open_auth(ndp, &flags, vap))) {
3400 if ((error == ENODEV || error == ENXIO) && (uu->uu_dupfd >= 0)){ /* XXX from fdopen */
3401 if ((error = dupfdopen(p->p_fd, indx, uu->uu_dupfd, flags, error)) == 0) {
3402 fp_drop(p, indx, NULL, 0);
3403 *retval = indx;
3404 return (0);
3405 }
3406 }
3407 if (error == ERESTART)
3408 error = EINTR;
3409 fp_free(p, indx, fp);
3410 return (error);
3411 }
3412 uu->uu_dupfd = 0;
3413 vp = ndp->ni_vp;
3414
3415 fp->f_fglob->fg_flag = flags & (FMASK | O_EVTONLY | FENCRYPTED | FUNENCRYPTED);
3416 fp->f_fglob->fg_ops = &vnops;
3417 fp->f_fglob->fg_data = (caddr_t)vp;
3418
3419 if (flags & (O_EXLOCK | O_SHLOCK)) {
3420 lf.l_whence = SEEK_SET;
3421 lf.l_start = 0;
3422 lf.l_len = 0;
3423 if (flags & O_EXLOCK)
3424 lf.l_type = F_WRLCK;
3425 else
3426 lf.l_type = F_RDLCK;
3427 type = F_FLOCK;
3428 if ((flags & FNONBLOCK) == 0)
3429 type |= F_WAIT;
3430 #if CONFIG_MACF
3431 error = mac_file_check_lock(vfs_context_ucred(ctx), fp->f_fglob,
3432 F_SETLK, &lf);
3433 if (error)
3434 goto bad;
3435 #endif
3436 if ((error = VNOP_ADVLOCK(vp, (caddr_t)fp->f_fglob, F_SETLK, &lf, type, ctx, NULL)))
3437 goto bad;
3438 fp->f_fglob->fg_flag |= FHASLOCK;
3439 }
3440
3441 /* try to truncate by setting the size attribute */
3442 if ((flags & O_TRUNC) && ((error = vnode_setsize(vp, (off_t)0, 0, ctx)) != 0))
3443 goto bad;
3444
3445 /*
3446 * For directories we hold some additional information in the fd.
3447 */
3448 if (vnode_vtype(vp) == VDIR) {
3449 fp->f_fglob->fg_vn_data = fg_vn_data_alloc();
3450 } else {
3451 fp->f_fglob->fg_vn_data = NULL;
3452 }
3453
3454 vnode_put(vp);
3455
3456 /*
3457 * The first terminal open (without O_NOCTTY) by a session leader
3458 * results in it being set as the controlling terminal.
3459 */
3460 if (vnode_istty(vp) && !(p->p_flag & P_CONTROLT) &&
3461 !(flags & O_NOCTTY)) {
3462 int tmp = 0;
3463
3464 (void)(*fp->f_fglob->fg_ops->fo_ioctl)(fp, (int)TIOCSCTTY,
3465 (caddr_t)&tmp, ctx);
3466 }
3467
3468 proc_fdlock(p);
3469 if (flags & O_CLOEXEC)
3470 *fdflags(p, indx) |= UF_EXCLOSE;
3471 if (flags & O_CLOFORK)
3472 *fdflags(p, indx) |= UF_FORKCLOSE;
3473 procfdtbl_releasefd(p, indx, NULL);
3474
3475 #if CONFIG_SECLUDED_MEMORY
3476 if (secluded_for_filecache &&
3477 FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE &&
3478 vnode_vtype(vp) == VREG) {
3479 memory_object_control_t moc;
3480
3481 moc = ubc_getobject(vp, UBC_FLAGS_NONE);
3482
3483 if (moc == MEMORY_OBJECT_CONTROL_NULL) {
3484 /* nothing to do... */
3485 } else if (fp->f_fglob->fg_flag & FWRITE) {
3486 /* writable -> no longer eligible for secluded pages */
3487 memory_object_mark_eligible_for_secluded(moc,
3488 FALSE);
3489 } else if (secluded_for_filecache == 1) {
3490 char pathname[32] = { 0, };
3491 size_t copied;
3492 /* XXX FBDP: better way to detect /Applications/ ? */
3493 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3494 copyinstr(ndp->ni_dirp,
3495 pathname,
3496 sizeof (pathname),
3497 &copied);
3498 } else {
3499 copystr(CAST_DOWN(void *, ndp->ni_dirp),
3500 pathname,
3501 sizeof (pathname),
3502 &copied);
3503 }
3504 pathname[sizeof (pathname) - 1] = '\0';
3505 if (strncmp(pathname,
3506 "/Applications/",
3507 strlen("/Applications/")) == 0 &&
3508 strncmp(pathname,
3509 "/Applications/Camera.app/",
3510 strlen("/Applications/Camera.app/")) != 0) {
3511 /*
3512 * not writable
3513 * AND from "/Applications/"
3514 * AND not from "/Applications/Camera.app/"
3515 * ==> eligible for secluded
3516 */
3517 memory_object_mark_eligible_for_secluded(moc,
3518 TRUE);
3519 }
3520 } else if (secluded_for_filecache == 2) {
3521 /* not implemented... */
3522 if (!strncmp(vp->v_name,
3523 DYLD_SHARED_CACHE_NAME,
3524 strlen(DYLD_SHARED_CACHE_NAME)) ||
3525 !strncmp(vp->v_name,
3526 "dyld",
3527 strlen(vp->v_name)) ||
3528 !strncmp(vp->v_name,
3529 "launchd",
3530 strlen(vp->v_name)) ||
3531 !strncmp(vp->v_name,
3532 "Camera",
3533 strlen(vp->v_name)) ||
3534 !strncmp(vp->v_name,
3535 "mediaserverd",
3536 strlen(vp->v_name))) {
3537 /*
3538 * This file matters when launching Camera:
3539 * do not store its contents in the secluded
3540 * pool that will be drained on Camera launch.
3541 */
3542 memory_object_mark_eligible_for_secluded(moc,
3543 FALSE);
3544 }
3545 }
3546 }
3547 #endif /* CONFIG_SECLUDED_MEMORY */
3548
3549 fp_drop(p, indx, fp, 1);
3550 proc_fdunlock(p);
3551
3552 *retval = indx;
3553
3554 return (0);
3555 bad:
3556 context = *vfs_context_current();
3557 context.vc_ucred = fp->f_fglob->fg_cred;
3558
3559 if ((fp->f_fglob->fg_flag & FHASLOCK) &&
3560 (FILEGLOB_DTYPE(fp->f_fglob) == DTYPE_VNODE)) {
3561 lf.l_whence = SEEK_SET;
3562 lf.l_start = 0;
3563 lf.l_len = 0;
3564 lf.l_type = F_UNLCK;
3565
3566 (void)VNOP_ADVLOCK(
3567 vp, (caddr_t)fp->f_fglob, F_UNLCK, &lf, F_FLOCK, ctx, NULL);
3568 }
3569
3570 vn_close(vp, fp->f_fglob->fg_flag, &context);
3571 vnode_put(vp);
3572 fp_free(p, indx, fp);
3573
3574 return (error);
3575 }
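/*
 * A minimal userland sketch of the open(2) flags open1() handles above:
 * O_EXLOCK takes an advisory exclusive lock at open time (FHASLOCK),
 * O_NONBLOCK turns a contended lock into an error instead of a wait, and
 * O_CLOEXEC sets UF_EXCLOSE on the new descriptor. The path is an assumed
 * placeholder.
 */
#if 0	/* illustrative userland sketch; not compiled with the kernel */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tmp/example.lock",
	    O_CREAT | O_RDWR | O_EXLOCK | O_NONBLOCK | O_CLOEXEC, 0644);

	if (fd == -1) {
		perror("open");		/* EAGAIN if someone else holds the lock */
		return 1;
	}
	/* ... exclusive access while the descriptor stays open ... */
	close(fd);
	return 0;
}
#endif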
3576
3577 /*
3578 * While most of the *at syscall handlers can call nameiat(), which is
3579 * a wrapper around namei, open1 is different: its nameidata is set up
3580 * here but namei itself is not called until vn_open_auth. So we simply
3581 * do here what nameiat() would have done.
3583 */
3584 static int
3585 open1at(vfs_context_t ctx, struct nameidata *ndp, int uflags,
3586 struct vnode_attr *vap, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval,
3587 int dirfd)
3588 {
3589 if ((dirfd != AT_FDCWD) && !(ndp->ni_cnd.cn_flags & USEDVP)) {
3590 int error;
3591 char c;
3592
3593 if (UIO_SEG_IS_USER_SPACE(ndp->ni_segflg)) {
3594 error = copyin(ndp->ni_dirp, &c, sizeof(char));
3595 if (error)
3596 return (error);
3597 } else {
3598 c = *((char *)(ndp->ni_dirp));
3599 }
3600
3601 if (c != '/') {
3602 vnode_t dvp_at;
3603
3604 error = vnode_getfromfd(ndp->ni_cnd.cn_context, dirfd,
3605 &dvp_at);
3606 if (error)
3607 return (error);
3608
3609 if (vnode_vtype(dvp_at) != VDIR) {
3610 vnode_put(dvp_at);
3611 return (ENOTDIR);
3612 }
3613
3614 ndp->ni_dvp = dvp_at;
3615 ndp->ni_cnd.cn_flags |= USEDVP;
3616 error = open1(ctx, ndp, uflags, vap, fp_zalloc, cra,
3617 retval);
3618 vnode_put(dvp_at);
3619 return (error);
3620 }
3621 }
3622
3623 return (open1(ctx, ndp, uflags, vap, fp_zalloc, cra, retval));
3624 }
3625
3626 /*
3627 * open_extended: open a file given a path name; with extended argument list (including extended security (ACL)).
3628 *
3629 * Parameters: p Process requesting the open
3630 * uap User argument descriptor (see below)
3631 * retval Pointer to an area to receive the
3632 * return value from the system call
3633 *
3634 * Indirect: uap->path Path to open (same as 'open')
3635 * uap->flags Flags to open (same as 'open')
3636 * uap->uid UID to set, if creating
3637 * uap->gid GID to set, if creating
3638 * uap->mode File mode, if creating (same as 'open')
3639 * uap->xsecurity ACL to set, if creating
3640 *
3641 * Returns: 0 Success
3642 * !0 errno value
3643 *
3644 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
3645 *
3646 * XXX: We should enumerate the possible errno values here, and where
3647 * in the code they originated.
3648 */
3649 int
3650 open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval)
3651 {
3652 struct filedesc *fdp = p->p_fd;
3653 int ciferror;
3654 kauth_filesec_t xsecdst;
3655 struct vnode_attr va;
3656 struct nameidata nd;
3657 int cmode;
3658
3659 AUDIT_ARG(owner, uap->uid, uap->gid);
3660
3661 xsecdst = NULL;
3662 if ((uap->xsecurity != USER_ADDR_NULL) &&
3663 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
3664 return ciferror;
3665
3666 VATTR_INIT(&va);
3667 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3668 VATTR_SET(&va, va_mode, cmode);
3669 if (uap->uid != KAUTH_UID_NONE)
3670 VATTR_SET(&va, va_uid, uap->uid);
3671 if (uap->gid != KAUTH_GID_NONE)
3672 VATTR_SET(&va, va_gid, uap->gid);
3673 if (xsecdst != NULL)
3674 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
3675
3676 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3677 uap->path, vfs_context_current());
3678
3679 ciferror = open1(vfs_context_current(), &nd, uap->flags, &va,
3680 fileproc_alloc_init, NULL, retval);
3681 if (xsecdst != NULL)
3682 kauth_filesec_free(xsecdst);
3683
3684 return ciferror;
3685 }
3686
3687 /*
3688 * Go through the data-protected atomically controlled open (2)
3689 *
3690 * int open_dprotected_np(user_addr_t path, int flags, int class, int dpflags, int mode)
3691 */
3692 int open_dprotected_np (proc_t p, struct open_dprotected_np_args *uap, int32_t *retval) {
3693 int flags = uap->flags;
3694 int class = uap->class;
3695 int dpflags = uap->dpflags;
3696
3697 /*
3698 * Follow the same path as normal open(2)
3699 * Look up the item if it exists, and acquire the vnode.
3700 */
3701 struct filedesc *fdp = p->p_fd;
3702 struct vnode_attr va;
3703 struct nameidata nd;
3704 int cmode;
3705 int error;
3706
3707 VATTR_INIT(&va);
3708 /* Mask off all but regular access permissions */
3709 cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3710 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3711
3712 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
3713 uap->path, vfs_context_current());
3714
3715 /*
3716 * Initialize the extra fields in vnode_attr to pass down our
3717 * extra fields.
3718 * 1. target cprotect class.
3719 * 2. set a flag to mark it as requiring open-raw-encrypted semantics.
3720 */
3721 if (flags & O_CREAT) {
3722 /* lower level kernel code validates that the class is valid before applying it. */
3723 if (class != PROTECTION_CLASS_DEFAULT) {
3724 /*
3725 * PROTECTION_CLASS_DEFAULT implies that we make the class for this
3726 * file behave the same as open (2)
3727 */
3728 VATTR_SET(&va, va_dataprotect_class, class);
3729 }
3730 }
3731
3732 if (dpflags & (O_DP_GETRAWENCRYPTED|O_DP_GETRAWUNENCRYPTED)) {
3733 if ( flags & (O_RDWR | O_WRONLY)) {
3734 /* Not allowed to write raw encrypted bytes */
3735 return EINVAL;
3736 }
3737 if (uap->dpflags & O_DP_GETRAWENCRYPTED) {
3738 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWENCRYPTED);
3739 }
3740 if (uap->dpflags & O_DP_GETRAWUNENCRYPTED) {
3741 VATTR_SET(&va, va_dataprotect_flags, VA_DP_RAWUNENCRYPTED);
3742 }
3743 }
3744
3745 error = open1(vfs_context_current(), &nd, uap->flags, &va,
3746 fileproc_alloc_init, NULL, retval);
3747
3748 return error;
3749 }
3750
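/*
 * Common back end for open(2), openat(2) and openbyid_np(2): set up the
 * vnode_attr and nameidata for the lookup, then hand off to open1at(),
 * which honours the directory descriptor for relative paths.
 */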
3751 static int
3752 openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode,
3753 int fd, enum uio_seg segflg, int *retval)
3754 {
3755 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
3756 struct vnode_attr va;
3757 struct nameidata nd;
3758 int cmode;
3759
3760 VATTR_INIT(&va);
3761 /* Mask off all but regular access permissions */
3762 cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
3763 VATTR_SET(&va, va_mode, cmode & ACCESSPERMS);
3764
3765 NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1,
3766 segflg, path, ctx);
3767
3768 return (open1at(ctx, &nd, flags, &va, fileproc_alloc_init, NULL,
3769 retval, fd));
3770 }
3771
3772 int
3773 open(proc_t p, struct open_args *uap, int32_t *retval)
3774 {
3775 __pthread_testcancel(1);
3776 return(open_nocancel(p, (struct open_nocancel_args *)uap, retval));
3777 }
3778
3779 int
3780 open_nocancel(__unused proc_t p, struct open_nocancel_args *uap,
3781 int32_t *retval)
3782 {
3783 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3784 uap->mode, AT_FDCWD, UIO_USERSPACE, retval));
3785 }
3786
3787 int
3788 openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap,
3789 int32_t *retval)
3790 {
3791 return (openat_internal(vfs_context_current(), uap->path, uap->flags,
3792 uap->mode, uap->fd, UIO_USERSPACE, retval));
3793 }
3794
3795 int
3796 openat(proc_t p, struct openat_args *uap, int32_t *retval)
3797 {
3798 __pthread_testcancel(1);
3799 return(openat_nocancel(p, (struct openat_nocancel_args *)uap, retval));
3800 }
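/*
 * A minimal userland sketch of openat(2), serviced by openat_internal()
 * and open1at() above: a relative path is resolved against the directory
 * descriptor instead of the current working directory, while AT_FDCWD
 * restores plain open(2) behaviour. Paths are assumed placeholders.
 */
#if 0	/* illustrative userland sketch; not compiled with the kernel */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int dirfd, fd;

	dirfd = open("/tmp", O_RDONLY);
	if (dirfd == -1)
		return 1;
	fd = openat(dirfd, "example.txt", O_CREAT | O_RDWR, 0644);
	if (fd == -1) {
		perror("openat");
		close(dirfd);
		return 1;
	}
	close(fd);
	close(dirfd);
	return 0;
}
#endif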
3801
3802 /*
3803 * openbyid_np: open a file given a file system id and a file system object id.
3804 * For HFS the file system object id is an fsobj_id_t {uint32, uint32}; for
3805 * file systems that don't support object ids it is a node id (uint64_t).
3806 *
3807 * Parameters: p Process requesting the open
3808 * uap User argument descriptor (see below)
3809 * retval Pointer to an area to receive the
3810 * return value from the system call
3811 *
3812 * Indirect: uap->path Path to open (same as 'open')
3813 *
3814 * uap->fsid id of target file system
3815 * uap->objid id of target file system object
3816 * uap->flags Flags to open (same as 'open')
3817 *
3818 * Returns: 0 Success
3819 * !0 errno value
3820 *
3821 *
3822 * XXX: We should enumerate the possible errno values here, and where
3823 * in the code they originated.
3824 */
3825 int
3826 openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval)
3827 {
3828 fsid_t fsid;
3829 uint64_t objid;
3830 int error;
3831 char *buf = NULL;
3832 int buflen = MAXPATHLEN;
3833 int pathlen = 0;
3834 vfs_context_t ctx = vfs_context_current();
3835
3836 if ((error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_OPEN_BY_ID, 0))) {
3837 return (error);
3838 }
3839
3840 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
3841 return (error);
3842 }
3843
3844 /* uap->objid is an fsobj_id_t defined as struct {uint32_t, uint32_t} */
3845 if ((error = copyin(uap->objid, (caddr_t)&objid, sizeof(uint64_t)))) {
3846 return (error);
3847 }
3848
3849 AUDIT_ARG(value32, fsid.val[0]);
3850 AUDIT_ARG(value64, objid);
3851
3852 /* resolve path from fsid, objid */
3853 do {
3854 MALLOC(buf, char *, buflen + 1, M_TEMP, M_WAITOK);
3855 if (buf == NULL) {
3856 return (ENOMEM);
3857 }
3858
3859 error = fsgetpath_internal(
3860 ctx, fsid.val[0], objid,
3861 buflen, buf, &pathlen);
3862
3863 if (error) {
3864 FREE(buf, M_TEMP);
3865 buf = NULL;
3866 }
3867 } while (error == ENOSPC && (buflen += MAXPATHLEN));
3868
3869 if (error) {
3870 return error;
3871 }
3872
3873 buf[pathlen] = 0;
3874
3875 error = openat_internal(
3876 ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval);
3877
3878 FREE(buf, M_TEMP);
3879
3880 return error;
3881 }
3882
3883
3884 /*
3885 * Create a special file.
3886 */
3887 static int mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap);
3888
3889 int
3890 mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval)
3891 {
3892 struct vnode_attr va;
3893 vfs_context_t ctx = vfs_context_current();
3894 int error;
3895 struct nameidata nd;
3896 vnode_t vp, dvp;
3897
3898 VATTR_INIT(&va);
3899 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
3900 VATTR_SET(&va, va_rdev, uap->dev);
3901
3902 /* If it's a mknod() of a FIFO, call mkfifo1() instead */
3903 if ((uap->mode & S_IFMT) == S_IFIFO)
3904 return(mkfifo1(ctx, uap->path, &va));
3905
3906 AUDIT_ARG(mode, uap->mode);
3907 AUDIT_ARG(value32, uap->dev);
3908
3909 if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
3910 return (error);
3911 NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1,
3912 UIO_USERSPACE, uap->path, ctx);
3913 error = namei(&nd);
3914 if (error)
3915 return (error);
3916 dvp = nd.ni_dvp;
3917 vp = nd.ni_vp;
3918
3919 if (vp != NULL) {
3920 error = EEXIST;
3921 goto out;
3922 }
3923
3924 switch (uap->mode & S_IFMT) {
3925 case S_IFCHR:
3926 VATTR_SET(&va, va_type, VCHR);
3927 break;
3928 case S_IFBLK:
3929 VATTR_SET(&va, va_type, VBLK);
3930 break;
3931 default:
3932 error = EINVAL;
3933 goto out;
3934 }
3935
3936 #if CONFIG_MACF
3937 error = mac_vnode_check_create(ctx,
3938 nd.ni_dvp, &nd.ni_cnd, &va);
3939 if (error)
3940 goto out;
3941 #endif
3942
3943 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
3944 goto out;
3945
3946 if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0)
3947 goto out;
3948
3949 if (vp) {
3950 int update_flags = 0;
3951
3952 // Make sure the name & parent pointers are hooked up
3953 if (vp->v_name == NULL)
3954 update_flags |= VNODE_UPDATE_NAME;
3955 if (vp->v_parent == NULLVP)
3956 update_flags |= VNODE_UPDATE_PARENT;
3957
3958 if (update_flags)
3959 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
3960
3961 #if CONFIG_FSE
3962 add_fsevent(FSE_CREATE_FILE, ctx,
3963 FSE_ARG_VNODE, vp,
3964 FSE_ARG_DONE);
3965 #endif
3966 }
3967
3968 out:
3969 /*
3970 * nameidone has to happen before we vnode_put(dvp)
3971 * since it may need to release the fs_nodelock on the dvp
3972 */
3973 nameidone(&nd);
3974
3975 if (vp)
3976 vnode_put(vp);
3977 vnode_put(dvp);
3978
3979 return (error);
3980 }
3981
3982 /*
3983 * Create a named pipe.
3984 *
3985 * Returns: 0 Success
3986 * EEXIST
3987 * namei:???
3988 * vnode_authorize:???
3989 * vn_create:???
3990 */
3991 static int
3992 mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap)
3993 {
3994 vnode_t vp, dvp;
3995 int error;
3996 struct nameidata nd;
3997
3998 NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1,
3999 UIO_USERSPACE, upath, ctx);
4000 error = namei(&nd);
4001 if (error)
4002 return (error);
4003 dvp = nd.ni_dvp;
4004 vp = nd.ni_vp;
4005
4006 /* check that this is a new file and authorize addition */
4007 if (vp != NULL) {
4008 error = EEXIST;
4009 goto out;
4010 }
4011 VATTR_SET(vap, va_type, VFIFO);
4012
4013 if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0)
4014 goto out;
4015
4016 error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx);
4017 out:
4018 /*
4019 * nameidone has to happen before we vnode_put(dvp)
4020 * since it may need to release the fs_nodelock on the dvp
4021 */
4022 nameidone(&nd);
4023
4024 if (vp)
4025 vnode_put(vp);
4026 vnode_put(dvp);
4027
4028 return error;
4029 }
4030
4031
4032 /*
4033 * mkfifo_extended: Create a named pipe; with extended argument list (including extended security (ACL)).
4034 *
4035 * Parameters: p Process requesting the open
4036 * uap User argument descriptor (see below)
4037 * retval (Ignored)
4038 *
4039 * Indirect: uap->path Path to fifo (same as 'mkfifo')
4040 * uap->uid UID to set
4041 * uap->gid GID to set
4042 * uap->mode File mode to set (same as 'mkfifo')
4043 * uap->xsecurity ACL to set, if creating
4044 *
4045 * Returns: 0 Success
4046 * !0 errno value
4047 *
4048 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
4049 *
4050 * XXX: We should enumerate the possible errno values here, and where
4051 * in the code they originated.
4052 */
4053 int
4054 mkfifo_extended(proc_t p, struct mkfifo_extended_args *uap, __unused int32_t *retval)
4055 {
4056 int ciferror;
4057 kauth_filesec_t xsecdst;
4058 struct vnode_attr va;
4059
4060 AUDIT_ARG(owner, uap->uid, uap->gid);
4061
4062 xsecdst = KAUTH_FILESEC_NONE;
4063 if (uap->xsecurity != USER_ADDR_NULL) {
4064 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
4065 return ciferror;
4066 }
4067
4068 VATTR_INIT(&va);
4069 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4070 if (uap->uid != KAUTH_UID_NONE)
4071 VATTR_SET(&va, va_uid, uap->uid);
4072 if (uap->gid != KAUTH_GID_NONE)
4073 VATTR_SET(&va, va_gid, uap->gid);
4074 if (xsecdst != KAUTH_FILESEC_NONE)
4075 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
4076
4077 ciferror = mkfifo1(vfs_context_current(), uap->path, &va);
4078
4079 if (xsecdst != KAUTH_FILESEC_NONE)
4080 kauth_filesec_free(xsecdst);
4081 return ciferror;
4082 }
4083
4084 /* ARGSUSED */
4085 int
4086 mkfifo(proc_t p, struct mkfifo_args *uap, __unused int32_t *retval)
4087 {
4088 struct vnode_attr va;
4089
4090 VATTR_INIT(&va);
4091 VATTR_SET(&va, va_mode, (uap->mode & ALLPERMS) & ~p->p_fd->fd_cmask);
4092
4093 return(mkfifo1(vfs_context_current(), uap->path, &va));
4094 }
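/*
 * A minimal userland sketch of mkfifo(2), which lands in mkfifo1() above
 * and fails with EEXIST if the node already exists. The path is an
 * assumed placeholder.
 */
#if 0	/* illustrative userland sketch; not compiled with the kernel */
#include <sys/stat.h>
#include <stdio.h>

int
main(void)
{
	if (mkfifo("/tmp/example.fifo", 0600) == -1) {
		perror("mkfifo");
		return 1;
	}
	return 0;
}
#endif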
4095
4096
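/*
 * Local strrchr(): return a pointer to the last occurrence of 'ch' in the
 * NUL-terminated string 'p', or NULL if it does not occur.
 */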
4097 static char *
4098 my_strrchr(char *p, int ch)
4099 {
4100 char *save;
4101
4102 for (save = NULL;; ++p) {
4103 if (*p == ch)
4104 save = p;
4105 if (!*p)
4106 return(save);
4107 }
4108 /* NOTREACHED */
4109 }
4110
4111 extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
4112
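/*
 * Build the path to 'dvp' (plus an optional leaf name) into 'path'.
 * This never fails outright: if the path cannot be obtained or does not
 * fit, *truncated_path is set and the nearest ancestor path (ultimately
 * the mount point or "/") is used instead. Returns the length of the
 * string stored in 'path', including the terminating NUL.
 */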
4113 int
4114 safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path)
4115 {
4116 int ret, len = _len;
4117
4118 *truncated_path = 0;
4119 ret = vn_getpath(dvp, path, &len);
4120 if (ret == 0 && len < (MAXPATHLEN - 1)) {
4121 if (leafname) {
4122 path[len-1] = '/';
4123 len += strlcpy(&path[len], leafname, MAXPATHLEN-len) + 1;
4124 if (len > MAXPATHLEN) {
4125 char *ptr;
4126
4127 // the string got truncated!
4128 *truncated_path = 1;
4129 ptr = my_strrchr(path, '/');
4130 if (ptr) {
4131 *ptr = '\0'; // chop off the string at the last directory component
4132 }
4133 len = strlen(path) + 1;
4134 }
4135 }
4136 } else if (ret == 0) {
4137 *truncated_path = 1;
4138 } else if (ret != 0) {
4139 struct vnode *mydvp=dvp;
4140
4141 if (ret != ENOSPC) {
4142 printf("safe_getpath: failed to get the path for vp %p (%s) : err %d\n",
4143 dvp, dvp->v_name ? dvp->v_name : "no-name", ret);
4144 }
4145 *truncated_path = 1;
4146
4147 do {
4148 if (mydvp->v_parent != NULL) {
4149 mydvp = mydvp->v_parent;
4150 } else if (mydvp->v_mount) {
4151 strlcpy(path, mydvp->v_mount->mnt_vfsstat.f_mntonname, _len);
4152 break;
4153 } else {
4154 // no parent and no mount point? only thing is to punt and say "/" changed
4155 strlcpy(path, "/", _len);
4156 len = 2;
4157 mydvp = NULL;
4158 }
4159
4160 if (mydvp == NULL) {
4161 break;
4162 }
4163
4164 len = _len;
4165 ret = vn_getpath(mydvp, path, &len);
4166 } while (ret == ENOSPC);
4167 }
4168
4169 return len;
4170 }
4171
4172
4173 /*
4174 * Make a hard file link.
4175 *
4176 * Returns: 0 Success
4177 * EPERM
4178 * EEXIST
4179 * EXDEV
4180 * namei:???
4181 * vnode_authorize:???
4182 * VNOP_LINK:???
4183 */
4184 /* ARGSUSED */
4185 static int
4186 linkat_internal(vfs_context_t ctx, int fd1, user_addr_t path, int fd2,
4187 user_addr_t link, int flag, enum uio_seg segflg)
4188 {
4189 vnode_t vp, dvp, lvp;
4190 struct nameidata nd;
4191 int follow;
4192 int error;
4193 #if CONFIG_FSE
4194 fse_info finfo;
4195 #endif
4196 int need_event, has_listeners;
4197 char *target_path = NULL;
4198 int truncated=0;
4199
4200 vp = dvp = lvp = NULLVP;
4201
4202 /* look up the object we are linking to */
4203 follow = (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW;
4204 NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow,
4205 segflg, path, ctx);
4206
4207 error = nameiat(&nd, fd1);
4208 if (error)
4209 return (error);
4210 vp = nd.ni_vp;
4211
4212 nameidone(&nd);
4213
4214 /*
4215 * Normally, linking to directories is not supported.
4216 * However, some file systems may have limited support.
4217 */
4218 if (vp->v_type == VDIR) {
4219 if (!ISSET(vp->v_mount->mnt_kern_flag, MNTK_DIR_HARDLINKS)) {
4220 error = EPERM; /* POSIX */
4221 goto out;
4222 }
4223
4224 /* Linking to a directory requires ownership. */
4225 if (!kauth_cred_issuser(vfs_context_ucred(ctx))) {
4226 struct vnode_attr dva;
4227
4228 VATTR_INIT(&dva);
4229 VATTR_WANTED(&dva, va_uid);
4230 if (vnode_getattr(vp, &dva, ctx) != 0 ||
4231 !VATTR_IS_SUPPORTED(&dva, va_uid) ||
4232 (dva.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)))) {
4233 error = EACCES;
4234 goto out;
4235 }
4236 }
4237 }
4238
4239 /* lookup the target node */
4240 #if CONFIG_TRIGGERS
4241 nd.ni_op = OP_LINK;
4242 #endif
4243 nd.ni_cnd.cn_nameiop = CREATE;
4244 nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK;
4245 nd.ni_dirp = link;
4246 error = nameiat(&nd, fd2);
4247 if (error != 0)
4248 goto out;
4249 dvp = nd.ni_dvp;
4250 lvp = nd.ni_vp;
4251
4252 #if CONFIG_MACF
4253 if ((error = mac_vnode_check_link(ctx, dvp, vp, &nd.ni_cnd)) != 0)
4254 goto out2;
4255 #endif
4256
4257 /* reject links to anything kauth does not allow as a link target (e.g. immutable items) */
4258 if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_LINKTARGET, ctx)) != 0)
4259 goto out2;
4260
4261 /* target node must not exist */
4262 if (lvp != NULLVP) {
4263 error = EEXIST;
4264 goto out2;
4265 }
4266 /* cannot link across mountpoints */
4267 if (vnode_mount(vp) != vnode_mount(dvp)) {
4268 error = EXDEV;
4269 goto out2;
4270 }
4271
4272 /* authorize creation of the target node */
4273 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
4274 goto out2;
4275
4276 /* and finally make the link */
4277 error = VNOP_LINK(vp, dvp, &nd.ni_cnd, ctx);
4278 if (error)
4279 goto out2;
4280
4281 #if CONFIG_MACF
4282 (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd);
4283 #endif
4284
4285 #if CONFIG_FSE
4286 need_event = need_fsevent(FSE_CREATE_FILE, dvp);
4287 #else
4288 need_event = 0;
4289 #endif
4290 has_listeners = kauth_authorize_fileop_has_listeners();
4291
4292 if (need_event || has_listeners) {
4293 char *link_to_path = NULL;
4294 int len, link_name_len;
4295
4296 /* build the path to the new link file */
4297 GET_PATH(target_path);
4298 if (target_path == NULL) {
4299 error = ENOMEM;
4300 goto out2;
4301 }
4302
4303 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, target_path, MAXPATHLEN, &truncated);
4304
4305 if (has_listeners) {
4306 /* build the path to file we are linking to */
4307 GET_PATH(link_to_path);
4308 if (link_to_path == NULL) {
4309 error = ENOMEM;
4310 goto out2;
4311 }
4312
4313 link_name_len = MAXPATHLEN;
4314 if (vn_getpath(vp, link_to_path, &link_name_len) == 0) {
4315 /*
4316 * Call out to allow 3rd party notification of the link operation.
4317 * Ignore result of kauth_authorize_fileop call.
4318 */
4319 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_LINK,
4320 (uintptr_t)link_to_path,
4321 (uintptr_t)target_path);
4322 }
4323 if (link_to_path != NULL) {
4324 RELEASE_PATH(link_to_path);
4325 }
4326 }
4327 #if CONFIG_FSE
4328 if (need_event) {
4329 /* construct fsevent */
4330 if (get_fse_info(vp, &finfo, ctx) == 0) {
4331 if (truncated) {
4332 finfo.mode |= FSE_TRUNCATED_PATH;
4333 }
4334
4335 // build the path to the destination of the link
4336 add_fsevent(FSE_CREATE_FILE, ctx,
4337 FSE_ARG_STRING, len, target_path,
4338 FSE_ARG_FINFO, &finfo,
4339 FSE_ARG_DONE);
4340 }
4341 if (vp->v_parent) {
4342 add_fsevent(FSE_STAT_CHANGED, ctx,
4343 FSE_ARG_VNODE, vp->v_parent,
4344 FSE_ARG_DONE);
4345 }
4346 }
4347 #endif
4348 }
4349 out2:
4350 /*
4351 * nameidone has to happen before we vnode_put(dvp)
4352 * since it may need to release the fs_nodelock on the dvp
4353 */
4354 nameidone(&nd);
4355 if (target_path != NULL) {
4356 RELEASE_PATH(target_path);
4357 }
4358 out:
4359 if (lvp)
4360 vnode_put(lvp);
4361 if (dvp)
4362 vnode_put(dvp);
4363 vnode_put(vp);
4364 return (error);
4365 }
4366
4367 int
4368 link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval)
4369 {
4370 return (linkat_internal(vfs_context_current(), AT_FDCWD, uap->path,
4371 AT_FDCWD, uap->link, AT_SYMLINK_FOLLOW, UIO_USERSPACE));
4372 }
4373
4374 int
4375 linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval)
4376 {
4377 if (uap->flag & ~AT_SYMLINK_FOLLOW)
4378 return (EINVAL);
4379
4380 return (linkat_internal(vfs_context_current(), uap->fd1, uap->path,
4381 uap->fd2, uap->link, uap->flag, UIO_USERSPACE));
4382 }
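
/*
 * Illustrative userspace sketch (not part of this file): creating a hard
 * link with linkat().  The file names are hypothetical.  As the flag
 * handling above shows, AT_SYMLINK_FOLLOW links to the target of a trailing
 * symlink, whereas a flag of 0 links to the symlink node itself.
 *
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (linkat(AT_FDCWD, "data.txt", AT_FDCWD, "data-link.txt",
 *		    AT_SYMLINK_FOLLOW) == -1) {
 *			perror("linkat");	// e.g. EEXIST or EXDEV, as documented above
 *			return 1;
 *		}
 *		return 0;
 *	}
 */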
4383
4384 /*
4385 * Make a symbolic link.
4386 *
4387 * We could add support for ACLs here too...
4388 */
4389 /* ARGSUSED */
4390 static int
4391 symlinkat_internal(vfs_context_t ctx, user_addr_t path_data, int fd,
4392 user_addr_t link, enum uio_seg segflg)
4393 {
4394 struct vnode_attr va;
4395 char *path;
4396 int error;
4397 struct nameidata nd;
4398 vnode_t vp, dvp;
4399 size_t dummy=0;
4400 proc_t p;
4401
4402 error = 0;
4403 if (UIO_SEG_IS_USER_SPACE(segflg)) {
4404 MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
4405 error = copyinstr(path_data, path, MAXPATHLEN, &dummy);
4406 } else {
4407 path = (char *)path_data;
4408 }
4409 if (error)
4410 goto out;
4411 AUDIT_ARG(text, path); /* This is the link string */
4412
4413 NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1,
4414 segflg, link, ctx);
4415
4416 error = nameiat(&nd, fd);
4417 if (error)
4418 goto out;
4419 dvp = nd.ni_dvp;
4420 vp = nd.ni_vp;
4421
4422 p = vfs_context_proc(ctx);
4423 VATTR_INIT(&va);
4424 VATTR_SET(&va, va_type, VLNK);
4425 VATTR_SET(&va, va_mode, ACCESSPERMS & ~p->p_fd->fd_cmask);
4426
4427 #if CONFIG_MACF
4428 error = mac_vnode_check_create(ctx,
4429 dvp, &nd.ni_cnd, &va);
4430 #endif
4431 if (error != 0) {
4432 goto skipit;
4433 }
4434
4435 if (vp != NULL) {
4436 error = EEXIST;
4437 goto skipit;
4438 }
4439
4440 /* authorize */
4441 if (error == 0)
4442 error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx);
4443 /* get default ownership, etc. */
4444 if (error == 0)
4445 error = vnode_authattr_new(dvp, &va, 0, ctx);
4446 if (error == 0)
4447 error = VNOP_SYMLINK(dvp, &vp, &nd.ni_cnd, &va, path, ctx);
4448
4449 #if CONFIG_MACF
4450 if (error == 0 && vp)
4451 error = vnode_label(vnode_mount(vp), dvp, vp, &nd.ni_cnd, VNODE_LABEL_CREATE, ctx);
4452 #endif
4453
4454 /* do fallback attribute handling */
4455 if (error == 0 && vp)
4456 error = vnode_setattr_fallback(vp, &va, ctx);
4457
4458 if (error == 0) {
4459 int update_flags = 0;
4460
4461 /* check if a new vnode was created, else try to get one */
4462 if (vp == NULL) {
4463 nd.ni_cnd.cn_nameiop = LOOKUP;
4464 #if CONFIG_TRIGGERS
4465 nd.ni_op = OP_LOOKUP;
4466 #endif
4467 nd.ni_cnd.cn_flags = 0;
4468 error = nameiat(&nd, fd);
4469 vp = nd.ni_vp;
4470
4471 if (vp == NULL)
4472 goto skipit;
4473 }
4474
4475 #if 0 /* XXX - kauth_todo - is KAUTH_FILEOP_SYMLINK needed? */
4476 /* call out to allow 3rd party notification of the symlink creation.
4477 * Ignore result of kauth_authorize_fileop call.
4478 */
4479 if (kauth_authorize_fileop_has_listeners() &&
4480 namei(&nd) == 0) {
4481 char *new_link_path = NULL;
4482 int len;
4483
4484 /* build the path to the new link file */
4485 new_link_path = get_pathbuff();
4486 len = MAXPATHLEN;
4487 vn_getpath(dvp, new_link_path, &len);
4488 if ((len + 1 + nd.ni_cnd.cn_namelen + 1) < MAXPATHLEN) {
4489 new_link_path[len - 1] = '/';
4490 strlcpy(&new_link_path[len], nd.ni_cnd.cn_nameptr, MAXPATHLEN-len);
4491 }
4492
4493 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_SYMLINK,
4494 (uintptr_t)path, (uintptr_t)new_link_path);
4495 if (new_link_path != NULL)
4496 release_pathbuff(new_link_path);
4497 }
4498 #endif
4499 // Make sure the name & parent pointers are hooked up
4500 if (vp->v_name == NULL)
4501 update_flags |= VNODE_UPDATE_NAME;
4502 if (vp->v_parent == NULLVP)
4503 update_flags |= VNODE_UPDATE_PARENT;
4504
4505 if (update_flags)
4506 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
4507
4508 #if CONFIG_FSE
4509 add_fsevent(FSE_CREATE_FILE, ctx,
4510 FSE_ARG_VNODE, vp,
4511 FSE_ARG_DONE);
4512 #endif
4513 }
4514
4515 skipit:
4516 /*
4517 * nameidone has to happen before we vnode_put(dvp)
4518 * since it may need to release the fs_nodelock on the dvp
4519 */
4520 nameidone(&nd);
4521
4522 if (vp)
4523 vnode_put(vp);
4524 vnode_put(dvp);
4525 out:
4526 if (path && (path != (char *)path_data))
4527 FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
4528
4529 return (error);
4530 }
4531
4532 int
4533 symlink(__unused proc_t p, struct symlink_args *uap, __unused int32_t *retval)
4534 {
4535 return (symlinkat_internal(vfs_context_current(), uap->path, AT_FDCWD,
4536 uap->link, UIO_USERSPACE));
4537 }
4538
4539 int
4540 symlinkat(__unused proc_t p, struct symlinkat_args *uap,
4541 __unused int32_t *retval)
4542 {
4543 return (symlinkat_internal(vfs_context_current(), uap->path1, uap->fd,
4544 uap->path2, UIO_USERSPACE));
4545 }
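
/*
 * Illustrative userspace sketch (not part of this file): creating a symbolic
 * link with symlinkat().  The names are hypothetical.  The first argument is
 * the link's contents and is stored verbatim; it is not required to name an
 * existing file.
 *
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (symlinkat("../shared/config.plist", AT_FDCWD, "config.plist") == -1) {
 *			perror("symlinkat");	// e.g. EEXIST if the name is taken
 *			return 1;
 *		}
 *		return 0;
 *	}
 */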
4546
4547 /*
4548 * Delete a whiteout from the filesystem.
4549 * No longer supported.
4550 */
4551 int
4552 undelete(__unused proc_t p, __unused struct undelete_args *uap, __unused int32_t *retval)
4553 {
4554 return (ENOTSUP);
4555 }
4556
4557 /*
4558 * Delete a name from the filesystem.
4559 */
4560 /* ARGSUSED */
4561 static int
4562 unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp,
4563 user_addr_t path_arg, enum uio_seg segflg, int unlink_flags)
4564 {
4565 struct nameidata nd;
4566 vnode_t vp, dvp;
4567 int error;
4568 struct componentname *cnp;
4569 char *path = NULL;
4570 int len=0;
4571 #if CONFIG_FSE
4572 fse_info finfo;
4573 struct vnode_attr va;
4574 #endif
4575 int flags;
4576 int need_event;
4577 int has_listeners;
4578 int truncated_path;
4579 int batched;
4580 struct vnode_attr *vap;
4581 int do_retry;
4582 int retry_count = 0;
4583 int cn_flags;
4584
4585 cn_flags = LOCKPARENT;
4586 if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH))
4587 cn_flags |= AUDITVNPATH1;
4588 /* If a starting dvp is passed, it trumps any fd passed. */
4589 if (start_dvp)
4590 cn_flags |= USEDVP;
4591
4592 #if NAMEDRSRCFORK
4593 /* unlink or delete is allowed on rsrc forks and named streams */
4594 cn_flags |= CN_ALLOWRSRCFORK;
4595 #endif
4596
4597 retry:
4598 do_retry = 0;
4599 flags = 0;
4600 need_event = 0;
4601 has_listeners = 0;
4602 truncated_path = 0;
4603 vap = NULL;
4604
4605 NDINIT(&nd, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx);
4606
4607 nd.ni_dvp = start_dvp;
4608 nd.ni_flag |= NAMEI_COMPOUNDREMOVE;
4609 cnp = &nd.ni_cnd;
4610
4611 continue_lookup:
4612 error = nameiat(&nd, fd);
4613 if (error)
4614 return (error);
4615
4616 dvp = nd.ni_dvp;
4617 vp = nd.ni_vp;
4618
4619
4620 /* With Carbon delete semantics, busy files cannot be deleted */
4621 if (unlink_flags & VNODE_REMOVE_NODELETEBUSY) {
4622 flags |= VNODE_REMOVE_NODELETEBUSY;
4623 }
4624
4625 /* Skip any potential upcalls if told to. */
4626 if (unlink_flags & VNODE_REMOVE_SKIP_NAMESPACE_EVENT) {
4627 flags |= VNODE_REMOVE_SKIP_NAMESPACE_EVENT;
4628 }
4629
4630 if (vp) {
4631 batched = vnode_compound_remove_available(vp);
4632 /*
4633 * The root of a mounted filesystem cannot be deleted.
4634 */
4635 if (vp->v_flag & VROOT) {
4636 error = EBUSY;
4637 }
4638
4639 if (!batched) {
4640 error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
4641 if (error) {
4642 if (error == ENOENT) {
4643 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4644 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4645 do_retry = 1;
4646 retry_count++;
4647 }
4648 }
4649 goto out;
4650 }
4651 }
4652 } else {
4653 batched = 1;
4654
4655 if (!vnode_compound_remove_available(dvp)) {
4656 panic("No vp, but no compound remove?");
4657 }
4658 }
4659
4660 #if CONFIG_FSE
4661 need_event = need_fsevent(FSE_DELETE, dvp);
4662 if (need_event) {
4663 if (!batched) {
4664 if ((vp->v_flag & VISHARDLINK) == 0) {
4665 /* XXX need to get these data in batched VNOP */
4666 get_fse_info(vp, &finfo, ctx);
4667 }
4668 } else {
4669 error = vfs_get_notify_attributes(&va);
4670 if (error) {
4671 goto out;
4672 }
4673
4674 vap = &va;
4675 }
4676 }
4677 #endif
4678 has_listeners = kauth_authorize_fileop_has_listeners();
4679 if (need_event || has_listeners) {
4680 if (path == NULL) {
4681 GET_PATH(path);
4682 if (path == NULL) {
4683 error = ENOMEM;
4684 goto out;
4685 }
4686 }
4687 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path);
4688 }
4689
4690 #if NAMEDRSRCFORK
4691 if (nd.ni_cnd.cn_flags & CN_WANTSRSRCFORK)
4692 error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx);
4693 else
4694 #endif
4695 {
4696 error = vn_remove(dvp, &nd.ni_vp, &nd, flags, vap, ctx);
4697 vp = nd.ni_vp;
4698 if (error == EKEEPLOOKING) {
4699 if (!batched) {
4700 panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?");
4701 }
4702
4703 if ((nd.ni_flag & NAMEI_CONTLOOKUP) == 0) {
4704 panic("EKEEPLOOKING, but continue flag not set?");
4705 }
4706
4707 if (vnode_isdir(vp)) {
4708 error = EISDIR;
4709 goto out;
4710 }
4711 goto continue_lookup;
4712 } else if (error == ENOENT && batched) {
4713 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
4714 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
4715 /*
4716 * For compound VNOPs, the authorization callback may
4717 * return ENOENT in case of racing hardlink lookups
4718 * hitting the name cache; redrive the lookup.
4719 */
4720 do_retry = 1;
4721 retry_count += 1;
4722 goto out;
4723 }
4724 }
4725 }
4726
4727 /*
4728 * Call out to allow 3rd party notification of delete.
4729 * Ignore result of kauth_authorize_fileop call.
4730 */
4731 if (!error) {
4732 if (has_listeners) {
4733 kauth_authorize_fileop(vfs_context_ucred(ctx),
4734 KAUTH_FILEOP_DELETE,
4735 (uintptr_t)vp,
4736 (uintptr_t)path);
4737 }
4738
4739 if (vp->v_flag & VISHARDLINK) {
4740 //
4741 // if a hardlink gets deleted we want to blow away the
4742 // v_parent link because the path that got us to this
4743 // instance of the link is no longer valid. this will
4744 // force the next call to get the path to ask the file
4745 // system instead of just following the v_parent link.
4746 //
4747 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
4748 }
4749
4750 #if CONFIG_FSE
4751 if (need_event) {
4752 if (vp->v_flag & VISHARDLINK) {
4753 get_fse_info(vp, &finfo, ctx);
4754 } else if (vap) {
4755 vnode_get_fse_info_from_vap(vp, &finfo, vap);
4756 }
4757 if (truncated_path) {
4758 finfo.mode |= FSE_TRUNCATED_PATH;
4759 }
4760 add_fsevent(FSE_DELETE, ctx,
4761 FSE_ARG_STRING, len, path,
4762 FSE_ARG_FINFO, &finfo,
4763 FSE_ARG_DONE);
4764 }
4765 #endif
4766 }
4767
4768 out:
4769 if (path != NULL)
4770 RELEASE_PATH(path);
4771
4772 #if NAMEDRSRCFORK
4773 /* recycle the deleted rsrc fork vnode to force a reclaim, which
4774 * will cause its shadow file to go away if necessary.
4775 */
4776 if (vp && (vnode_isnamedstream(vp)) &&
4777 (vp->v_parent != NULLVP) &&
4778 vnode_isshadow(vp)) {
4779 vnode_recycle(vp);
4780 }
4781 #endif
4782 /*
4783 * nameidone has to happen before we vnode_put(dvp)
4784 * since it may need to release the fs_nodelock on the dvp
4785 */
4786 nameidone(&nd);
4787 vnode_put(dvp);
4788 if (vp) {
4789 vnode_put(vp);
4790 }
4791
4792 if (do_retry) {
4793 goto retry;
4794 }
4795
4796 return (error);
4797 }
4798
4799 int
4800 unlink1(vfs_context_t ctx, vnode_t start_dvp, user_addr_t path_arg,
4801 enum uio_seg segflg, int unlink_flags)
4802 {
4803 return (unlinkat_internal(ctx, AT_FDCWD, start_dvp, path_arg, segflg,
4804 unlink_flags));
4805 }
4806
4807 /*
4808 * Delete a name from the filesystem using Carbon semantics.
4809 */
4810 int
4811 delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval)
4812 {
4813 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4814 uap->path, UIO_USERSPACE, VNODE_REMOVE_NODELETEBUSY));
4815 }
4816
4817 /*
4818 * Delete a name from the filesystem using POSIX semantics.
4819 */
4820 int
4821 unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval)
4822 {
4823 return (unlinkat_internal(vfs_context_current(), AT_FDCWD, NULLVP,
4824 uap->path, UIO_USERSPACE, 0));
4825 }
4826
4827 int
4828 unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval)
4829 {
4830 if (uap->flag & ~AT_REMOVEDIR)
4831 return (EINVAL);
4832
4833 if (uap->flag & AT_REMOVEDIR)
4834 return (rmdirat_internal(vfs_context_current(), uap->fd,
4835 uap->path, UIO_USERSPACE));
4836 else
4837 return (unlinkat_internal(vfs_context_current(), uap->fd,
4838 NULLVP, uap->path, UIO_USERSPACE, 0));
4839 }
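
/*
 * Illustrative userspace sketch (not part of this file): removing a file and
 * a directory with unlinkat().  The names are hypothetical.  As the flag
 * check above shows, AT_REMOVEDIR routes the call to the rmdir path and is
 * the only flag accepted.
 *
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (unlinkat(AT_FDCWD, "scratch.dat", 0) == -1)
 *			perror("unlinkat file");
 *		if (unlinkat(AT_FDCWD, "scratchdir", AT_REMOVEDIR) == -1)
 *			perror("unlinkat dir");	// e.g. ENOTEMPTY if not empty
 *		return 0;
 *	}
 */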
4840
4841 /*
4842 * Reposition read/write file offset.
4843 */
4844 int
4845 lseek(proc_t p, struct lseek_args *uap, off_t *retval)
4846 {
4847 struct fileproc *fp;
4848 vnode_t vp;
4849 struct vfs_context *ctx;
4850 off_t offset = uap->offset, file_size;
4851 int error;
4852
4853 if ( (error = fp_getfvp(p,uap->fd, &fp, &vp)) ) {
4854 if (error == ENOTSUP)
4855 return (ESPIPE);
4856 return (error);
4857 }
4858 if (vnode_isfifo(vp)) {
4859 file_drop(uap->fd);
4860 return(ESPIPE);
4861 }
4862
4863
4864 ctx = vfs_context_current();
4865 #if CONFIG_MACF
4866 if (uap->whence == L_INCR && uap->offset == 0)
4867 error = mac_file_check_get_offset(vfs_context_ucred(ctx),
4868 fp->f_fglob);
4869 else
4870 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
4871 fp->f_fglob);
4872 if (error) {
4873 file_drop(uap->fd);
4874 return (error);
4875 }
4876 #endif
4877 if ( (error = vnode_getwithref(vp)) ) {
4878 file_drop(uap->fd);
4879 return(error);
4880 }
4881
4882 switch (uap->whence) {
4883 case L_INCR:
4884 offset += fp->f_fglob->fg_offset;
4885 break;
4886 case L_XTND:
4887 if ((error = vnode_size(vp, &file_size, ctx)) != 0)
4888 break;
4889 offset += file_size;
4890 break;
4891 case L_SET:
4892 break;
4893 case SEEK_HOLE:
4894 error = VNOP_IOCTL(vp, FSCTL_FIOSEEKHOLE, (caddr_t)&offset, 0, ctx);
4895 break;
4896 case SEEK_DATA:
4897 error = VNOP_IOCTL(vp, FSCTL_FIOSEEKDATA, (caddr_t)&offset, 0, ctx);
4898 break;
4899 default:
4900 error = EINVAL;
4901 }
4902 if (error == 0) {
4903 if (uap->offset > 0 && offset < 0) {
4904 /* Incremented/relative move past max size */
4905 error = EOVERFLOW;
4906 } else {
4907 /*
4908 * Allow negative offsets on character devices, per
4909 * POSIX 1003.1-2001. Most likely for writing disk
4910 * labels.
4911 */
4912 if (offset < 0 && vp->v_type != VCHR) {
4913 /* Decremented/relative move before start */
4914 error = EINVAL;
4915 } else {
4916 /* Success */
4917 fp->f_fglob->fg_offset = offset;
4918 *retval = fp->f_fglob->fg_offset;
4919 }
4920 }
4921 }
4922
4923 /*
4924 * An lseek can affect whether data is "available to read." Use
4925 * a hint of NOTE_NONE so that no EVFILT_VNODE events fire.
4926 */
4927 post_event_if_success(vp, error, NOTE_NONE);
4928 (void)vnode_put(vp);
4929 file_drop(uap->fd);
4930 return (error);
4931 }
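
/*
 * Illustrative userspace sketch (not part of this file): repositioning a
 * file offset with lseek().  The file name is hypothetical.  SEEK_HOLE and
 * SEEK_DATA are forwarded to the filesystem via VNOP_IOCTL above and may not
 * be supported everywhere, so they are only used here if the headers define
 * them.
 *
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = open("data.bin", O_RDONLY);
 *		if (fd == -1) {
 *			perror("open");
 *			return 1;
 *		}
 *		off_t end = lseek(fd, 0, SEEK_END);	// file size
 *		printf("size: %lld\n", (long long)end);
 *	#ifdef SEEK_HOLE
 *		off_t hole = lseek(fd, 0, SEEK_HOLE);	// first hole at or after offset 0
 *		if (hole != -1)
 *			printf("first hole: %lld\n", (long long)hole);
 *	#endif
 *		close(fd);
 *		return 0;
 *	}
 */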
4932
4933
4934 /*
4935 * Check access permissions.
4936 *
4937 * Returns: 0 Success
4938 * vnode_authorize:???
4939 */
4940 static int
4941 access1(vnode_t vp, vnode_t dvp, int uflags, vfs_context_t ctx)
4942 {
4943 kauth_action_t action;
4944 int error;
4945
4946 /*
4947 * If just the regular access bits, convert them to something
4948 * that vnode_authorize will understand.
4949 */
4950 if (!(uflags & _ACCESS_EXTENDED_MASK)) {
4951 action = 0;
4952 if (uflags & R_OK)
4953 action |= KAUTH_VNODE_READ_DATA; /* aka KAUTH_VNODE_LIST_DIRECTORY */
4954 if (uflags & W_OK) {
4955 if (vnode_isdir(vp)) {
4956 action |= KAUTH_VNODE_ADD_FILE |
4957 KAUTH_VNODE_ADD_SUBDIRECTORY;
4958 /* might want delete rights here too */
4959 } else {
4960 action |= KAUTH_VNODE_WRITE_DATA;
4961 }
4962 }
4963 if (uflags & X_OK) {
4964 if (vnode_isdir(vp)) {
4965 action |= KAUTH_VNODE_SEARCH;
4966 } else {
4967 action |= KAUTH_VNODE_EXECUTE;
4968 }
4969 }
4970 } else {
4971 /* take advantage of definition of uflags */
4972 action = uflags >> 8;
4973 }
4974
4975 #if CONFIG_MACF
4976 error = mac_vnode_check_access(ctx, vp, uflags);
4977 if (error)
4978 return (error);
4979 #endif /* MAC */
4980
4981 /* action == 0 means only check for existence */
4982 if (action != 0) {
4983 error = vnode_authorize(vp, dvp, action | KAUTH_VNODE_ACCESS, ctx);
4984 } else {
4985 error = 0;
4986 }
4987
4988 return(error);
4989 }
4990
4991
4992
4993 /*
4994 * access_extended: Check access permissions in bulk.
4995 *
4996 * Description: uap->entries Pointer to an array of accessx
4997 * descriptor structs, plus one or
4998 * more NULL terminated strings (see
4999 * "Notes" section below).
5000 * uap->size Size of the area pointed to by
5001 * uap->entries.
5002 * uap->results Pointer to the results array.
5003 *
5004 * Returns: 0 Success
5005 * ENOMEM Insufficient memory
5006 * EINVAL Invalid arguments
5007 * namei:EFAULT Bad address
5008 * namei:ENAMETOOLONG Filename too long
5009 * namei:ENOENT No such file or directory
5010 * namei:ELOOP Too many levels of symbolic links
5011 * namei:EBADF Bad file descriptor
5012 * namei:ENOTDIR Not a directory
5013 * namei:???
5014 * access1:
5015 *
5016 * Implicit returns:
5017 * uap->results Array contents modified
5018 *
5019 * Notes: The uap->entries are structured as an arbitrary length array
5020 * of accessx descriptors, followed by one or more NULL terminated
5021 * strings
5022 *
5023 * struct accessx_descriptor[0]
5024 * ...
5025 * struct accessx_descriptor[n]
5026 * char name_data[0];
5027 *
5028 * We determine the entry count by walking the buffer containing
5029 * the uap->entries argument descriptor. For each descriptor we
5030 * see, the valid values for the offset ad_name_offset will be
5031 * in the byte range:
5032 *
5033 * [ uap->entries + sizeof(struct accessx_descriptor) ]
5034 * to
5035 * [ uap->entries + uap->size - 2 ]
5036 *
5037 * since we must have at least one string, and the string must
5038 * be at least one character plus the NULL terminator in length.
5039 *
5040 * XXX: Need to support the check-as uid argument
5041 */
5042 int
5043 access_extended(__unused proc_t p, struct access_extended_args *uap, __unused int32_t *retval)
5044 {
5045 struct accessx_descriptor *input = NULL;
5046 errno_t *result = NULL;
5047 errno_t error = 0;
5048 int wantdelete = 0;
5049 unsigned int desc_max, desc_actual, i, j;
5050 struct vfs_context context;
5051 struct nameidata nd;
5052 int niopts;
5053 vnode_t vp = NULL;
5054 vnode_t dvp = NULL;
5055 #define ACCESSX_MAX_DESCR_ON_STACK 10
5056 struct accessx_descriptor stack_input[ACCESSX_MAX_DESCR_ON_STACK];
5057
5058 context.vc_ucred = NULL;
5059
5060 /*
5061 * Validate parameters; if valid, copy the descriptor array and string
5062 * arguments into local memory. Before proceeding, the following
5063 * conditions must have been met:
5064 *
5065 * o The total size is not permitted to exceed ACCESSX_MAX_TABLESIZE
5066 * o There must be sufficient room in the request for at least one
5067 * descriptor and a one byte NUL-terminated string.
5068 * o The allocation of local storage must not fail.
5069 */
5070 if (uap->size > ACCESSX_MAX_TABLESIZE)
5071 return(ENOMEM);
5072 if (uap->size < (sizeof(struct accessx_descriptor) + 2))
5073 return(EINVAL);
5074 if (uap->size <= sizeof (stack_input)) {
5075 input = stack_input;
5076 } else {
5077 MALLOC(input, struct accessx_descriptor *, uap->size, M_TEMP, M_WAITOK);
5078 if (input == NULL) {
5079 error = ENOMEM;
5080 goto out;
5081 }
5082 }
5083 error = copyin(uap->entries, input, uap->size);
5084 if (error)
5085 goto out;
5086
5087 AUDIT_ARG(opaque, input, uap->size);
5088
5089 /*
5090 * Force NUL termination of the copyin buffer to avoid namei() running
5091 * off the end. If the caller passes us bogus data, they may get a
5092 * bogus result.
5093 */
5094 ((char *)input)[uap->size - 1] = 0;
5095
5096 /*
5097 * Access is defined as checking against the process' real identity,
5098 * even if operations are checking the effective identity. This
5099 * requires that we use a local vfs context.
5100 */
5101 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5102 context.vc_thread = current_thread();
5103
5104 /*
5105 * Find out how many entries we have, so we can allocate the result
5106 * array by walking the list and adjusting the count downward by the
5107 * earliest string offset we see.
5108 */
5109 desc_max = (uap->size - 2) / sizeof(struct accessx_descriptor);
5110 desc_actual = desc_max;
5111 for (i = 0; i < desc_actual; i++) {
5112 /*
5113 * Take the offset to the name string for this entry and
5114 * convert to an input array index, which would be one off
5115 * the end of the array if this entry was the lowest-addressed
5116 * name string.
5117 */
5118 j = input[i].ad_name_offset / sizeof(struct accessx_descriptor);
5119
5120 /*
5121 * An offset greater than the max allowable offset is an error.
5122 * It is also an error for any valid entry to point
5123 * to a location prior to the end of the current entry, if
5124 * it's not a reference to the string of the previous entry.
5125 */
5126 if (j > desc_max || (j != 0 && j <= i)) {
5127 error = EINVAL;
5128 goto out;
5129 }
5130
5131 /* Also do not let ad_name_offset point to something beyond the size of the input */
5132 if (input[i].ad_name_offset >= uap->size) {
5133 error = EINVAL;
5134 goto out;
5135 }
5136
5137 /*
5138 * An offset of 0 means use the previous descriptor's offset;
5139 * this is used to chain multiple requests for the same file
5140 * to avoid multiple lookups.
5141 */
5142 if (j == 0) {
5143 /* This is not valid for the first entry */
5144 if (i == 0) {
5145 error = EINVAL;
5146 goto out;
5147 }
5148 continue;
5149 }
5150
5151 /*
5152 * If the offset of the string for this descriptor is before
5153 * what we believe is the current actual last descriptor,
5154 * then we need to adjust our estimate downward; this permits
5155 * the string table following the last descriptor to be out
5156 * of order relative to the descriptor list.
5157 */
5158 if (j < desc_actual)
5159 desc_actual = j;
5160 }
5161
5162 /*
5163 * We limit the actual number of descriptors we are willing to process
5164 * to a hard maximum of ACCESSX_MAX_DESCRIPTORS; if the number being
5165 * requested exceeds this limit, the request fails with ENOMEM.
5166 */
5167 if (desc_actual > ACCESSX_MAX_DESCRIPTORS) {
5168 error = ENOMEM;
5169 goto out;
5170 }
5171 MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
5172 if (result == NULL) {
5173 error = ENOMEM;
5174 goto out;
5175 }
5176
5177 /*
5178 * Do the work by iterating over the descriptor entries we know to
5179 * at least appear to contain valid data.
5180 */
5181 error = 0;
5182 for (i = 0; i < desc_actual; i++) {
5183 /*
5184 * If the ad_name_offset is 0, then we use the previous
5185 * results to make the check; otherwise, we are looking up
5186 * a new file name.
5187 */
5188 if (input[i].ad_name_offset != 0) {
5189 /* discard old vnodes */
5190 if (vp) {
5191 vnode_put(vp);
5192 vp = NULL;
5193 }
5194 if (dvp) {
5195 vnode_put(dvp);
5196 dvp = NULL;
5197 }
5198
5199 /*
5200 * Scan forward in the descriptor list to see if we
5201 * need the parent vnode. We will need it if we are
5202 * deleting, since we must have rights to remove
5203 * entries in the parent directory, as well as the
5204 * rights to delete the object itself.
5205 */
5206 wantdelete = input[i].ad_flags & _DELETE_OK;
5207 for (j = i + 1; (j < desc_actual) && (input[j].ad_name_offset == 0); j++)
5208 if (input[j].ad_flags & _DELETE_OK)
5209 wantdelete = 1;
5210
5211 niopts = FOLLOW | AUDITVNPATH1;
5212
5213 /* need parent for vnode_authorize for deletion test */
5214 if (wantdelete)
5215 niopts |= WANTPARENT;
5216
5217 /* do the lookup */
5218 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE,
5219 CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset),
5220 &context);
5221 error = namei(&nd);
5222 if (!error) {
5223 vp = nd.ni_vp;
5224 if (wantdelete)
5225 dvp = nd.ni_dvp;
5226 }
5227 nameidone(&nd);
5228 }
5229
5230 /*
5231 * Handle lookup errors.
5232 */
5233 switch(error) {
5234 case ENOENT:
5235 case EACCES:
5236 case EPERM:
5237 case ENOTDIR:
5238 result[i] = error;
5239 break;
5240 case 0:
5241 /* run this access check */
5242 result[i] = access1(vp, dvp, input[i].ad_flags, &context);
5243 break;
5244 default:
5245 /* fatal lookup error */
5246
5247 goto out;
5248 }
5249 }
5250
5251 AUDIT_ARG(data, result, sizeof(errno_t), desc_actual);
5252
5253 /* copy out results */
5254 error = copyout(result, uap->results, desc_actual * sizeof(errno_t));
5255
5256 out:
5257 if (input && input != stack_input)
5258 FREE(input, M_TEMP);
5259 if (result)
5260 FREE(result, M_TEMP);
5261 if (vp)
5262 vnode_put(vp);
5263 if (dvp)
5264 vnode_put(dvp);
5265 if (IS_VALID_CRED(context.vc_ucred))
5266 kauth_cred_unref(&context.vc_ucred);
5267 return(error);
5268 }
5269
5270
5271 /*
5272 * Returns: 0 Success
5273 * namei:EFAULT Bad address
5274 * namei:ENAMETOOLONG Filename too long
5275 * namei:ENOENT No such file or directory
5276 * namei:ELOOP Too many levels of symbolic links
5277 * namei:EBADF Bad file descriptor
5278 * namei:ENOTDIR Not a directory
5279 * namei:???
5280 * access1:
5281 */
5282 static int
5283 faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode,
5284 int flag, enum uio_seg segflg)
5285 {
5286 int error;
5287 struct nameidata nd;
5288 int niopts;
5289 struct vfs_context context;
5290 #if NAMEDRSRCFORK
5291 int is_namedstream = 0;
5292 #endif
5293
5294 /*
5295 * Unless the AT_EACCESS option is used, access is defined as checking
5296 * against the process' real identity, even if operations are checking
5297 * the effective identity. So we need to tweak the credential
5298 * in the context for that case.
5299 */
5300 if (!(flag & AT_EACCESS))
5301 context.vc_ucred = kauth_cred_copy_real(kauth_cred_get());
5302 else
5303 context.vc_ucred = ctx->vc_ucred;
5304 context.vc_thread = ctx->vc_thread;
5305
5306
5307 niopts = FOLLOW | AUDITVNPATH1;
5308 /* need parent for vnode_authorize for deletion test */
5309 if (amode & _DELETE_OK)
5310 niopts |= WANTPARENT;
5311 NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, segflg,
5312 path, &context);
5313
5314 #if NAMEDRSRCFORK
5315 /* access(F_OK) calls are allowed for resource forks. */
5316 if (amode == F_OK)
5317 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5318 #endif
5319 error = nameiat(&nd, fd);
5320 if (error)
5321 goto out;
5322
5323 #if NAMEDRSRCFORK
5324 /* Grab reference on the shadow stream file vnode to
5325 * force an inactive on release which will mark it
5326 * for recycle.
5327 */
5328 if (vnode_isnamedstream(nd.ni_vp) &&
5329 (nd.ni_vp->v_parent != NULLVP) &&
5330 vnode_isshadow(nd.ni_vp)) {
5331 is_namedstream = 1;
5332 vnode_ref(nd.ni_vp);
5333 }
5334 #endif
5335
5336 error = access1(nd.ni_vp, nd.ni_dvp, amode, &context);
5337
5338 #if NAMEDRSRCFORK
5339 if (is_namedstream) {
5340 vnode_rele(nd.ni_vp);
5341 }
5342 #endif
5343
5344 vnode_put(nd.ni_vp);
5345 if (amode & _DELETE_OK)
5346 vnode_put(nd.ni_dvp);
5347 nameidone(&nd);
5348
5349 out:
5350 if (!(flag & AT_EACCESS))
5351 kauth_cred_unref(&context.vc_ucred);
5352 return (error);
5353 }
5354
5355 int
5356 access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval)
5357 {
5358 return (faccessat_internal(vfs_context_current(), AT_FDCWD,
5359 uap->path, uap->flags, 0, UIO_USERSPACE));
5360 }
5361
5362 int
5363 faccessat(__unused proc_t p, struct faccessat_args *uap,
5364 __unused int32_t *retval)
5365 {
5366 if (uap->flag & ~AT_EACCESS)
5367 return (EINVAL);
5368
5369 return (faccessat_internal(vfs_context_current(), uap->fd,
5370 uap->path, uap->amode, uap->flag, UIO_USERSPACE));
5371 }
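
/*
 * Illustrative userspace sketch (not part of this file): checking
 * permissions with faccessat().  The path is hypothetical.  Without
 * AT_EACCESS the check is made against the real uid/gid, as the credential
 * handling above shows; with AT_EACCESS the effective identity is used.
 *
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (faccessat(AT_FDCWD, "/usr/local/bin", W_OK | X_OK, 0) == 0)
 *			printf("writable and searchable (real ids)\n");
 *		if (faccessat(AT_FDCWD, "/usr/local/bin", W_OK | X_OK, AT_EACCESS) == 0)
 *			printf("writable and searchable (effective ids)\n");
 *		return 0;
 *	}
 */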
5372
5373 /*
5374 * Returns: 0 Success
5375 * EFAULT
5376 * copyout:EFAULT
5377 * namei:???
5378 * vn_stat:???
5379 */
5380 static int
5381 fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub,
5382 user_addr_t xsecurity, user_addr_t xsecurity_size, int isstat64,
5383 enum uio_seg segflg, int fd, int flag)
5384 {
5385 struct nameidata nd;
5386 int follow;
5387 union {
5388 struct stat sb;
5389 struct stat64 sb64;
5390 } source;
5391 union {
5392 struct user64_stat user64_sb;
5393 struct user32_stat user32_sb;
5394 struct user64_stat64 user64_sb64;
5395 struct user32_stat64 user32_sb64;
5396 } dest;
5397 caddr_t sbp;
5398 int error, my_size;
5399 kauth_filesec_t fsec;
5400 size_t xsecurity_bufsize;
5401 void * statptr;
5402
5403 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5404 NDINIT(&nd, LOOKUP, OP_GETATTR, follow | AUDITVNPATH1,
5405 segflg, path, ctx);
5406
5407 #if NAMEDRSRCFORK
5408 int is_namedstream = 0;
5409 /* stat calls are allowed for resource forks. */
5410 nd.ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
5411 #endif
5412 error = nameiat(&nd, fd);
5413 if (error)
5414 return (error);
5415 fsec = KAUTH_FILESEC_NONE;
5416
5417 statptr = (void *)&source;
5418
5419 #if NAMEDRSRCFORK
5420 /* Grab reference on the shadow stream file vnode to
5421 * force an inactive on release which will mark it
5422 * for recycle.
5423 */
5424 if (vnode_isnamedstream(nd.ni_vp) &&
5425 (nd.ni_vp->v_parent != NULLVP) &&
5426 vnode_isshadow(nd.ni_vp)) {
5427 is_namedstream = 1;
5428 vnode_ref(nd.ni_vp);
5429 }
5430 #endif
5431
5432 error = vn_stat(nd.ni_vp, statptr, (xsecurity != USER_ADDR_NULL ? &fsec : NULL), isstat64, ctx);
5433
5434 #if NAMEDRSRCFORK
5435 if (is_namedstream) {
5436 vnode_rele(nd.ni_vp);
5437 }
5438 #endif
5439 vnode_put(nd.ni_vp);
5440 nameidone(&nd);
5441
5442 if (error)
5443 return (error);
5444 /* Zap spare fields */
5445 if (isstat64 != 0) {
5446 source.sb64.st_lspare = 0;
5447 source.sb64.st_qspare[0] = 0LL;
5448 source.sb64.st_qspare[1] = 0LL;
5449 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5450 munge_user64_stat64(&source.sb64, &dest.user64_sb64);
5451 my_size = sizeof(dest.user64_sb64);
5452 sbp = (caddr_t)&dest.user64_sb64;
5453 } else {
5454 munge_user32_stat64(&source.sb64, &dest.user32_sb64);
5455 my_size = sizeof(dest.user32_sb64);
5456 sbp = (caddr_t)&dest.user32_sb64;
5457 }
5458 /*
5459 * Check if we raced (post lookup) against the last unlink of a file.
5460 */
5461 if ((source.sb64.st_nlink == 0) && S_ISREG(source.sb64.st_mode)) {
5462 source.sb64.st_nlink = 1;
5463 }
5464 } else {
5465 source.sb.st_lspare = 0;
5466 source.sb.st_qspare[0] = 0LL;
5467 source.sb.st_qspare[1] = 0LL;
5468 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
5469 munge_user64_stat(&source.sb, &dest.user64_sb);
5470 my_size = sizeof(dest.user64_sb);
5471 sbp = (caddr_t)&dest.user64_sb;
5472 } else {
5473 munge_user32_stat(&source.sb, &dest.user32_sb);
5474 my_size = sizeof(dest.user32_sb);
5475 sbp = (caddr_t)&dest.user32_sb;
5476 }
5477
5478 /*
5479 * Check if we raced (post lookup) against the last unlink of a file.
5480 */
5481 if ((source.sb.st_nlink == 0) && S_ISREG(source.sb.st_mode)) {
5482 source.sb.st_nlink = 1;
5483 }
5484 }
5485 if ((error = copyout(sbp, ub, my_size)) != 0)
5486 goto out;
5487
5488 /* caller wants extended security information? */
5489 if (xsecurity != USER_ADDR_NULL) {
5490
5491 /* did we get any? */
5492 if (fsec == KAUTH_FILESEC_NONE) {
5493 if (susize(xsecurity_size, 0) != 0) {
5494 error = EFAULT;
5495 goto out;
5496 }
5497 } else {
5498 /* find the user buffer size */
5499 xsecurity_bufsize = fusize(xsecurity_size);
5500
5501 /* copy out the actual data size */
5502 if (susize(xsecurity_size, KAUTH_FILESEC_COPYSIZE(fsec)) != 0) {
5503 error = EFAULT;
5504 goto out;
5505 }
5506
5507 /* if the caller supplied enough room, copy out to it */
5508 if (xsecurity_bufsize >= KAUTH_FILESEC_COPYSIZE(fsec))
5509 error = copyout(fsec, xsecurity, KAUTH_FILESEC_COPYSIZE(fsec));
5510 }
5511 }
5512 out:
5513 if (fsec != KAUTH_FILESEC_NONE)
5514 kauth_filesec_free(fsec);
5515 return (error);
5516 }
5517
5518 /*
5519 * stat_extended: Get file status; with extended security (ACL).
5520 *
5521 * Parameters: p (ignored)
5522 * uap User argument descriptor (see below)
5523 * retval (ignored)
5524 *
5525 * Indirect: uap->path Path of file to get status from
5526 * uap->ub User buffer (holds file status info)
5527 * uap->xsecurity ACL to get (extended security)
5528 * uap->xsecurity_size Size of ACL
5529 *
5530 * Returns: 0 Success
5531 * !0 errno value
5532 *
5533 */
5534 int
5535 stat_extended(__unused proc_t p, struct stat_extended_args *uap,
5536 __unused int32_t *retval)
5537 {
5538 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5539 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5540 0));
5541 }
5542
5543 /*
5544 * Returns: 0 Success
5545 * fstatat_internal:??? [see fstatat_internal() in this file]
5546 */
5547 int
5548 stat(__unused proc_t p, struct stat_args *uap, __unused int32_t *retval)
5549 {
5550 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5551 0, 0, 0, UIO_USERSPACE, AT_FDCWD, 0));
5552 }
5553
5554 int
5555 stat64(__unused proc_t p, struct stat64_args *uap, __unused int32_t *retval)
5556 {
5557 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5558 0, 0, 1, UIO_USERSPACE, AT_FDCWD, 0));
5559 }
5560
5561 /*
5562 * stat64_extended: Get file status; can handle large inode numbers; with extended security (ACL).
5563 *
5564 * Parameters: p (ignored)
5565 * uap User argument descriptor (see below)
5566 * retval (ignored)
5567 *
5568 * Indirect: uap->path Path of file to get status from
5569 * uap->ub User buffer (holds file status info)
5570 * uap->xsecurity ACL to get (extended security)
5571 * uap->xsecurity_size Size of ACL
5572 *
5573 * Returns: 0 Success
5574 * !0 errno value
5575 *
5576 */
5577 int
5578 stat64_extended(__unused proc_t p, struct stat64_extended_args *uap, __unused int32_t *retval)
5579 {
5580 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5581 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5582 0));
5583 }
5584
5585 /*
5586 * lstat_extended: Get file status; does not follow links; with extended security (ACL).
5587 *
5588 * Parameters: p (ignored)
5589 * uap User argument descriptor (see below)
5590 * retval (ignored)
5591 *
5592 * Indirect: uap->path Path of file to get status from
5593 * uap->ub User buffer (holds file status info)
5594 * uap->xsecurity ACL to get (extended security)
5595 * uap->xsecurity_size Size of ACL
5596 *
5597 * Returns: 0 Success
5598 * !0 errno value
5599 *
5600 */
5601 int
5602 lstat_extended(__unused proc_t p, struct lstat_extended_args *uap, __unused int32_t *retval)
5603 {
5604 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5605 uap->xsecurity, uap->xsecurity_size, 0, UIO_USERSPACE, AT_FDCWD,
5606 AT_SYMLINK_NOFOLLOW));
5607 }
5608
5609 /*
5610 * Get file status; this version does not follow links.
5611 */
5612 int
5613 lstat(__unused proc_t p, struct lstat_args *uap, __unused int32_t *retval)
5614 {
5615 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5616 0, 0, 0, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5617 }
5618
5619 int
5620 lstat64(__unused proc_t p, struct lstat64_args *uap, __unused int32_t *retval)
5621 {
5622 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5623 0, 0, 1, UIO_USERSPACE, AT_FDCWD, AT_SYMLINK_NOFOLLOW));
5624 }
5625
5626 /*
5627 * lstat64_extended: Get file status; can handle large inode numbers; does not
5628 * follow links; with extended security (ACL).
5629 *
5630 * Parameters: p (ignored)
5631 * uap User argument descriptor (see below)
5632 * retval (ignored)
5633 *
5634 * Indirect: uap->path Path of file to get status from
5635 * uap->ub User buffer (holds file status info)
5636 * uap->xsecurity ACL to get (extended security)
5637 * uap->xsecurity_size Size of ACL
5638 *
5639 * Returns: 0 Success
5640 * !0 errno value
5641 *
5642 */
5643 int
5644 lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int32_t *retval)
5645 {
5646 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5647 uap->xsecurity, uap->xsecurity_size, 1, UIO_USERSPACE, AT_FDCWD,
5648 AT_SYMLINK_NOFOLLOW));
5649 }
5650
5651 int
5652 fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval)
5653 {
5654 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5655 return (EINVAL);
5656
5657 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5658 0, 0, 0, UIO_USERSPACE, uap->fd, uap->flag));
5659 }
5660
5661 int
5662 fstatat64(__unused proc_t p, struct fstatat64_args *uap,
5663 __unused int32_t *retval)
5664 {
5665 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
5666 return (EINVAL);
5667
5668 return (fstatat_internal(vfs_context_current(), uap->path, uap->ub,
5669 0, 0, 1, UIO_USERSPACE, uap->fd, uap->flag));
5670 }
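
/*
 * Illustrative userspace sketch (not part of this file): retrieving file
 * status with fstatat().  The path is only an example.  AT_SYMLINK_NOFOLLOW
 * gives lstat()-like behaviour, matching the flag handling in
 * fstatat_internal() above.
 *
 *	#include <sys/stat.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct stat sb;
 *
 *		if (fstatat(AT_FDCWD, "/etc/hosts", &sb, AT_SYMLINK_NOFOLLOW) == -1) {
 *			perror("fstatat");
 *			return 1;
 *		}
 *		if (S_ISLNK(sb.st_mode))
 *			printf("symlink, %lld bytes\n", (long long)sb.st_size);
 *		else
 *			printf("mode %o, %lld bytes\n", sb.st_mode & 07777,
 *			    (long long)sb.st_size);
 *		return 0;
 *	}
 */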
5671
5672 /*
5673 * Get configurable pathname variables.
5674 *
5675 * Returns: 0 Success
5676 * namei:???
5677 * vn_pathconf:???
5678 *
5679 * Notes: Global implementation constants are intended to be
5680 * implemented in this function directly; all other constants
5681 * are implemented per-FS, and therefore must be handled by
5682 * each respective FS instead.
5683 *
5684 * XXX We implement some things globally right now that should actually be
5685 * XXX per-FS; we will need to deal with this at some point.
5686 */
5687 /* ARGSUSED */
5688 int
5689 pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval)
5690 {
5691 int error;
5692 struct nameidata nd;
5693 vfs_context_t ctx = vfs_context_current();
5694
5695 NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1,
5696 UIO_USERSPACE, uap->path, ctx);
5697 error = namei(&nd);
5698 if (error)
5699 return (error);
5700
5701 error = vn_pathconf(nd.ni_vp, uap->name, retval, ctx);
5702
5703 vnode_put(nd.ni_vp);
5704 nameidone(&nd);
5705 return (error);
5706 }
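
/*
 * Illustrative userspace sketch (not part of this file): querying a pathname
 * variable with pathconf().  A return of -1 with errno left unchanged means
 * the variable has no limit, so errno must be cleared before the call.
 *
 *	#include <unistd.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		errno = 0;
 *		long name_max = pathconf("/tmp", _PC_NAME_MAX);
 *		if (name_max == -1 && errno != 0)
 *			perror("pathconf");
 *		else
 *			printf("_PC_NAME_MAX for /tmp: %ld\n", name_max);
 *		return 0;
 *	}
 */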
5707
5708 /*
5709 * Return target name of a symbolic link.
5710 */
5711 /* ARGSUSED */
5712 static int
5713 readlinkat_internal(vfs_context_t ctx, int fd, user_addr_t path,
5714 enum uio_seg seg, user_addr_t buf, size_t bufsize, enum uio_seg bufseg,
5715 int *retval)
5716 {
5717 vnode_t vp;
5718 uio_t auio;
5719 int error;
5720 struct nameidata nd;
5721 char uio_buf[ UIO_SIZEOF(1) ];
5722
5723 NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1,
5724 seg, path, ctx);
5725
5726 error = nameiat(&nd, fd);
5727 if (error)
5728 return (error);
5729 vp = nd.ni_vp;
5730
5731 nameidone(&nd);
5732
5733 auio = uio_createwithbuffer(1, 0, bufseg, UIO_READ,
5734 &uio_buf[0], sizeof(uio_buf));
5735 uio_addiov(auio, buf, bufsize);
5736 if (vp->v_type != VLNK) {
5737 error = EINVAL;
5738 } else {
5739 #if CONFIG_MACF
5740 error = mac_vnode_check_readlink(ctx, vp);
5741 #endif
5742 if (error == 0)
5743 error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA,
5744 ctx);
5745 if (error == 0)
5746 error = VNOP_READLINK(vp, auio, ctx);
5747 }
5748 vnode_put(vp);
5749
5750 *retval = bufsize - (int)uio_resid(auio);
5751 return (error);
5752 }
5753
5754 int
5755 readlink(proc_t p, struct readlink_args *uap, int32_t *retval)
5756 {
5757 enum uio_seg procseg;
5758
5759 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5760 return (readlinkat_internal(vfs_context_current(), AT_FDCWD,
5761 CAST_USER_ADDR_T(uap->path), procseg, CAST_USER_ADDR_T(uap->buf),
5762 uap->count, procseg, retval));
5763 }
5764
5765 int
5766 readlinkat(proc_t p, struct readlinkat_args *uap, int32_t *retval)
5767 {
5768 enum uio_seg procseg;
5769
5770 procseg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
5771 return (readlinkat_internal(vfs_context_current(), uap->fd, uap->path,
5772 procseg, uap->buf, uap->bufsize, procseg, retval));
5773 }
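
/*
 * Illustrative userspace sketch (not part of this file): reading a symlink
 * target with readlink().  The path is hypothetical.  As the uio handling
 * above implies, the returned buffer is not NUL-terminated, so the caller
 * must terminate it using the returned length.
 *
 *	#include <unistd.h>
 *	#include <limits.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		char buf[PATH_MAX];
 *		ssize_t n = readlink("/tmp/config.plist", buf, sizeof(buf) - 1);
 *
 *		if (n == -1) {
 *			perror("readlink");	// e.g. EINVAL if not a symlink
 *			return 1;
 *		}
 *		buf[n] = '\0';
 *		printf("-> %s\n", buf);
 *		return 0;
 *	}
 */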
5774
5775 /*
5776 * Change file flags.
5777 *
5778 * NOTE: this will vnode_put() `vp'
5779 */
5780 static int
5781 chflags1(vnode_t vp, int flags, vfs_context_t ctx)
5782 {
5783 struct vnode_attr va;
5784 kauth_action_t action;
5785 int error;
5786
5787 VATTR_INIT(&va);
5788 VATTR_SET(&va, va_flags, flags);
5789
5790 #if CONFIG_MACF
5791 error = mac_vnode_check_setflags(ctx, vp, flags);
5792 if (error)
5793 goto out;
5794 #endif
5795
5796 /* request authorisation, disregard immutability */
5797 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
5798 goto out;
5799 /*
5800 * Request that the auth layer disregard those file flags it's allowed to when
5801 * authorizing this operation; we need to do this in order to be able to
5802 * clear immutable flags.
5803 */
5804 if (action && ((error = vnode_authorize(vp, NULL, action | KAUTH_VNODE_NOIMMUTABLE, ctx)) != 0))
5805 goto out;
5806 error = vnode_setattr(vp, &va, ctx);
5807
5808 #if CONFIG_MACF
5809 if (error == 0)
5810 mac_vnode_notify_setflags(ctx, vp, flags);
5811 #endif
5812
5813 if ((error == 0) && !VATTR_IS_SUPPORTED(&va, va_flags)) {
5814 error = ENOTSUP;
5815 }
5816 out:
5817 vnode_put(vp);
5818 return(error);
5819 }
5820
5821 /*
5822 * Change flags of a file given a path name.
5823 */
5824 /* ARGSUSED */
5825 int
5826 chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval)
5827 {
5828 vnode_t vp;
5829 vfs_context_t ctx = vfs_context_current();
5830 int error;
5831 struct nameidata nd;
5832
5833 AUDIT_ARG(fflags, uap->flags);
5834 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
5835 UIO_USERSPACE, uap->path, ctx);
5836 error = namei(&nd);
5837 if (error)
5838 return (error);
5839 vp = nd.ni_vp;
5840 nameidone(&nd);
5841
5842 /* we don't vnode_put() here because chflags1 does internally */
5843 error = chflags1(vp, uap->flags, ctx);
5844
5845 return(error);
5846 }
5847
5848 /*
5849 * Change flags of a file given a file descriptor.
5850 */
5851 /* ARGSUSED */
5852 int
5853 fchflags(__unused proc_t p, struct fchflags_args *uap, __unused int32_t *retval)
5854 {
5855 vnode_t vp;
5856 int error;
5857
5858 AUDIT_ARG(fd, uap->fd);
5859 AUDIT_ARG(fflags, uap->flags);
5860 if ( (error = file_vnode(uap->fd, &vp)) )
5861 return (error);
5862
5863 if ((error = vnode_getwithref(vp))) {
5864 file_drop(uap->fd);
5865 return(error);
5866 }
5867
5868 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
5869
5870 /* we don't vnode_put() here because chflags1 does internally */
5871 error = chflags1(vp, uap->flags, vfs_context_current());
5872
5873 file_drop(uap->fd);
5874 return (error);
5875 }
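
/*
 * Illustrative userspace sketch (not part of this file): changing BSD file
 * flags with chflags().  The path is hypothetical.  Clearing a flag goes
 * through the same vnode_authattr()/vnode_setattr() path as setting one,
 * with KAUTH_VNODE_NOIMMUTABLE allowing immutable bits to be cleared, as
 * noted above.
 *
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (chflags("notes.txt", UF_HIDDEN) == -1) {	// hide from Finder
 *			perror("chflags set");
 *			return 1;
 *		}
 *		if (chflags("notes.txt", 0) == -1) {		// clear the user flags again
 *			perror("chflags clear");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */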
5876
5877 /*
5878 * Change security information on a filesystem object.
5879 *
5880 * Returns: 0 Success
5881 * EPERM Operation not permitted
5882 * vnode_authattr:??? [anything vnode_authattr can return]
5883 * vnode_authorize:??? [anything vnode_authorize can return]
5884 * vnode_setattr:??? [anything vnode_setattr can return]
5885 *
5886 * Notes: If vnode_authattr or vnode_authorize return EACCES, it will be
5887 * translated to EPERM before being returned.
5888 */
5889 static int
5890 chmod_vnode(vfs_context_t ctx, vnode_t vp, struct vnode_attr *vap)
5891 {
5892 kauth_action_t action;
5893 int error;
5894
5895 AUDIT_ARG(mode, vap->va_mode);
5896 /* XXX audit new args */
5897
5898 #if NAMEDSTREAMS
5899 /* chmod calls are not allowed for resource forks. */
5900 if (vp->v_flag & VISNAMEDSTREAM) {
5901 return (EPERM);
5902 }
5903 #endif
5904
5905 #if CONFIG_MACF
5906 if (VATTR_IS_ACTIVE(vap, va_mode) &&
5907 (error = mac_vnode_check_setmode(ctx, vp, (mode_t)vap->va_mode)) != 0)
5908 return (error);
5909
5910 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
5911 if ((error = mac_vnode_check_setowner(ctx, vp,
5912 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
5913 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1)))
5914 return (error);
5915 }
5916
5917 if (VATTR_IS_ACTIVE(vap, va_acl) &&
5918 (error = mac_vnode_check_setacl(ctx, vp, vap->va_acl)))
5919 return (error);
5920 #endif
5921
5922 /* make sure that the caller is allowed to set this security information */
5923 if (((error = vnode_authattr(vp, vap, &action, ctx)) != 0) ||
5924 ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
5925 if (error == EACCES)
5926 error = EPERM;
5927 return(error);
5928 }
5929
5930 if ((error = vnode_setattr(vp, vap, ctx)) != 0)
5931 return (error);
5932
5933 #if CONFIG_MACF
5934 if (VATTR_IS_ACTIVE(vap, va_mode))
5935 mac_vnode_notify_setmode(ctx, vp, (mode_t)vap->va_mode);
5936
5937 if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))
5938 mac_vnode_notify_setowner(ctx, vp,
5939 VATTR_IS_ACTIVE(vap, va_uid) ? vap->va_uid : -1,
5940 VATTR_IS_ACTIVE(vap, va_gid) ? vap->va_gid : -1);
5941
5942 if (VATTR_IS_ACTIVE(vap, va_acl))
5943 mac_vnode_notify_setacl(ctx, vp, vap->va_acl);
5944 #endif
5945
5946 return (error);
5947 }
5948
5949
5950 /*
5951 * Change mode of a file given a path name.
5952 *
5953 * Returns: 0 Success
5954 * namei:??? [anything namei can return]
5955 * chmod_vnode:??? [anything chmod_vnode can return]
5956 */
5957 static int
5958 chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap,
5959 int fd, int flag, enum uio_seg segflg)
5960 {
5961 struct nameidata nd;
5962 int follow, error;
5963
5964 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
5965 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1,
5966 segflg, path, ctx);
5967 if ((error = nameiat(&nd, fd)))
5968 return (error);
5969 error = chmod_vnode(ctx, nd.ni_vp, vap);
5970 vnode_put(nd.ni_vp);
5971 nameidone(&nd);
5972 return(error);
5973 }
5974
5975 /*
5976 * chmod_extended: Change the mode of a file given a path name; with extended
5977 * argument list (including extended security (ACL)).
5978 *
5979 * Parameters: p Process requesting the mode change
5980 * uap User argument descriptor (see below)
5981 * retval (ignored)
5982 *
5983 * Indirect: uap->path Path to object (same as 'chmod')
5984 * uap->uid UID to set
5985 * uap->gid GID to set
5986 * uap->mode File mode to set (same as 'chmod')
5987 * uap->xsecurity ACL to set (or delete)
5988 *
5989 * Returns: 0 Success
5990 * !0 errno value
5991 *
5992 * Notes: The kauth_filesec_t in 'va', if any, is in host byte order.
5993 *
5994 * XXX: We should enumerate the possible errno values here, and where
5995 * in the code they originated.
5996 */
5997 int
5998 chmod_extended(__unused proc_t p, struct chmod_extended_args *uap, __unused int32_t *retval)
5999 {
6000 int error;
6001 struct vnode_attr va;
6002 kauth_filesec_t xsecdst;
6003
6004 AUDIT_ARG(owner, uap->uid, uap->gid);
6005
6006 VATTR_INIT(&va);
6007 if (uap->mode != -1)
6008 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6009 if (uap->uid != KAUTH_UID_NONE)
6010 VATTR_SET(&va, va_uid, uap->uid);
6011 if (uap->gid != KAUTH_GID_NONE)
6012 VATTR_SET(&va, va_gid, uap->gid);
6013
6014 xsecdst = NULL;
6015 switch(uap->xsecurity) {
6016 /* explicit remove request */
6017 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6018 VATTR_SET(&va, va_acl, NULL);
6019 break;
6020 /* not being set */
6021 case USER_ADDR_NULL:
6022 break;
6023 default:
6024 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6025 return(error);
6026 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6027 KAUTH_DEBUG("CHMOD - setting ACL with %d entries", va.va_acl->acl_entrycount);
6028 }
6029
6030 error = chmodat(vfs_context_current(), uap->path, &va, AT_FDCWD, 0,
6031 UIO_USERSPACE);
6032
6033 if (xsecdst != NULL)
6034 kauth_filesec_free(xsecdst);
6035 return(error);
6036 }
6037
6038 /*
6039 * Returns: 0 Success
6040 * chmodat:??? [anything chmodat can return]
6041 */
6042 static int
6043 fchmodat_internal(vfs_context_t ctx, user_addr_t path, int mode, int fd,
6044 int flag, enum uio_seg segflg)
6045 {
6046 struct vnode_attr va;
6047
6048 VATTR_INIT(&va);
6049 VATTR_SET(&va, va_mode, mode & ALLPERMS);
6050
6051 return (chmodat(ctx, path, &va, fd, flag, segflg));
6052 }
6053
6054 int
6055 chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval)
6056 {
6057 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6058 AT_FDCWD, 0, UIO_USERSPACE));
6059 }
6060
6061 int
6062 fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval)
6063 {
6064 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6065 return (EINVAL);
6066
6067 return (fchmodat_internal(vfs_context_current(), uap->path, uap->mode,
6068 uap->fd, uap->flag, UIO_USERSPACE));
6069 }
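
/*
 * Illustrative userspace sketch (not part of this file): changing a file
 * mode with fchmodat().  The path is hypothetical.  A flag of 0 follows
 * symlinks, as the AT_SYMLINK_NOFOLLOW handling in chmodat() above shows;
 * passing AT_SYMLINK_NOFOLLOW may fail on filesystems that cannot change the
 * mode of a symlink itself.
 *
 *	#include <sys/stat.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (fchmodat(AT_FDCWD, "deploy.sh", 0755, 0) == -1) {
 *			perror("fchmodat");	// EPERM if not permitted, per chmod_vnode()
 *			return 1;
 *		}
 *		return 0;
 *	}
 */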
6070
6071 /*
6072 * Change mode of a file given a file descriptor.
6073 */
6074 static int
6075 fchmod1(__unused proc_t p, int fd, struct vnode_attr *vap)
6076 {
6077 vnode_t vp;
6078 int error;
6079
6080 AUDIT_ARG(fd, fd);
6081
6082 if ((error = file_vnode(fd, &vp)) != 0)
6083 return (error);
6084 if ((error = vnode_getwithref(vp)) != 0) {
6085 file_drop(fd);
6086 return(error);
6087 }
6088 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6089
6090 error = chmod_vnode(vfs_context_current(), vp, vap);
6091 (void)vnode_put(vp);
6092 file_drop(fd);
6093
6094 return (error);
6095 }
6096
6097 /*
6098 * fchmod_extended: Change mode of a file given a file descriptor; with
6099 * extended argument list (including extended security (ACL)).
6100 *
6101 * Parameters: p Process requesting to change file mode
6102 * uap User argument descriptor (see below)
6103 * retval (ignored)
6104 *
6105 * Indirect: uap->mode File mode to set (same as 'chmod')
6106 * uap->uid UID to set
6107 * uap->gid GID to set
6108 * uap->xsecurity ACL to set (or delete)
6109 * uap->fd File descriptor of file to change mode
6110 *
6111 * Returns: 0 Success
6112 * !0 errno value
6113 *
6114 */
6115 int
6116 fchmod_extended(proc_t p, struct fchmod_extended_args *uap, __unused int32_t *retval)
6117 {
6118 int error;
6119 struct vnode_attr va;
6120 kauth_filesec_t xsecdst;
6121
6122 AUDIT_ARG(owner, uap->uid, uap->gid);
6123
6124 VATTR_INIT(&va);
6125 if (uap->mode != -1)
6126 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6127 if (uap->uid != KAUTH_UID_NONE)
6128 VATTR_SET(&va, va_uid, uap->uid);
6129 if (uap->gid != KAUTH_GID_NONE)
6130 VATTR_SET(&va, va_gid, uap->gid);
6131
6132 xsecdst = NULL;
6133 switch(uap->xsecurity) {
6134 case USER_ADDR_NULL:
6135 VATTR_SET(&va, va_acl, NULL);
6136 break;
6137 case CAST_USER_ADDR_T((void *)1): /* _FILESEC_REMOVE_ACL */
6138 VATTR_SET(&va, va_acl, NULL);
6139 break;
6140 /* not being set */
6141 case CAST_USER_ADDR_T(-1):
6142 break;
6143 default:
6144 if ((error = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
6145 return(error);
6146 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
6147 }
6148
6149 error = fchmod1(p, uap->fd, &va);
6150
6151
6152 switch(uap->xsecurity) {
6153 case USER_ADDR_NULL:
6154 case CAST_USER_ADDR_T(-1):
6155 break;
6156 default:
6157 if (xsecdst != NULL)
6158 kauth_filesec_free(xsecdst);
6159 }
6160 return(error);
6161 }
6162
6163 int
6164 fchmod(proc_t p, struct fchmod_args *uap, __unused int32_t *retval)
6165 {
6166 struct vnode_attr va;
6167
6168 VATTR_INIT(&va);
6169 VATTR_SET(&va, va_mode, uap->mode & ALLPERMS);
6170
6171 return(fchmod1(p, uap->fd, &va));
6172 }
6173
6174
6175 /*
6176 * Set ownership given a path name.
6177 */
6178 /* ARGSUSED */
6179 static int
6180 fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid,
6181 gid_t gid, int flag, enum uio_seg segflg)
6182 {
6183 vnode_t vp;
6184 struct vnode_attr va;
6185 int error;
6186 struct nameidata nd;
6187 int follow;
6188 kauth_action_t action;
6189
6190 AUDIT_ARG(owner, uid, gid);
6191
6192 follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6193 NDINIT(&nd, LOOKUP, OP_SETATTR, follow | AUDITVNPATH1, segflg,
6194 path, ctx);
6195 error = nameiat(&nd, fd);
6196 if (error)
6197 return (error);
6198 vp = nd.ni_vp;
6199
6200 nameidone(&nd);
6201
6202 VATTR_INIT(&va);
6203 if (uid != (uid_t)VNOVAL)
6204 VATTR_SET(&va, va_uid, uid);
6205 if (gid != (gid_t)VNOVAL)
6206 VATTR_SET(&va, va_gid, gid);
6207
6208 #if CONFIG_MACF
6209 error = mac_vnode_check_setowner(ctx, vp, uid, gid);
6210 if (error)
6211 goto out;
6212 #endif
6213
6214 /* preflight and authorize attribute changes */
6215 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6216 goto out;
6217 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6218 goto out;
6219 error = vnode_setattr(vp, &va, ctx);
6220
6221 #if CONFIG_MACF
6222 if (error == 0)
6223 mac_vnode_notify_setowner(ctx, vp, uid, gid);
6224 #endif
6225
6226 out:
6227 /*
6228 * EACCES is only allowed from namei(); permissions failure should
6229 * return EPERM, so we need to translate the error code.
6230 */
6231 if (error == EACCES)
6232 error = EPERM;
6233
6234 vnode_put(vp);
6235 return (error);
6236 }
6237
6238 int
6239 chown(__unused proc_t p, struct chown_args *uap, __unused int32_t *retval)
6240 {
6241 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6242 uap->uid, uap->gid, 0, UIO_USERSPACE));
6243 }
6244
6245 int
6246 lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval)
6247 {
6248 return (fchownat_internal(vfs_context_current(), AT_FDCWD, uap->path,
6249 uap->owner, uap->group, AT_SYMLINK_NOFOLLOW, UIO_USERSPACE));
6250 }
6251
6252 int
6253 fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval)
6254 {
6255 if (uap->flag & ~AT_SYMLINK_NOFOLLOW)
6256 return (EINVAL);
6257
6258 return (fchownat_internal(vfs_context_current(), uap->fd, uap->path,
6259 uap->uid, uap->gid, uap->flag, UIO_USERSPACE));
6260 }
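
/*
 * Illustrative userspace usage (not part of this file): the only flag
 * accepted above is AT_SYMLINK_NOFOLLOW, which selects whether a trailing
 * symlink itself or its target is affected. Path, uid and gid below are
 * hypothetical; error handling elided.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	// change the link target (same effect as chown("alias", ...))
 *	(void)fchownat(AT_FDCWD, "alias", 501, 20, 0);
 *	// change the symlink itself (same effect as lchown("alias", ...))
 *	(void)fchownat(AT_FDCWD, "alias", 501, 20, AT_SYMLINK_NOFOLLOW);
 */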
6261
6262 /*
6263 * Set ownership given a file descriptor.
6264 */
6265 /* ARGSUSED */
6266 int
6267 fchown(__unused proc_t p, struct fchown_args *uap, __unused int32_t *retval)
6268 {
6269 struct vnode_attr va;
6270 vfs_context_t ctx = vfs_context_current();
6271 vnode_t vp;
6272 int error;
6273 kauth_action_t action;
6274
6275 AUDIT_ARG(owner, uap->uid, uap->gid);
6276 AUDIT_ARG(fd, uap->fd);
6277
6278 if ( (error = file_vnode(uap->fd, &vp)) )
6279 return (error);
6280
6281 if ( (error = vnode_getwithref(vp)) ) {
6282 file_drop(uap->fd);
6283 return(error);
6284 }
6285 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6286
6287 VATTR_INIT(&va);
6288 if (uap->uid != VNOVAL)
6289 VATTR_SET(&va, va_uid, uap->uid);
6290 if (uap->gid != VNOVAL)
6291 VATTR_SET(&va, va_gid, uap->gid);
6292
6293 #if NAMEDSTREAMS
6294 /* chown calls are not allowed for resource forks. */
6295 if (vp->v_flag & VISNAMEDSTREAM) {
6296 error = EPERM;
6297 goto out;
6298 }
6299 #endif
6300
6301 #if CONFIG_MACF
6302 error = mac_vnode_check_setowner(ctx, vp, uap->uid, uap->gid);
6303 if (error)
6304 goto out;
6305 #endif
6306
6307 /* preflight and authorize attribute changes */
6308 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6309 goto out;
6310 if (action && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6311 if (error == EACCES)
6312 error = EPERM;
6313 goto out;
6314 }
6315 error = vnode_setattr(vp, &va, ctx);
6316
6317 #if CONFIG_MACF
6318 if (error == 0)
6319 mac_vnode_notify_setowner(ctx, vp, uap->uid, uap->gid);
6320 #endif
6321
6322 out:
6323 (void)vnode_put(vp);
6324 file_drop(uap->fd);
6325 return (error);
6326 }
6327
6328 static int
6329 getutimes(user_addr_t usrtvp, struct timespec *tsp)
6330 {
6331 int error;
6332
6333 if (usrtvp == USER_ADDR_NULL) {
6334 struct timeval old_tv;
6335 /* XXX Y2038 bug because of microtime argument */
6336 microtime(&old_tv);
6337 TIMEVAL_TO_TIMESPEC(&old_tv, &tsp[0]);
6338 tsp[1] = tsp[0];
6339 } else {
6340 if (IS_64BIT_PROCESS(current_proc())) {
6341 struct user64_timeval tv[2];
6342 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6343 if (error)
6344 return (error);
6345 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6346 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6347 } else {
6348 struct user32_timeval tv[2];
6349 error = copyin(usrtvp, (void *)tv, sizeof(tv));
6350 if (error)
6351 return (error);
6352 TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
6353 TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]);
6354 }
6355 }
6356 return 0;
6357 }
6358
6359 static int
6360 setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts,
6361 int nullflag)
6362 {
6363 int error;
6364 struct vnode_attr va;
6365 kauth_action_t action;
6366
6367 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6368
6369 VATTR_INIT(&va);
6370 VATTR_SET(&va, va_access_time, ts[0]);
6371 VATTR_SET(&va, va_modify_time, ts[1]);
6372 if (nullflag)
6373 va.va_vaflags |= VA_UTIMES_NULL;
6374
6375 #if NAMEDSTREAMS
6376 /* utimes calls are not allowed for resource forks. */
6377 if (vp->v_flag & VISNAMEDSTREAM) {
6378 error = EPERM;
6379 goto out;
6380 }
6381 #endif
6382
6383 #if CONFIG_MACF
6384 error = mac_vnode_check_setutimes(ctx, vp, ts[0], ts[1]);
6385 if (error)
6386 goto out;
6387 #endif
6388 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0) {
6389 if (!nullflag && error == EACCES)
6390 error = EPERM;
6391 goto out;
6392 }
6393
6394 /* since we may not need to auth anything, check here */
6395 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)) {
6396 if (!nullflag && error == EACCES)
6397 error = EPERM;
6398 goto out;
6399 }
6400 error = vnode_setattr(vp, &va, ctx);
6401
6402 #if CONFIG_MACF
6403 if (error == 0)
6404 mac_vnode_notify_setutimes(ctx, vp, ts[0], ts[1]);
6405 #endif
6406
6407 out:
6408 return error;
6409 }
6410
6411 /*
6412 * Set the access and modification times of a file.
6413 */
6414 /* ARGSUSED */
6415 int
6416 utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval)
6417 {
6418 struct timespec ts[2];
6419 user_addr_t usrtvp;
6420 int error;
6421 struct nameidata nd;
6422 vfs_context_t ctx = vfs_context_current();
6423
6424 /*
6425 * AUDIT: Needed to change the order of operations to do the
6426 * name lookup first because auditing wants the path.
6427 */
6428 NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1,
6429 UIO_USERSPACE, uap->path, ctx);
6430 error = namei(&nd);
6431 if (error)
6432 return (error);
6433 nameidone(&nd);
6434
6435 /*
6436 * Fetch the user-supplied time. If usrtvp is USER_ADDR_NULL, we fetch
6437 * the current time instead.
6438 */
6439 usrtvp = uap->tptr;
6440 if ((error = getutimes(usrtvp, ts)) != 0)
6441 goto out;
6442
6443 error = setutimes(ctx, nd.ni_vp, ts, usrtvp == USER_ADDR_NULL);
6444
6445 out:
6446 vnode_put(nd.ni_vp);
6447 return (error);
6448 }
6449
6450 /*
6451 * Set the access and modification times of a file.
6452 */
6453 /* ARGSUSED */
6454 int
6455 futimes(__unused proc_t p, struct futimes_args *uap, __unused int32_t *retval)
6456 {
6457 struct timespec ts[2];
6458 vnode_t vp;
6459 user_addr_t usrtvp;
6460 int error;
6461
6462 AUDIT_ARG(fd, uap->fd);
6463 usrtvp = uap->tptr;
6464 if ((error = getutimes(usrtvp, ts)) != 0)
6465 return (error);
6466 if ((error = file_vnode(uap->fd, &vp)) != 0)
6467 return (error);
6468 if((error = vnode_getwithref(vp))) {
6469 file_drop(uap->fd);
6470 return(error);
6471 }
6472
6473 error = setutimes(vfs_context_current(), vp, ts, usrtvp == 0);
6474 vnode_put(vp);
6475 file_drop(uap->fd);
6476 return(error);
6477 }
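
/*
 * Illustrative userspace usage (not part of this file): getutimes() above
 * treats a NULL times pointer as "use the current time" and setutimes() then
 * sets VA_UTIMES_NULL, under which write access generally suffices; explicit
 * times generally require ownership (hence the EACCES-to-EPERM translation
 * above). A minimal sketch with a hypothetical path; error handling elided.
 *
 *	#include <sys/time.h>
 *
 *	(void)utimes("notes.txt", NULL);		// atime = mtime = now
 *
 *	struct timeval tv[2] = {
 *		{ .tv_sec = 1000000000, .tv_usec = 0 },	// access time
 *		{ .tv_sec = 1000000000, .tv_usec = 0 },	// modification time
 *	};
 *	(void)utimes("notes.txt", tv);			// explicit times
 */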
6478
6479 /*
6480 * Truncate a file given its path name.
6481 */
6482 /* ARGSUSED */
6483 int
6484 truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval)
6485 {
6486 vnode_t vp;
6487 struct vnode_attr va;
6488 vfs_context_t ctx = vfs_context_current();
6489 int error;
6490 struct nameidata nd;
6491 kauth_action_t action;
6492
6493 if (uap->length < 0)
6494 return(EINVAL);
6495 NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1,
6496 UIO_USERSPACE, uap->path, ctx);
6497 if ((error = namei(&nd)))
6498 return (error);
6499 vp = nd.ni_vp;
6500
6501 nameidone(&nd);
6502
6503 VATTR_INIT(&va);
6504 VATTR_SET(&va, va_data_size, uap->length);
6505
6506 #if CONFIG_MACF
6507 error = mac_vnode_check_truncate(ctx, NOCRED, vp);
6508 if (error)
6509 goto out;
6510 #endif
6511
6512 if ((error = vnode_authattr(vp, &va, &action, ctx)) != 0)
6513 goto out;
6514 if ((action != 0) && ((error = vnode_authorize(vp, NULL, action, ctx)) != 0))
6515 goto out;
6516 error = vnode_setattr(vp, &va, ctx);
6517
6518 #if CONFIG_MACF
6519 if (error == 0)
6520 mac_vnode_notify_truncate(ctx, NOCRED, vp);
6521 #endif
6522
6523 out:
6524 vnode_put(vp);
6525 return (error);
6526 }
6527
6528 /*
6529 * Truncate a file given a file descriptor.
6530 */
6531 /* ARGSUSED */
6532 int
6533 ftruncate(proc_t p, struct ftruncate_args *uap, int32_t *retval)
6534 {
6535 vfs_context_t ctx = vfs_context_current();
6536 struct vnode_attr va;
6537 vnode_t vp;
6538 struct fileproc *fp;
6539 int error;
6540 int fd = uap->fd;
6541
6542 AUDIT_ARG(fd, uap->fd);
6543 if (uap->length < 0)
6544 return(EINVAL);
6545
6546 if ( (error = fp_lookup(p,fd,&fp,0)) ) {
6547 return(error);
6548 }
6549
6550 switch (FILEGLOB_DTYPE(fp->f_fglob)) {
6551 case DTYPE_PSXSHM:
6552 error = pshm_truncate(p, fp, uap->fd, uap->length, retval);
6553 goto out;
6554 case DTYPE_VNODE:
6555 break;
6556 default:
6557 error = EINVAL;
6558 goto out;
6559 }
6560
6561 vp = (vnode_t)fp->f_fglob->fg_data;
6562
6563 if ((fp->f_fglob->fg_flag & FWRITE) == 0) {
6564 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
6565 error = EINVAL;
6566 goto out;
6567 }
6568
6569 if ((error = vnode_getwithref(vp)) != 0) {
6570 goto out;
6571 }
6572
6573 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6574
6575 #if CONFIG_MACF
6576 error = mac_vnode_check_truncate(ctx,
6577 fp->f_fglob->fg_cred, vp);
6578 if (error) {
6579 (void)vnode_put(vp);
6580 goto out;
6581 }
6582 #endif
6583 VATTR_INIT(&va);
6584 VATTR_SET(&va, va_data_size, uap->length);
6585 error = vnode_setattr(vp, &va, ctx);
6586
6587 #if CONFIG_MACF
6588 if (error == 0)
6589 mac_vnode_notify_truncate(ctx, fp->f_fglob->fg_cred, vp);
6590 #endif
6591
6592 (void)vnode_put(vp);
6593 out:
6594 file_drop(fd);
6595 return (error);
6596 }
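
/*
 * Illustrative userspace usage (not part of this file): ftruncate() requires
 * a descriptor opened for writing (the FWRITE check above), while truncate()
 * authorizes against the path. A minimal sketch with a hypothetical file;
 * error handling elided.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("scratch.bin", O_RDWR | O_CREAT, 0644);
 *	(void)ftruncate(fd, 4096);		// grow or shrink to 4 KiB
 *	(void)truncate("scratch.bin", 0);	// empty it by path
 */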
6597
6598
6599 /*
6600 * Sync an open file with synchronized I/O _file_ integrity completion
6601 */
6602 /* ARGSUSED */
6603 int
6604 fsync(proc_t p, struct fsync_args *uap, __unused int32_t *retval)
6605 {
6606 __pthread_testcancel(1);
6607 return(fsync_common(p, uap, MNT_WAIT));
6608 }
6609
6610
6611 /*
6612 * Sync an open file with synchronized I/O _file_ integrity completion
6613 *
6614 * Notes: This is a legacy support function that does not test for
6615 * thread cancellation points.
6616 */
6617 /* ARGSUSED */
6618 int
6619 fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused int32_t *retval)
6620 {
6621 return(fsync_common(p, (struct fsync_args *)uap, MNT_WAIT));
6622 }
6623
6624
6625 /*
6626 * Sync an open file with synchronized I/O _data_ integrity completion
6627 */
6628 /* ARGSUSED */
6629 int
6630 fdatasync(proc_t p, struct fdatasync_args *uap, __unused int32_t *retval)
6631 {
6632 __pthread_testcancel(1);
6633 return(fsync_common(p, (struct fsync_args *)uap, MNT_DWAIT));
6634 }
6635
6636
6637 /*
6638 * fsync_common
6639 *
6640 * Common fsync code to support both synchronized I/O file integrity completion
6641 * (normal fsync) and synchronized I/O data integrity completion (fdatasync).
6642 *
6643 * If 'flags' is MNT_DWAIT, the caller is requesting data integrity, which
6644 * will only guarantee that the file data contents are retrievable. If
6645 * 'flags' is MNT_WAIT, the caller is requesting file integrity, which
6646 * additionally requires that metadata unnecessary for retrieving the file
6647 * data contents, such as atime, mtime, ctime, etc., also be committed to
6648 * stable storage.
6649 *
6650 * Parameters: p The process
6651 * uap->fd The descriptor to synchronize
6652 * flags The data integrity flags
6653 *
6654 * Returns: int Success
6655 * fp_getfvp:EBADF Bad file descriptor
6656 * fp_getfvp:ENOTSUP fd does not refer to a vnode
6657 * VNOP_FSYNC:??? unspecified
6658 *
6659 * Notes: We use struct fsync_args because it is a short name, and all
6660 * caller argument structures are otherwise identical.
6661 */
6662 static int
6663 fsync_common(proc_t p, struct fsync_args *uap, int flags)
6664 {
6665 vnode_t vp;
6666 struct fileproc *fp;
6667 vfs_context_t ctx = vfs_context_current();
6668 int error;
6669
6670 AUDIT_ARG(fd, uap->fd);
6671
6672 if ( (error = fp_getfvp(p, uap->fd, &fp, &vp)) )
6673 return (error);
6674 if ( (error = vnode_getwithref(vp)) ) {
6675 file_drop(uap->fd);
6676 return(error);
6677 }
6678
6679 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
6680
6681 error = VNOP_FSYNC(vp, flags, ctx);
6682
6683 #if NAMEDRSRCFORK
6684 /* Sync resource fork shadow file if necessary. */
6685 if ((error == 0) &&
6686 (vp->v_flag & VISNAMEDSTREAM) &&
6687 (vp->v_parent != NULLVP) &&
6688 vnode_isshadow(vp) &&
6689 (fp->f_flags & FP_WRITTEN)) {
6690 (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
6691 }
6692 #endif
6693
6694 (void)vnode_put(vp);
6695 file_drop(uap->fd);
6696 return (error);
6697 }
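
/*
 * Illustrative userspace usage (not part of this file): fdatasync() reaches
 * here with MNT_DWAIT (data integrity only) and fsync() with MNT_WAIT (data
 * plus metadata such as timestamps). A minimal sketch, assuming fd is an open
 * descriptor and buf/len describe data already prepared by the caller; error
 * handling elided. The final fcntl(F_FULLFSYNC) is the macOS-specific request
 * to also flush the drive's own write cache.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	(void)write(fd, buf, len);
 *	(void)fdatasync(fd);		// file contents retrievable after a crash
 *	(void)fsync(fd);		// contents plus file metadata
 *	(void)fcntl(fd, F_FULLFSYNC);	// push past the device write cache
 */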
6698
6699 /*
6700 * Duplicate files. Source must be a file, target must be a file or
6701 * must not exist.
6702 *
6703 * XXX Copyfile authorisation checking is woefully inadequate, and will not
6704 * perform inheritance correctly.
6705 */
6706 /* ARGSUSED */
6707 int
6708 copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval)
6709 {
6710 vnode_t tvp, fvp, tdvp, sdvp;
6711 struct nameidata fromnd, tond;
6712 int error;
6713 vfs_context_t ctx = vfs_context_current();
6714 #if CONFIG_MACF
6715 struct filedesc *fdp = (vfs_context_proc(ctx))->p_fd;
6716 struct vnode_attr va;
6717 #endif
6718
6719 /* Check that the flags are valid. */
6720
6721 if (uap->flags & ~CPF_MASK) {
6722 return(EINVAL);
6723 }
6724
6725 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, AUDITVNPATH1,
6726 UIO_USERSPACE, uap->from, ctx);
6727 if ((error = namei(&fromnd)))
6728 return (error);
6729 fvp = fromnd.ni_vp;
6730
6731 NDINIT(&tond, CREATE, OP_LINK,
6732 LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK,
6733 UIO_USERSPACE, uap->to, ctx);
6734 if ((error = namei(&tond))) {
6735 goto out1;
6736 }
6737 tdvp = tond.ni_dvp;
6738 tvp = tond.ni_vp;
6739
6740 if (tvp != NULL) {
6741 if (!(uap->flags & CPF_OVERWRITE)) {
6742 error = EEXIST;
6743 goto out;
6744 }
6745 }
6746
6747 if (fvp->v_type == VDIR || (tvp && tvp->v_type == VDIR)) {
6748 error = EISDIR;
6749 goto out;
6750 }
6751
6752 /* This calls existing MAC hooks for open */
6753 if ((error = vn_authorize_open_existing(fvp, &fromnd.ni_cnd, FREAD, ctx,
6754 NULL))) {
6755 goto out;
6756 }
6757
6758 if (tvp) {
6759 /*
6760 * See unlinkat_internal for an explanation of the potential
6761 * ENOENT from the MAC hook, but the gist is that the MAC hook
6762 * can fail because vn_getpath isn't able to return the full
6763 * path. We choose to ignore this failure.
6764 */
6765 error = vn_authorize_unlink(tdvp, tvp, &tond.ni_cnd, ctx, NULL);
6766 if (error && error != ENOENT)
6767 goto out;
6768 error = 0;
6769 }
6770
6771 #if CONFIG_MACF
6772 VATTR_INIT(&va);
6773 VATTR_SET(&va, va_type, fvp->v_type);
6774 /* Mask off all but regular access permissions */
6775 VATTR_SET(&va, va_mode,
6776 ((((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT) & ACCESSPERMS));
6777 error = mac_vnode_check_create(ctx, tdvp, &tond.ni_cnd, &va);
6778 if (error)
6779 goto out;
6780 #endif /* CONFIG_MACF */
6781
6782 if ((error = vnode_authorize(tdvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0)
6783 goto out;
6784
6785 if (fvp == tdvp)
6786 error = EINVAL;
6787 /*
6788 * If source is the same as the destination (that is the
6789 * same inode number) then there is nothing to do.
6790 * (fixed to have POSIX semantics - CSM 3/2/98)
6791 */
6792 if (fvp == tvp)
6793 error = -1;
6794 if (!error)
6795 error = VNOP_COPYFILE(fvp, tdvp, tvp, &tond.ni_cnd, uap->mode, uap->flags, ctx);
6796 out:
6797 sdvp = tond.ni_startdir;
6798 /*
6799 * nameidone has to happen before we vnode_put(tdvp)
6800 * since it may need to release the fs_nodelock on the tdvp
6801 */
6802 nameidone(&tond);
6803
6804 if (tvp)
6805 vnode_put(tvp);
6806 vnode_put(tdvp);
6807 vnode_put(sdvp);
6808 out1:
6809 vnode_put(fvp);
6810
6811 nameidone(&fromnd);
6812
6813 if (error == -1)
6814 return (0);
6815 return (error);
6816 }
6817
6818 #define CLONE_SNAPSHOT_FALLBACKS_ENABLED 1
6819
6820 /*
6821 * Helper function for doing clones. The caller is expected to provide an
6822 * iocounted source vnode and release it.
6823 */
6824 static int
6825 clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd,
6826 user_addr_t dst, uint32_t flags, vfs_context_t ctx)
6827 {
6828 vnode_t tvp, tdvp;
6829 struct nameidata tond;
6830 int error;
6831 int follow;
6832 boolean_t free_src_acl;
6833 boolean_t attr_cleanup;
6834 enum vtype v_type;
6835 kauth_action_t action;
6836 struct componentname *cnp;
6837 uint32_t defaulted;
6838 struct vnode_attr va;
6839 struct vnode_attr nva;
6840
6841 v_type = vnode_vtype(fvp);
6842 switch (v_type) {
6843 case VLNK:
6844 /* FALLTHRU */
6845 case VREG:
6846 action = KAUTH_VNODE_ADD_FILE;
6847 break;
6848 case VDIR:
6849 if (vnode_isvroot(fvp) || vnode_ismount(fvp) ||
6850 fvp->v_mountedhere) {
6851 return (EINVAL);
6852 }
6853 action = KAUTH_VNODE_ADD_SUBDIRECTORY;
6854 break;
6855 default:
6856 return (EINVAL);
6857 }
6858
6859 AUDIT_ARG(fd2, dst_dirfd);
6860 AUDIT_ARG(value32, flags);
6861
6862 follow = (flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
6863 NDINIT(&tond, CREATE, OP_LINK, follow | WANTPARENT | AUDITVNPATH2,
6864 UIO_USERSPACE, dst, ctx);
6865 if ((error = nameiat(&tond, dst_dirfd)))
6866 return (error);
6867 cnp = &tond.ni_cnd;
6868 tdvp = tond.ni_dvp;
6869 tvp = tond.ni_vp;
6870
6871 free_src_acl = FALSE;
6872 attr_cleanup = FALSE;
6873
6874 if (tvp != NULL) {
6875 error = EEXIST;
6876 goto out;
6877 }
6878
6879 if (vnode_mount(tdvp) != vnode_mount(fvp)) {
6880 error = EXDEV;
6881 goto out;
6882 }
6883
6884 #if CONFIG_MACF
6885 if ((error = mac_vnode_check_clone(ctx, tdvp, fvp, cnp)))
6886 goto out;
6887 #endif
6888 if ((error = vnode_authorize(tdvp, NULL, action, ctx)))
6889 goto out;
6890
6891 action = KAUTH_VNODE_GENERIC_READ_BITS;
6892 if (data_read_authorised)
6893 action &= ~KAUTH_VNODE_READ_DATA;
6894 if ((error = vnode_authorize(fvp, NULL, action, ctx)))
6895 goto out;
6896
6897 /*
6898 * Certain attributes may need to be changed from the source; we ask for
6899 * those here.
6900 */
6901 VATTR_INIT(&va);
6902 VATTR_WANTED(&va, va_uid);
6903 VATTR_WANTED(&va, va_gid);
6904 VATTR_WANTED(&va, va_mode);
6905 VATTR_WANTED(&va, va_flags);
6906 VATTR_WANTED(&va, va_acl);
6907
6908 if ((error = vnode_getattr(fvp, &va, ctx)) != 0)
6909 goto out;
6910
6911 VATTR_INIT(&nva);
6912 VATTR_SET(&nva, va_type, v_type);
6913 if (VATTR_IS_SUPPORTED(&va, va_acl) && va.va_acl != NULL) {
6914 VATTR_SET(&nva, va_acl, va.va_acl);
6915 free_src_acl = TRUE;
6916 }
6917
6918 /* Handle ACL inheritance, initialize vap. */
6919 if (v_type == VLNK) {
6920 error = vnode_authattr_new(tdvp, &nva, 0, ctx);
6921 } else {
6922 error = vn_attribute_prepare(tdvp, &nva, &defaulted, ctx);
6923 if (error)
6924 goto out;
6925 attr_cleanup = TRUE;
6926 }
6927
6928 /*
6929 * We've got initial values for all security parameters.
6930 * If we are the superuser, then we can change the owners to be the
6931 * same as the source. Both the superuser and the owner have default
6932 * WRITE_SECURITY privileges, so all other fields can be taken
6933 * from the source as well.
6934 */
6935 if (vfs_context_issuser(ctx)) {
6936 if (VATTR_IS_SUPPORTED(&va, va_uid))
6937 VATTR_SET(&nva, va_uid, va.va_uid);
6938 if (VATTR_IS_SUPPORTED(&va, va_gid))
6939 VATTR_SET(&nva, va_gid, va.va_gid);
6940 }
6941 if (VATTR_IS_SUPPORTED(&va, va_mode))
6942 VATTR_SET(&nva, va_mode, va.va_mode);
6943 if (VATTR_IS_SUPPORTED(&va, va_flags)) {
6944 VATTR_SET(&nva, va_flags,
6945 ((va.va_flags & ~SF_RESTRICTED) | /* Turn off from source */
6946 (nva.va_flags & SF_RESTRICTED)));
6947 }
6948
6949 error = VNOP_CLONEFILE(fvp, tdvp, &tvp, cnp, &nva,
6950 VNODE_CLONEFILE_DEFAULT, ctx);
6951
6952 if (!error && tvp) {
6953 int update_flags = 0;
6954 #if CONFIG_FSE
6955 int fsevent;
6956 #endif /* CONFIG_FSE */
6957
6958 #if CONFIG_MACF
6959 (void)vnode_label(vnode_mount(tvp), tdvp, tvp, cnp,
6960 VNODE_LABEL_CREATE, ctx);
6961 #endif
6962 /*
6963 * If some of the requested attributes weren't handled by the
6964 * VNOP, use our fallback code.
6965 */
6966 if (!VATTR_ALL_SUPPORTED(&va))
6967 (void)vnode_setattr_fallback(tvp, &nva, ctx);
6968
6969 // Make sure the name & parent pointers are hooked up
6970 if (tvp->v_name == NULL)
6971 update_flags |= VNODE_UPDATE_NAME;
6972 if (tvp->v_parent == NULLVP)
6973 update_flags |= VNODE_UPDATE_PARENT;
6974
6975 if (update_flags) {
6976 (void)vnode_update_identity(tvp, tdvp, cnp->cn_nameptr,
6977 cnp->cn_namelen, cnp->cn_hash, update_flags);
6978 }
6979
6980 #if CONFIG_FSE
6981 switch (vnode_vtype(tvp)) {
6982 case VLNK:
6983 /* FALLTHRU */
6984 case VREG:
6985 fsevent = FSE_CREATE_FILE;
6986 break;
6987 case VDIR:
6988 fsevent = FSE_CREATE_DIR;
6989 break;
6990 default:
6991 goto out;
6992 }
6993
6994 if (need_fsevent(fsevent, tvp)) {
6995 add_fsevent(fsevent, ctx, FSE_ARG_VNODE, tvp,
6996 FSE_ARG_DONE);
6997 }
6998 #endif /* CONFIG_FSE */
6999 }
7000
7001 out:
7002 if (attr_cleanup)
7003 vn_attribute_cleanup(&nva, defaulted);
7004 if (free_src_acl && va.va_acl)
7005 kauth_acl_free(va.va_acl);
7006 nameidone(&tond);
7007 if (tvp)
7008 vnode_put(tvp);
7009 vnode_put(tdvp);
7010 return (error);
7011 }
7012
7013 /*
7014 * clone files or directories, target must not exist.
7015 */
7016 /* ARGSUSED */
7017 int
7018 clonefileat(__unused proc_t p, struct clonefileat_args *uap,
7019 __unused int32_t *retval)
7020 {
7021 vnode_t fvp;
7022 struct nameidata fromnd;
7023 int follow;
7024 int error;
7025 vfs_context_t ctx = vfs_context_current();
7026
7027 /* Check that the flags are valid. */
7028 if (uap->flags & ~CLONE_NOFOLLOW)
7029 return (EINVAL);
7030
7031 AUDIT_ARG(fd, uap->src_dirfd);
7032
7033 follow = (uap->flags & CLONE_NOFOLLOW) ? NOFOLLOW : FOLLOW;
7034 NDINIT(&fromnd, LOOKUP, OP_COPYFILE, follow | AUDITVNPATH1,
7035 UIO_USERSPACE, uap->src, ctx);
7036 if ((error = nameiat(&fromnd, uap->src_dirfd)))
7037 return (error);
7038
7039 fvp = fromnd.ni_vp;
7040 nameidone(&fromnd);
7041
7042 error = clonefile_internal(fvp, FALSE, uap->dst_dirfd, uap->dst,
7043 uap->flags, ctx);
7044
7045 vnode_put(fvp);
7046 return (error);
7047 }
7048
7049 int
7050 fclonefileat(__unused proc_t p, struct fclonefileat_args *uap,
7051 __unused int32_t *retval)
7052 {
7053 vnode_t fvp;
7054 struct fileproc *fp;
7055 int error;
7056 vfs_context_t ctx = vfs_context_current();
7057
7058 AUDIT_ARG(fd, uap->src_fd);
7059 error = fp_getfvp(p, uap->src_fd, &fp, &fvp);
7060 if (error)
7061 return (error);
7062
7063 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
7064 AUDIT_ARG(vnpath_withref, fvp, ARG_VNODE1);
7065 error = EBADF;
7066 goto out;
7067 }
7068
7069 if ((error = vnode_getwithref(fvp)))
7070 goto out;
7071
7072 AUDIT_ARG(vnpath, fvp, ARG_VNODE1);
7073
7074 error = clonefile_internal(fvp, TRUE, uap->dst_dirfd, uap->dst,
7075 uap->flags, ctx);
7076
7077 vnode_put(fvp);
7078 out:
7079 file_drop(uap->src_fd);
7080 return (error);
7081 }
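
/*
 * Illustrative userspace usage (not part of this file): the public wrappers
 * for these syscalls are declared in <sys/clonefile.h>; treat the exact
 * prototypes as an assumption to verify against the SDK. Hypothetical paths;
 * error handling elided.
 *
 *	#include <fcntl.h>
 *	#include <sys/clonefile.h>
 *
 *	// Clone src to dst on the same volume. Fails with EEXIST if dst
 *	// already exists and EXDEV if the two paths are on different mounts.
 *	(void)clonefileat(AT_FDCWD, "template.img", AT_FDCWD, "copy.img", 0);
 *	// CLONE_NOFOLLOW clones a trailing symlink itself rather than its target.
 */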
7082
7083 /*
7084 * Rename files. Source and destination must either both be directories,
7085 * or both not be directories. If target is a directory, it must be empty.
7086 */
7087 /* ARGSUSED */
7088 static int
7089 renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from,
7090 int tofd, user_addr_t to, int segflg, vfs_rename_flags_t flags)
7091 {
7092 if (flags & ~VFS_RENAME_FLAGS_MASK)
7093 return EINVAL;
7094
7095 if (ISSET(flags, VFS_RENAME_SWAP) && ISSET(flags, VFS_RENAME_EXCL))
7096 return EINVAL;
7097
7098 vnode_t tvp, tdvp;
7099 vnode_t fvp, fdvp;
7100 struct nameidata *fromnd, *tond;
7101 int error;
7102 int do_retry;
7103 int retry_count;
7104 int mntrename;
7105 int need_event;
7106 const char *oname = NULL;
7107 char *from_name = NULL, *to_name = NULL;
7108 int from_len=0, to_len=0;
7109 int holding_mntlock;
7110 mount_t locked_mp = NULL;
7111 vnode_t oparent = NULLVP;
7112 #if CONFIG_FSE
7113 fse_info from_finfo, to_finfo;
7114 #endif
7115 int from_truncated=0, to_truncated;
7116 int batched = 0;
7117 struct vnode_attr *fvap, *tvap;
7118 int continuing = 0;
7119 /* Carving out a chunk for structs that are too big to be on the stack. */
7120 struct {
7121 struct nameidata from_node, to_node;
7122 struct vnode_attr fv_attr, tv_attr;
7123 } * __rename_data;
7124 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
7125 fromnd = &__rename_data->from_node;
7126 tond = &__rename_data->to_node;
7127
7128 holding_mntlock = 0;
7129 do_retry = 0;
7130 retry_count = 0;
7131 retry:
7132 fvp = tvp = NULL;
7133 fdvp = tdvp = NULL;
7134 fvap = tvap = NULL;
7135 mntrename = FALSE;
7136
7137 NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1,
7138 segflg, from, ctx);
7139 fromnd->ni_flag = NAMEI_COMPOUNDRENAME;
7140
7141 NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK,
7142 segflg, to, ctx);
7143 tond->ni_flag = NAMEI_COMPOUNDRENAME;
7144
7145 continue_lookup:
7146 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7147 if ( (error = nameiat(fromnd, fromfd)) )
7148 goto out1;
7149 fdvp = fromnd->ni_dvp;
7150 fvp = fromnd->ni_vp;
7151
7152 if (fvp && fvp->v_type == VDIR)
7153 tond->ni_cnd.cn_flags |= WILLBEDIR;
7154 }
7155
7156 if ((tond->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) {
7157 if ( (error = nameiat(tond, tofd)) ) {
7158 /*
7159 * Translate error code for rename("dir1", "dir2/.").
7160 */
7161 if (error == EISDIR && fvp->v_type == VDIR)
7162 error = EINVAL;
7163 goto out1;
7164 }
7165 tdvp = tond->ni_dvp;
7166 tvp = tond->ni_vp;
7167 }
7168
7169 if (!tvp && ISSET(flags, VFS_RENAME_SWAP)) {
7170 error = ENOENT;
7171 goto out1;
7172 }
7173
7174 if (tvp && ISSET(flags, VFS_RENAME_EXCL)) {
7175 error = EEXIST;
7176 goto out1;
7177 }
7178
7179 batched = vnode_compound_rename_available(fdvp);
7180 if (!fvp) {
7181 /*
7182 * Claim: this check will never reject a valid rename.
7183 * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp.
7184 * Suppose fdvp and tdvp are not on the same mount.
7185 * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root,
7186 * then you can't move it to within another dir on the same mountpoint.
7187 * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction.
7188 *
7189 * If this check passes, then we are safe to pass these vnodes to the same FS.
7190 */
7191 if (fdvp->v_mount != tdvp->v_mount) {
7192 error = EXDEV;
7193 goto out1;
7194 }
7195 goto skipped_lookup;
7196 }
7197
7198 if (!batched) {
7199 error = vn_authorize_renamex(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, flags, NULL);
7200 if (error) {
7201 if (error == ENOENT) {
7202 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7203 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7204 /*
7205 * We encountered a race where after doing the namei, tvp stops
7206 * being valid. If so, simply re-drive the rename call from the
7207 * top.
7208 */
7209 do_retry = 1;
7210 retry_count += 1;
7211 }
7212 }
7213 goto out1;
7214 }
7215 }
7216
7217 /*
7218 * If the source and destination are the same (i.e. they're
7219 * links to the same vnode) and the target file system is
7220 * case sensitive, then there is nothing to do.
7221 *
7222 * XXX Come back to this.
7223 */
7224 if (fvp == tvp) {
7225 int pathconf_val;
7226
7227 /*
7228 * Note: if _PC_CASE_SENSITIVE selector isn't supported,
7229 * then assume that this file system is case sensitive.
7230 */
7231 if (VNOP_PATHCONF(fvp, _PC_CASE_SENSITIVE, &pathconf_val, ctx) != 0 ||
7232 pathconf_val != 0) {
7233 goto out1;
7234 }
7235 }
7236
7237 /*
7238 * Allow the renaming of mount points.
7239 * - target must not exist
7240 * - target must reside in the same directory as source
7241 * - union mounts cannot be renamed
7242 * - "/" cannot be renamed
7243 *
7244 * XXX Handle this in VFS after a continued lookup (if we missed
7245 * in the cache to start off)
7246 *
7247 * N.B. If RENAME_SWAP is being used, then @tvp != NULL and so
7248 * we'll skip past here. The file system is responsible for
7249 * checking that @tvp is not a descendent of @fvp and vice versa
7250 * so it should always return EINVAL if either @tvp or @fvp is the
7251 * root of a volume.
7252 */
7253 if ((fvp->v_flag & VROOT) &&
7254 (fvp->v_type == VDIR) &&
7255 (tvp == NULL) &&
7256 (fvp->v_mountedhere == NULL) &&
7257 (fdvp == tdvp) &&
7258 ((fvp->v_mount->mnt_flag & (MNT_UNION | MNT_ROOTFS)) == 0) &&
7259 (fvp->v_mount->mnt_vnodecovered != NULLVP)) {
7260 vnode_t coveredvp;
7261
7262 /* switch fvp to the covered vnode */
7263 coveredvp = fvp->v_mount->mnt_vnodecovered;
7264 if ( (vnode_getwithref(coveredvp)) ) {
7265 error = ENOENT;
7266 goto out1;
7267 }
7268 vnode_put(fvp);
7269
7270 fvp = coveredvp;
7271 mntrename = TRUE;
7272 }
7273 /*
7274 * Check for cross-device rename.
7275 */
7276 if ((fvp->v_mount != tdvp->v_mount) ||
7277 (tvp && (fvp->v_mount != tvp->v_mount))) {
7278 error = EXDEV;
7279 goto out1;
7280 }
7281
7282 /*
7283 * If source is the same as the destination (that is the
7284 * same inode number) then there is nothing to do...
7285 * EXCEPT if the underlying file system supports case
7286 * insensitivity and is case preserving. In this case
7287 * the file system needs to handle the special case of
7288 * getting the same vnode as target (fvp) and source (tvp).
7289 *
7290 * Only file systems that support pathconf selectors _PC_CASE_SENSITIVE
7291 * and _PC_CASE_PRESERVING can have this exception, and they need to
7292 * handle the special case of getting the same vnode as target and
7293 * source. NOTE: Then the target is unlocked going into vnop_rename,
7294 * so not to cause locking problems. There is a single reference on tvp.
7295 *
7296 * NOTE - that fvp == tvp also occurs if they are hard linked and
7297 * that correct behaviour then is just to return success without doing
7298 * anything.
7299 *
7300 * XXX filesystem should take care of this itself, perhaps...
7301 */
7302 if (fvp == tvp && fdvp == tdvp) {
7303 if (fromnd->ni_cnd.cn_namelen == tond->ni_cnd.cn_namelen &&
7304 !bcmp(fromnd->ni_cnd.cn_nameptr, tond->ni_cnd.cn_nameptr,
7305 fromnd->ni_cnd.cn_namelen)) {
7306 goto out1;
7307 }
7308 }
7309
7310 if (holding_mntlock && fvp->v_mount != locked_mp) {
7311 /*
7312 * we're holding a reference and lock
7313 * on locked_mp, but it no longer matches
7314 * what we want to do... so drop our hold
7315 */
7316 mount_unlock_renames(locked_mp);
7317 mount_drop(locked_mp, 0);
7318 holding_mntlock = 0;
7319 }
7320 if (tdvp != fdvp && fvp->v_type == VDIR) {
7321 /*
7322 * serialize renames that re-shape
7323 * the tree... if holding_mntlock is
7324 * set, then we're ready to go...
7325 * otherwise we
7326 * first need to drop the iocounts
7327 * we picked up, second take the
7328 * lock to serialize the access,
7329 * then finally start the lookup
7330 * process over with the lock held
7331 */
7332 if (!holding_mntlock) {
7333 /*
7334 * need to grab a reference on
7335 * the mount point before we
7336 * drop all the iocounts... once
7337 * the iocounts are gone, the mount
7338 * could follow
7339 */
7340 locked_mp = fvp->v_mount;
7341 mount_ref(locked_mp, 0);
7342
7343 /*
7344 * nameidone has to happen before we vnode_put(tvp)
7345 * since it may need to release the fs_nodelock on the tvp
7346 */
7347 nameidone(tond);
7348
7349 if (tvp)
7350 vnode_put(tvp);
7351 vnode_put(tdvp);
7352
7353 /*
7354 * nameidone has to happen before we vnode_put(fdvp)
7355 * since it may need to release the fs_nodelock on the fvp
7356 */
7357 nameidone(fromnd);
7358
7359 vnode_put(fvp);
7360 vnode_put(fdvp);
7361
7362 mount_lock_renames(locked_mp);
7363 holding_mntlock = 1;
7364
7365 goto retry;
7366 }
7367 } else {
7368 /*
7369 * when we dropped the iocounts to take
7370 * the lock, we allowed the identity of
7371 * the various vnodes to change... if they did,
7372 * we may no longer be dealing with a rename
7373 * that reshapes the tree... once we're holding
7374 * the iocounts, the vnodes can't change type
7375 * so we're free to drop the lock at this point
7376 * and continue on
7377 */
7378 if (holding_mntlock) {
7379 mount_unlock_renames(locked_mp);
7380 mount_drop(locked_mp, 0);
7381 holding_mntlock = 0;
7382 }
7383 }
7384
7385 // save these off so we can later verify that fvp is the same
7386 oname = fvp->v_name;
7387 oparent = fvp->v_parent;
7388
7389 skipped_lookup:
7390 #if CONFIG_FSE
7391 need_event = need_fsevent(FSE_RENAME, fdvp);
7392 if (need_event) {
7393 if (fvp) {
7394 get_fse_info(fvp, &from_finfo, ctx);
7395 } else {
7396 error = vfs_get_notify_attributes(&__rename_data->fv_attr);
7397 if (error) {
7398 goto out1;
7399 }
7400
7401 fvap = &__rename_data->fv_attr;
7402 }
7403
7404 if (tvp) {
7405 get_fse_info(tvp, &to_finfo, ctx);
7406 } else if (batched) {
7407 error = vfs_get_notify_attributes(&__rename_data->tv_attr);
7408 if (error) {
7409 goto out1;
7410 }
7411
7412 tvap = &__rename_data->tv_attr;
7413 }
7414 }
7415 #else
7416 need_event = 0;
7417 #endif /* CONFIG_FSE */
7418
7419 if (need_event || kauth_authorize_fileop_has_listeners()) {
7420 if (from_name == NULL) {
7421 GET_PATH(from_name);
7422 if (from_name == NULL) {
7423 error = ENOMEM;
7424 goto out1;
7425 }
7426 }
7427
7428 from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
7429
7430 if (to_name == NULL) {
7431 GET_PATH(to_name);
7432 if (to_name == NULL) {
7433 error = ENOMEM;
7434 goto out1;
7435 }
7436 }
7437
7438 to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
7439 }
7440 error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
7441 tdvp, &tvp, &tond->ni_cnd, tvap,
7442 flags, ctx);
7443
7444 if (holding_mntlock) {
7445 /*
7446 * we can drop our serialization
7447 * lock now
7448 */
7449 mount_unlock_renames(locked_mp);
7450 mount_drop(locked_mp, 0);
7451 holding_mntlock = 0;
7452 }
7453 if (error) {
7454 if (error == EKEEPLOOKING) {
7455 if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7456 if ((tond->ni_flag & NAMEI_CONTLOOKUP) == 0) {
7457 panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?");
7458 }
7459 }
7460
7461 fromnd->ni_vp = fvp;
7462 tond->ni_vp = tvp;
7463
7464 goto continue_lookup;
7465 }
7466
7467 /*
7468 * We may encounter a race in the VNOP where the destination didn't
7469 * exist when we did the namei, but it does by the time we go and
7470 * try to create the entry. In this case, we should re-drive this rename
7471 * call from the top again. Currently, only HFS bubbles out ERECYCLE,
7472 * but other filesystems susceptible to this race could return it, too.
7473 */
7474 if (error == ERECYCLE) {
7475 do_retry = 1;
7476 }
7477
7478 /*
7479 * For compound VNOPs, the authorization callback may return
7480 * ENOENT in case of racing hardlink lookups hitting the name
7481 * cache; redrive the lookup.
7482 */
7483 if (batched && error == ENOENT) {
7484 assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7485 if (retry_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7486 do_retry = 1;
7487 retry_count += 1;
7488 }
7489 }
7490
7491 goto out1;
7492 }
7493
7494 /* Call out to allow 3rd party notification of rename.
7495 * Ignore result of kauth_authorize_fileop call.
7496 */
7497 kauth_authorize_fileop(vfs_context_ucred(ctx),
7498 KAUTH_FILEOP_RENAME,
7499 (uintptr_t)from_name, (uintptr_t)to_name);
7500 if (flags & VFS_RENAME_SWAP) {
7501 kauth_authorize_fileop(vfs_context_ucred(ctx),
7502 KAUTH_FILEOP_RENAME,
7503 (uintptr_t)to_name, (uintptr_t)from_name);
7504 }
7505
7506 #if CONFIG_FSE
7507 if (from_name != NULL && to_name != NULL) {
7508 if (from_truncated || to_truncated) {
7509 // set it here since only the from_finfo gets reported up to user space
7510 from_finfo.mode |= FSE_TRUNCATED_PATH;
7511 }
7512
7513 if (tvap && tvp) {
7514 vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap);
7515 }
7516 if (fvap) {
7517 vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap);
7518 }
7519
7520 if (tvp) {
7521 add_fsevent(FSE_RENAME, ctx,
7522 FSE_ARG_STRING, from_len, from_name,
7523 FSE_ARG_FINFO, &from_finfo,
7524 FSE_ARG_STRING, to_len, to_name,
7525 FSE_ARG_FINFO, &to_finfo,
7526 FSE_ARG_DONE);
7527 if (flags & VFS_RENAME_SWAP) {
7528 /*
7529 * Strictly speaking, swap is the equivalent of
7530 * *three* renames. FSEvents clients should only take
7531 * the events as a hint, so we only bother reporting
7532 * two.
7533 */
7534 add_fsevent(FSE_RENAME, ctx,
7535 FSE_ARG_STRING, to_len, to_name,
7536 FSE_ARG_FINFO, &to_finfo,
7537 FSE_ARG_STRING, from_len, from_name,
7538 FSE_ARG_FINFO, &from_finfo,
7539 FSE_ARG_DONE);
7540 }
7541 } else {
7542 add_fsevent(FSE_RENAME, ctx,
7543 FSE_ARG_STRING, from_len, from_name,
7544 FSE_ARG_FINFO, &from_finfo,
7545 FSE_ARG_STRING, to_len, to_name,
7546 FSE_ARG_DONE);
7547 }
7548 }
7549 #endif /* CONFIG_FSE */
7550
7551 /*
7552 * update filesystem's mount point data
7553 */
7554 if (mntrename) {
7555 char *cp, *pathend, *mpname;
7556 char * tobuf;
7557 struct mount *mp;
7558 int maxlen;
7559 size_t len = 0;
7560
7561 mp = fvp->v_mountedhere;
7562
7563 if (vfs_busy(mp, LK_NOWAIT)) {
7564 error = EBUSY;
7565 goto out1;
7566 }
7567 MALLOC_ZONE(tobuf, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
7568
7569 if (UIO_SEG_IS_USER_SPACE(segflg))
7570 error = copyinstr(to, tobuf, MAXPATHLEN, &len);
7571 else
7572 error = copystr((void *)to, tobuf, MAXPATHLEN, &len);
7573 if (!error) {
7574 /* find current mount point prefix */
7575 pathend = &mp->mnt_vfsstat.f_mntonname[0];
7576 for (cp = pathend; *cp != '\0'; ++cp) {
7577 if (*cp == '/')
7578 pathend = cp + 1;
7579 }
7580 /* find last component of target name */
7581 for (mpname = cp = tobuf; *cp != '\0'; ++cp) {
7582 if (*cp == '/')
7583 mpname = cp + 1;
7584 }
7585 /* append name to prefix */
7586 maxlen = MAXPATHLEN - (pathend - mp->mnt_vfsstat.f_mntonname);
7587 bzero(pathend, maxlen);
7588 strlcpy(pathend, mpname, maxlen);
7589 }
7590 FREE_ZONE(tobuf, MAXPATHLEN, M_NAMEI);
7591
7592 vfs_unbusy(mp);
7593 }
7594 /*
7595 * fix up name & parent pointers. note that we first
7596 * check that fvp has the same name/parent pointers it
7597 * had before the rename call... this is a 'weak' check
7598 * at best...
7599 *
7600 * XXX oparent and oname may not be set in the compound vnop case
7601 */
7602 if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) {
7603 int update_flags;
7604
7605 update_flags = VNODE_UPDATE_NAME;
7606
7607 if (fdvp != tdvp)
7608 update_flags |= VNODE_UPDATE_PARENT;
7609
7610 vnode_update_identity(fvp, tdvp, tond->ni_cnd.cn_nameptr, tond->ni_cnd.cn_namelen, tond->ni_cnd.cn_hash, update_flags);
7611 }
7612 out1:
7613 if (to_name != NULL) {
7614 RELEASE_PATH(to_name);
7615 to_name = NULL;
7616 }
7617 if (from_name != NULL) {
7618 RELEASE_PATH(from_name);
7619 from_name = NULL;
7620 }
7621 if (holding_mntlock) {
7622 mount_unlock_renames(locked_mp);
7623 mount_drop(locked_mp, 0);
7624 holding_mntlock = 0;
7625 }
7626 if (tdvp) {
7627 /*
7628 * nameidone has to happen before we vnode_put(tdvp)
7629 * since it may need to release the fs_nodelock on the tdvp
7630 */
7631 nameidone(tond);
7632
7633 if (tvp)
7634 vnode_put(tvp);
7635 vnode_put(tdvp);
7636 }
7637 if (fdvp) {
7638 /*
7639 * nameidone has to happen before we vnode_put(fdvp)
7640 * since it may need to release the fs_nodelock on the fdvp
7641 */
7642 nameidone(fromnd);
7643
7644 if (fvp)
7645 vnode_put(fvp);
7646 vnode_put(fdvp);
7647 }
7648
7649 /*
7650 * If things changed after we did the namei, then we will re-drive
7651 * this rename call from the top.
7652 */
7653 if (do_retry) {
7654 do_retry = 0;
7655 goto retry;
7656 }
7657
7658 FREE(__rename_data, M_TEMP);
7659 return (error);
7660 }
7661
7662 int
7663 rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval)
7664 {
7665 return (renameat_internal(vfs_context_current(), AT_FDCWD, uap->from,
7666 AT_FDCWD, uap->to, UIO_USERSPACE, 0));
7667 }
7668
7669 int renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval)
7670 {
7671 return renameat_internal(
7672 vfs_context_current(),
7673 uap->fromfd, uap->from,
7674 uap->tofd, uap->to,
7675 UIO_USERSPACE, uap->flags);
7676 }
7677
7678 int
7679 renameat(__unused proc_t p, struct renameat_args *uap, __unused int32_t *retval)
7680 {
7681 return (renameat_internal(vfs_context_current(), uap->fromfd, uap->from,
7682 uap->tofd, uap->to, UIO_USERSPACE, 0));
7683 }
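
/*
 * Illustrative userspace usage (not part of this file): renameatx_np() is
 * reached from the renamex_np()/renameatx_np() wrappers, which on macOS are
 * declared in <stdio.h> together with the RENAME_SWAP and RENAME_EXCL flags;
 * treat the exact declarations as an assumption to verify. Hypothetical
 * paths; error handling elided.
 *
 *	#include <stdio.h>
 *
 *	// atomically swap the two names (both must exist, cf. VFS_RENAME_SWAP)
 *	(void)renamex_np("config.plist", "config.plist.new", RENAME_SWAP);
 *	// fail with EEXIST instead of replacing the destination (VFS_RENAME_EXCL)
 *	(void)renamex_np("staging.db", "live.db", RENAME_EXCL);
 */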
7684
7685 /*
7686 * Make a directory file.
7687 *
7688 * Returns: 0 Success
7689 * EEXIST
7690 * namei:???
7691 * vnode_authorize:???
7692 * vn_create:???
7693 */
7694 /* ARGSUSED */
7695 static int
7696 mkdir1at(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, int fd,
7697 enum uio_seg segflg)
7698 {
7699 vnode_t vp, dvp;
7700 int error;
7701 int update_flags = 0;
7702 int batched;
7703 struct nameidata nd;
7704
7705 AUDIT_ARG(mode, vap->va_mode);
7706 NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, segflg,
7707 path, ctx);
7708 nd.ni_cnd.cn_flags |= WILLBEDIR;
7709 nd.ni_flag = NAMEI_COMPOUNDMKDIR;
7710
7711 continue_lookup:
7712 error = nameiat(&nd, fd);
7713 if (error)
7714 return (error);
7715 dvp = nd.ni_dvp;
7716 vp = nd.ni_vp;
7717
7718 if (vp != NULL) {
7719 error = EEXIST;
7720 goto out;
7721 }
7722
7723 batched = vnode_compound_mkdir_available(dvp);
7724
7725 VATTR_SET(vap, va_type, VDIR);
7726
7727 /*
7728 * XXX
7729 * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will
7730 * only get EXISTS or EISDIR for existing path components, and not that it could see
7731 * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz"
7732 * it will fail in a spurious manner. Need to figure out if this is valid behavior.
7733 */
7734 if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) {
7735 if (error == EACCES || error == EPERM) {
7736 int error2;
7737
7738 nameidone(&nd);
7739 vnode_put(dvp);
7740 dvp = NULLVP;
7741
7742 /*
7743 * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST
7744 * rather than EACCES if the target exists.
7745 */
7746 NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, segflg,
7747 path, ctx);
7748 error2 = nameiat(&nd, fd);
7749 if (error2) {
7750 goto out;
7751 } else {
7752 vp = nd.ni_vp;
7753 error = EEXIST;
7754 goto out;
7755 }
7756 }
7757
7758 goto out;
7759 }
7760
7761 /*
7762 * make the directory
7763 */
7764 if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) {
7765 if (error == EKEEPLOOKING) {
7766 nd.ni_vp = vp;
7767 goto continue_lookup;
7768 }
7769
7770 goto out;
7771 }
7772
7773 // Make sure the name & parent pointers are hooked up
7774 if (vp->v_name == NULL)
7775 update_flags |= VNODE_UPDATE_NAME;
7776 if (vp->v_parent == NULLVP)
7777 update_flags |= VNODE_UPDATE_PARENT;
7778
7779 if (update_flags)
7780 vnode_update_identity(vp, dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, nd.ni_cnd.cn_hash, update_flags);
7781
7782 #if CONFIG_FSE
7783 add_fsevent(FSE_CREATE_DIR, ctx, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
7784 #endif
7785
7786 out:
7787 /*
7788 * nameidone has to happen before we vnode_put(dvp)
7789 * since it may need to release the fs_nodelock on the dvp
7790 */
7791 nameidone(&nd);
7792
7793 if (vp)
7794 vnode_put(vp);
7795 if (dvp)
7796 vnode_put(dvp);
7797
7798 return (error);
7799 }
7800
7801 /*
7802 * mkdir_extended: Create a directory; with extended security (ACL).
7803 *
7804 * Parameters: p Process requesting to create the directory
7805 * uap User argument descriptor (see below)
7806 * retval (ignored)
7807 *
7808 * Indirect: uap->path Path of directory to create
7809 * uap->mode Access permissions to set
7810 * uap->xsecurity ACL to set
7811 *
7812 * Returns: 0 Success
7813 * !0 Not success
7814 *
7815 */
7816 int
7817 mkdir_extended(proc_t p, struct mkdir_extended_args *uap, __unused int32_t *retval)
7818 {
7819 int ciferror;
7820 kauth_filesec_t xsecdst;
7821 struct vnode_attr va;
7822
7823 AUDIT_ARG(owner, uap->uid, uap->gid);
7824
7825 xsecdst = NULL;
7826 if ((uap->xsecurity != USER_ADDR_NULL) &&
7827 ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0))
7828 return ciferror;
7829
7830 VATTR_INIT(&va);
7831 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7832 if (xsecdst != NULL)
7833 VATTR_SET(&va, va_acl, &xsecdst->fsec_acl);
7834
7835 ciferror = mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7836 UIO_USERSPACE);
7837 if (xsecdst != NULL)
7838 kauth_filesec_free(xsecdst);
7839 return ciferror;
7840 }
7841
7842 int
7843 mkdir(proc_t p, struct mkdir_args *uap, __unused int32_t *retval)
7844 {
7845 struct vnode_attr va;
7846
7847 VATTR_INIT(&va);
7848 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7849
7850 return (mkdir1at(vfs_context_current(), uap->path, &va, AT_FDCWD,
7851 UIO_USERSPACE));
7852 }
7853
7854 int
7855 mkdirat(proc_t p, struct mkdirat_args *uap, __unused int32_t *retval)
7856 {
7857 struct vnode_attr va;
7858
7859 VATTR_INIT(&va);
7860 VATTR_SET(&va, va_mode, (uap->mode & ACCESSPERMS) & ~p->p_fd->fd_cmask);
7861
7862 return(mkdir1at(vfs_context_current(), uap->path, &va, uap->fd,
7863 UIO_USERSPACE));
7864 }
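
/*
 * Illustrative userspace usage (not part of this file): all three entry
 * points above mask the requested mode with ACCESSPERMS and the process
 * umask (fd_cmask) before calling mkdir1at(). A minimal sketch with
 * hypothetical names; error handling elided.
 *
 *	#include <fcntl.h>
 *	#include <sys/stat.h>
 *
 *	(void)umask(022);
 *	(void)mkdir("build", 0777);			// created as 0755
 *	int dirfd = open("build", O_RDONLY | O_DIRECTORY);
 *	(void)mkdirat(dirfd, "obj", 0777);		// also 0755
 */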
7865
7866 static int
7867 rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath,
7868 enum uio_seg segflg)
7869 {
7870 vnode_t vp, dvp;
7871 int error;
7872 struct nameidata nd;
7873 char *path = NULL;
7874 int len=0;
7875 int has_listeners = 0;
7876 int need_event = 0;
7877 int truncated = 0;
7878 #if CONFIG_FSE
7879 struct vnode_attr va;
7880 #endif /* CONFIG_FSE */
7881 struct vnode_attr *vap = NULL;
7882 int restart_count = 0;
7883 int batched;
7884
7885 int restart_flag;
7886
7887 /*
7888 * This loop exists to restart rmdir in the unlikely case that two
7889 * processes are simultaneously trying to remove the same directory
7890 * containing orphaned appleDouble files.
7891 */
7892 do {
7893 NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1,
7894 segflg, dirpath, ctx);
7895 nd.ni_flag = NAMEI_COMPOUNDRMDIR;
7896 continue_lookup:
7897 restart_flag = 0;
7898 vap = NULL;
7899
7900 error = nameiat(&nd, fd);
7901 if (error)
7902 return (error);
7903
7904 dvp = nd.ni_dvp;
7905 vp = nd.ni_vp;
7906
7907 if (vp) {
7908 batched = vnode_compound_rmdir_available(vp);
7909
7910 if (vp->v_flag & VROOT) {
7911 /*
7912 * The root of a mounted filesystem cannot be deleted.
7913 */
7914 error = EBUSY;
7915 goto out;
7916 }
7917
7918 /*
7919 * Removed a check here; we used to abort if vp's vid
7920 * was not the same as what we'd seen the last time around.
7921 * I do not think that check was valid, because if we retry
7922 * and all dirents are gone, the directory could legitimately
7923 * be recycled but still be present in a situation where we would
7924 * have had permission to delete. Therefore, we won't make
7925 * an effort to preserve that check now that we may not have a
7926 * vp here.
7927 */
7928
7929 if (!batched) {
7930 error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL);
7931 if (error) {
7932 if (error == ENOENT) {
7933 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7934 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7935 restart_flag = 1;
7936 restart_count += 1;
7937 }
7938 }
7939 goto out;
7940 }
7941 }
7942 } else {
7943 batched = 1;
7944
7945 if (!vnode_compound_rmdir_available(dvp)) {
7946 panic("No error, but no compound rmdir?");
7947 }
7948 }
7949
7950 #if CONFIG_FSE
7951 fse_info finfo;
7952
7953 need_event = need_fsevent(FSE_DELETE, dvp);
7954 if (need_event) {
7955 if (!batched) {
7956 get_fse_info(vp, &finfo, ctx);
7957 } else {
7958 error = vfs_get_notify_attributes(&va);
7959 if (error) {
7960 goto out;
7961 }
7962
7963 vap = &va;
7964 }
7965 }
7966 #endif
7967 has_listeners = kauth_authorize_fileop_has_listeners();
7968 if (need_event || has_listeners) {
7969 if (path == NULL) {
7970 GET_PATH(path);
7971 if (path == NULL) {
7972 error = ENOMEM;
7973 goto out;
7974 }
7975 }
7976
7977 len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated);
7978 #if CONFIG_FSE
7979 if (truncated) {
7980 finfo.mode |= FSE_TRUNCATED_PATH;
7981 }
7982 #endif
7983 }
7984
7985 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
7986 nd.ni_vp = vp;
7987 if (vp == NULLVP) {
7988 /* Couldn't find a vnode */
7989 goto out;
7990 }
7991
7992 if (error == EKEEPLOOKING) {
7993 goto continue_lookup;
7994 } else if (batched && error == ENOENT) {
7995 assert(restart_count < MAX_AUTHORIZE_ENOENT_RETRIES);
7996 if (restart_count < MAX_AUTHORIZE_ENOENT_RETRIES) {
7997 /*
7998 * For compound VNOPs, the authorization callback
7999 * may return ENOENT in case of racing hard link lookups;
8000 * redrive the lookup.
8001 */
8002 restart_flag = 1;
8003 restart_count += 1;
8004 goto out;
8005 }
8006 }
8007 #if CONFIG_APPLEDOUBLE
8008 /*
8009 * Special case to remove orphaned AppleDouble
8010 * files. I don't like putting this in the kernel,
8011 * but carbon does not like putting this in carbon either,
8012 * so here we are.
8013 */
8014 if (error == ENOTEMPTY) {
8015 error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag);
8016 if (error == EBUSY) {
8017 goto out;
8018 }
8019
8020
8021 /*
8022 * Assuming everything went well, we will try the RMDIR again
8023 */
8024 if (!error)
8025 error = vn_rmdir(dvp, &vp, &nd, vap, ctx);
8026 }
8027 #endif /* CONFIG_APPLEDOUBLE */
8028 /*
8029 * Call out to allow 3rd party notification of delete.
8030 * Ignore result of kauth_authorize_fileop call.
8031 */
8032 if (!error) {
8033 if (has_listeners) {
8034 kauth_authorize_fileop(vfs_context_ucred(ctx),
8035 KAUTH_FILEOP_DELETE,
8036 (uintptr_t)vp,
8037 (uintptr_t)path);
8038 }
8039
8040 if (vp->v_flag & VISHARDLINK) {
8041 // see the comment in unlink1() about why we update
8042 // the parent of a hard link when it is removed
8043 vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT);
8044 }
8045
8046 #if CONFIG_FSE
8047 if (need_event) {
8048 if (vap) {
8049 vnode_get_fse_info_from_vap(vp, &finfo, vap);
8050 }
8051 add_fsevent(FSE_DELETE, ctx,
8052 FSE_ARG_STRING, len, path,
8053 FSE_ARG_FINFO, &finfo,
8054 FSE_ARG_DONE);
8055 }
8056 #endif
8057 }
8058
8059 out:
8060 if (path != NULL) {
8061 RELEASE_PATH(path);
8062 path = NULL;
8063 }
8064 /*
8065 * nameidone has to happen before we vnode_put(dvp)
8066 * since it may need to release the fs_nodelock on the dvp
8067 */
8068 nameidone(&nd);
8069 vnode_put(dvp);
8070
8071 if (vp)
8072 vnode_put(vp);
8073
8074 if (restart_flag == 0) {
8075 wakeup_one((caddr_t)vp);
8076 return (error);
8077 }
8078 tsleep(vp, PVFS, "rm AD", 1);
8079
8080 } while (restart_flag != 0);
8081
8082 return (error);
8083
8084 }
8085
8086 /*
8087 * Remove a directory file.
8088 */
8089 /* ARGSUSED */
8090 int
8091 rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval)
8092 {
8093 return (rmdirat_internal(vfs_context_current(), AT_FDCWD,
8094 CAST_USER_ADDR_T(uap->path), UIO_USERSPACE));
8095 }
8096
8097 /* Get direntry length padded to 8 byte alignment */
8098 #define DIRENT64_LEN(namlen) \
8099 ((sizeof(struct direntry) + (namlen) - (MAXPATHLEN-1) + 7) & ~7)
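
/*
 * Worked example for the macro above (a sketch, assuming sizeof(struct
 * direntry) is 1048 bytes -- the fixed fields plus the MAXPATHLEN-sized
 * d_name, padded to 8 bytes -- which is consistent with the "32 bytes" figure
 * quoted in the vnode_readdir64() comment below):
 *
 *	DIRENT64_LEN(3) = (1048 + 3 - 1023 + 7) & ~7
 *	                = 35 & ~7
 *	                = 32
 *
 * i.e. the fixed header, the name and its terminating NUL, rounded up to the
 * next multiple of 8.
 */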
8100
8101 errno_t
8102 vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag,
8103 int *numdirent, vfs_context_t ctxp)
8104 {
8105 /* Check if fs natively supports VNODE_READDIR_EXTENDED */
8106 if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) &&
8107 ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) {
8108 return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp);
8109 } else {
8110 size_t bufsize;
8111 void * bufptr;
8112 uio_t auio;
8113 struct direntry *entry64;
8114 struct dirent *dep;
8115 int bytesread;
8116 int error;
8117
8118 /*
8119 * Our kernel buffer needs to be smaller since re-packing
8120 * will expand each dirent. The worst case (when the name
8121 * length is 3) corresponds to a struct direntry size of 32
8122 * bytes (8-byte aligned) and a struct dirent size of 12 bytes
8123 * (4-byte aligned). So having a buffer that is 3/8 the size
8124 * will prevent us from reading more than we can pack.
8125 *
8126 * Since this buffer is wired memory, we will limit the
8127 * buffer size to a maximum of 32K. We would really like to
8128 * use 32K in the MIN(), but we use magic number 87371 to
8129 * prevent uio_resid() * 3 / 8 from overflowing.
8130 */
8131 bufsize = 3 * MIN((user_size_t)uio_resid(uio), 87371u) / 8;
8132 MALLOC(bufptr, void *, bufsize, M_TEMP, M_WAITOK);
8133 if (bufptr == NULL) {
8134 return ENOMEM;
8135 }
8136
8137 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
8138 uio_addiov(auio, (uintptr_t)bufptr, bufsize);
8139 auio->uio_offset = uio->uio_offset;
8140
8141 error = VNOP_READDIR(vp, auio, 0, eofflag, numdirent, ctxp);
8142
8143 dep = (struct dirent *)bufptr;
8144 bytesread = bufsize - uio_resid(auio);
8145
8146 MALLOC(entry64, struct direntry *, sizeof(struct direntry),
8147 M_TEMP, M_WAITOK);
8148 /*
8149 * Convert all the entries and copy them out to user's buffer.
8150 */
8151 while (error == 0 && (char *)dep < ((char *)bufptr + bytesread)) {
8152 size_t enbufsize = DIRENT64_LEN(dep->d_namlen);
8153
8154 bzero(entry64, enbufsize);
8155 /* Convert a dirent to a dirent64. */
8156 entry64->d_ino = dep->d_ino;
8157 entry64->d_seekoff = 0;
8158 entry64->d_reclen = enbufsize;
8159 entry64->d_namlen = dep->d_namlen;
8160 entry64->d_type = dep->d_type;
8161 bcopy(dep->d_name, entry64->d_name, dep->d_namlen + 1);
8162
8163 /* Move to next entry. */
8164 dep = (struct dirent *)((char *)dep + dep->d_reclen);
8165
8166 /* Copy entry64 to user's buffer. */
8167 error = uiomove((caddr_t)entry64, entry64->d_reclen, uio);
8168 }
8169
8170 /* Update the real offset using the offset we got from VNOP_READDIR. */
8171 if (error == 0) {
8172 uio->uio_offset = auio->uio_offset;
8173 }
8174 uio_free(auio);
8175 FREE(bufptr, M_TEMP);
8176 FREE(entry64, M_TEMP);
8177 return (error);
8178 }
8179 }
8180
8181 #define GETDIRENTRIES_MAXBUFSIZE (128 * 1024 * 1024U)
8182
8183 /*
8184 * Read a block of directory entries in a file system independent format.
8185 */
8186 static int
8187 getdirentries_common(int fd, user_addr_t bufp, user_size_t bufsize, ssize_t *bytesread,
8188 off_t *offset, int flags)
8189 {
8190 vnode_t vp;
8191 struct vfs_context context = *vfs_context_current(); /* local copy */
8192 struct fileproc *fp;
8193 uio_t auio;
8194 int spacetype = proc_is64bit(vfs_context_proc(&context)) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8195 off_t loff;
8196 int error, eofflag, numdirent;
8197 char uio_buf[ UIO_SIZEOF(1) ];
8198
8199 error = fp_getfvp(vfs_context_proc(&context), fd, &fp, &vp);
8200 if (error) {
8201 return (error);
8202 }
8203 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8204 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8205 error = EBADF;
8206 goto out;
8207 }
8208
8209 if (bufsize > GETDIRENTRIES_MAXBUFSIZE)
8210 bufsize = GETDIRENTRIES_MAXBUFSIZE;
8211
8212 #if CONFIG_MACF
8213 error = mac_file_check_change_offset(vfs_context_ucred(&context), fp->f_fglob);
8214 if (error)
8215 goto out;
8216 #endif
8217 if ( (error = vnode_getwithref(vp)) ) {
8218 goto out;
8219 }
8220 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8221
8222 unionread:
8223 if (vp->v_type != VDIR) {
8224 (void)vnode_put(vp);
8225 error = EINVAL;
8226 goto out;
8227 }
8228
8229 #if CONFIG_MACF
8230 error = mac_vnode_check_readdir(&context, vp);
8231 if (error != 0) {
8232 (void)vnode_put(vp);
8233 goto out;
8234 }
8235 #endif /* MAC */
8236
8237 loff = fp->f_fglob->fg_offset;
8238 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8239 uio_addiov(auio, bufp, bufsize);
8240
8241 if (flags & VNODE_READDIR_EXTENDED) {
8242 error = vnode_readdir64(vp, auio, flags, &eofflag, &numdirent, &context);
8243 fp->f_fglob->fg_offset = uio_offset(auio);
8244 } else {
8245 error = VNOP_READDIR(vp, auio, 0, &eofflag, &numdirent, &context);
8246 fp->f_fglob->fg_offset = uio_offset(auio);
8247 }
8248 if (error) {
8249 (void)vnode_put(vp);
8250 goto out;
8251 }
8252
8253 if ((user_ssize_t)bufsize == uio_resid(auio)){
8254 if (union_dircheckp) {
8255 error = union_dircheckp(&vp, fp, &context);
8256 if (error == -1)
8257 goto unionread;
8258 if (error) {
8259 (void)vnode_put(vp);
8260 goto out;
8261 }
8262 }
8263
8264 if ((vp->v_mount->mnt_flag & MNT_UNION)) {
8265 struct vnode *tvp = vp;
8266 if (lookup_traverse_union(tvp, &vp, &context) == 0) {
8267 vnode_ref(vp);
8268 fp->f_fglob->fg_data = (caddr_t) vp;
8269 fp->f_fglob->fg_offset = 0;
8270 vnode_rele(tvp);
8271 vnode_put(tvp);
8272 goto unionread;
8273 }
8274 vp = tvp;
8275 }
8276 }
8277
8278 vnode_put(vp);
8279 if (offset) {
8280 *offset = loff;
8281 }
8282
8283 *bytesread = bufsize - uio_resid(auio);
8284 out:
8285 file_drop(fd);
8286 return (error);
8287 }
8288
8289
8290 int
8291 getdirentries(__unused struct proc *p, struct getdirentries_args *uap, int32_t *retval)
8292 {
8293 off_t offset;
8294 ssize_t bytesread;
8295 int error;
8296
8297 AUDIT_ARG(fd, uap->fd);
8298 error = getdirentries_common(uap->fd, uap->buf, uap->count, &bytesread, &offset, 0);
8299
8300 if (error == 0) {
8301 if (proc_is64bit(p)) {
8302 user64_long_t base = (user64_long_t)offset;
8303 error = copyout((caddr_t)&base, uap->basep, sizeof(user64_long_t));
8304 } else {
8305 user32_long_t base = (user32_long_t)offset;
8306 error = copyout((caddr_t)&base, uap->basep, sizeof(user32_long_t));
8307 }
8308 *retval = bytesread;
8309 }
8310 return (error);
8311 }
8312
8313 int
8314 getdirentries64(__unused struct proc *p, struct getdirentries64_args *uap, user_ssize_t *retval)
8315 {
8316 off_t offset;
8317 ssize_t bytesread;
8318 int error;
8319
8320 AUDIT_ARG(fd, uap->fd);
8321 error = getdirentries_common(uap->fd, uap->buf, uap->bufsize, &bytesread, &offset, VNODE_READDIR_EXTENDED);
8322
8323 if (error == 0) {
8324 *retval = bytesread;
8325 error = copyout((caddr_t)&offset, uap->position, sizeof(off_t));
8326 }
8327 return (error);
8328 }
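/*
 * Both variants above report the directory offset at which the transfer
 * began (loff): getdirentries() narrows it to the caller's long size and
 * returns the byte count through a 32-bit retval, while getdirentries64()
 * copies the offset out as a full off_t and returns a 64-bit byte count.
 */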
8329
8330
8331 /*
8332 * Set the mode mask for creation of filesystem nodes.
8333 * XXX implement xsecurity
8334 */
8335 #define UMASK_NOXSECURITY (void *)1 /* leave existing xsecurity alone */
8336 static int
8337 umask1(proc_t p, int newmask, __unused kauth_filesec_t fsec, int32_t *retval)
8338 {
8339 struct filedesc *fdp;
8340
8341 AUDIT_ARG(mask, newmask);
8342 proc_fdlock(p);
8343 fdp = p->p_fd;
8344 *retval = fdp->fd_cmask;
8345 fdp->fd_cmask = newmask & ALLPERMS;
8346 proc_fdunlock(p);
8347 return (0);
8348 }
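/*
 * Worked example of the mask handling above, as seen from userland: the
 * previous mask comes back through *retval and the new one is clamped to
 * ALLPERMS.  A minimal sketch; the file name is hypothetical.
 */
#if 0	/* illustrative userland sketch */
#include <sys/stat.h>
#include <fcntl.h>

static int
create_with_umask(void)
{
	mode_t old = umask(022);	/* umask() hands back the previous mask */

	/* Requested 0666; the effective mode is 0666 & ~022 = 0644. */
	int fd = open("/tmp/example-file", O_CREAT | O_WRONLY, 0666);

	(void)umask(old);		/* restore the saved mask */
	return fd;
}
#endif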
8349
8350 /*
8351 * umask_extended: Set the mode mask for creation of filesystem nodes; with extended security (ACL).
8352 *
8353 * Parameters: p Process requesting to set the umask
8354 * uap User argument descriptor (see below)
8355 * retval umask of the process (parameter p)
8356 *
8357 * Indirect: uap->newmask umask to set
8358 * uap->xsecurity ACL to set
8359 *
8360 * Returns: 0 Success
8361 * !0 Not success
8362 *
8363 */
8364 int
8365 umask_extended(proc_t p, struct umask_extended_args *uap, int32_t *retval)
8366 {
8367 int ciferror;
8368 kauth_filesec_t xsecdst;
8369
8370 xsecdst = KAUTH_FILESEC_NONE;
8371 if (uap->xsecurity != USER_ADDR_NULL) {
8372 if ((ciferror = kauth_copyinfilesec(uap->xsecurity, &xsecdst)) != 0)
8373 return ciferror;
8374 } else {
8375 xsecdst = KAUTH_FILESEC_NONE;
8376 }
8377
8378 ciferror = umask1(p, uap->newmask, xsecdst, retval);
8379
8380 if (xsecdst != KAUTH_FILESEC_NONE)
8381 kauth_filesec_free(xsecdst);
8382 return ciferror;
8383 }
8384
8385 int
8386 umask(proc_t p, struct umask_args *uap, int32_t *retval)
8387 {
8388 return(umask1(p, uap->newmask, UMASK_NOXSECURITY, retval));
8389 }
8390
8391 /*
8392 * Void all references to file by ripping underlying filesystem
8393 * away from vnode.
8394 */
8395 /* ARGSUSED */
8396 int
8397 revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval)
8398 {
8399 vnode_t vp;
8400 struct vnode_attr va;
8401 vfs_context_t ctx = vfs_context_current();
8402 int error;
8403 struct nameidata nd;
8404
8405 NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
8406 uap->path, ctx);
8407 error = namei(&nd);
8408 if (error)
8409 return (error);
8410 vp = nd.ni_vp;
8411
8412 nameidone(&nd);
8413
8414 if (!(vnode_ischr(vp) || vnode_isblk(vp))) {
8415 error = ENOTSUP;
8416 goto out;
8417 }
8418
8419 if (vnode_isblk(vp) && vnode_ismountedon(vp)) {
8420 error = EBUSY;
8421 goto out;
8422 }
8423
8424 #if CONFIG_MACF
8425 error = mac_vnode_check_revoke(ctx, vp);
8426 if (error)
8427 goto out;
8428 #endif
8429
8430 VATTR_INIT(&va);
8431 VATTR_WANTED(&va, va_uid);
8432 if ((error = vnode_getattr(vp, &va, ctx)))
8433 goto out;
8434 if (kauth_cred_getuid(vfs_context_ucred(ctx)) != va.va_uid &&
8435 (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))
8436 goto out;
8437 if (vp->v_usecount > 0 || (vnode_isaliased(vp)))
8438 VNOP_REVOKE(vp, REVOKEALL, ctx);
8439 out:
8440 vnode_put(vp);
8441 return (error);
8442 }
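/*
 * revoke() is only supported on character or block devices (ENOTSUP
 * otherwise, EBUSY for a mounted block device), and the caller must own the
 * node or be superuser; these are exactly the checks above.  A minimal
 * sketch; the tty path is hypothetical.
 */
#if 0	/* illustrative userland sketch */
#include <unistd.h>
#include <stdio.h>

static void
revoke_stale_tty(void)
{
	if (revoke("/dev/ttys003") == -1)
		perror("revoke");	/* e.g. ENOTSUP for a regular file */
}
#endif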
8443
8444
8445 /*
8446 * HFS/HFS PLUS SPECIFIC SYSTEM CALLS
8447 * The following system calls are designed to support features
8448 * which are specific to the HFS & HFS Plus volume formats
8449 */
8450
8451
8452 /*
8453 * Obtain attribute information on objects in a directory while enumerating
8454 * the directory.
8455 */
8456 /* ARGSUSED */
8457 int
8458 getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval)
8459 {
8460 vnode_t vp;
8461 struct fileproc *fp;
8462 uio_t auio = NULL;
8463 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8464 uint32_t count, savecount;
8465 uint32_t newstate;
8466 int error, eofflag;
8467 uint32_t loff;
8468 struct attrlist attributelist;
8469 vfs_context_t ctx = vfs_context_current();
8470 int fd = uap->fd;
8471 char uio_buf[ UIO_SIZEOF(1) ];
8472 kauth_action_t action;
8473
8474 AUDIT_ARG(fd, fd);
8475
8476 /* Get the attributes into kernel space */
8477 if ((error = copyin(uap->alist, (caddr_t)&attributelist, sizeof(attributelist)))) {
8478 return(error);
8479 }
8480 if ((error = copyin(uap->count, (caddr_t)&count, sizeof(count)))) {
8481 return(error);
8482 }
8483 savecount = count;
8484 if ( (error = fp_getfvp(p, fd, &fp, &vp)) ) {
8485 return (error);
8486 }
8487 if ((fp->f_fglob->fg_flag & FREAD) == 0) {
8488 AUDIT_ARG(vnpath_withref, vp, ARG_VNODE1);
8489 error = EBADF;
8490 goto out;
8491 }
8492
8493
8494 #if CONFIG_MACF
8495 error = mac_file_check_change_offset(vfs_context_ucred(ctx),
8496 fp->f_fglob);
8497 if (error)
8498 goto out;
8499 #endif
8500
8501
8502 if ( (error = vnode_getwithref(vp)) )
8503 goto out;
8504
8505 AUDIT_ARG(vnpath, vp, ARG_VNODE1);
8506
8507 unionread:
8508 if (vp->v_type != VDIR) {
8509 (void)vnode_put(vp);
8510 error = EINVAL;
8511 goto out;
8512 }
8513
8514 #if CONFIG_MACF
8515 error = mac_vnode_check_readdir(ctx, vp);
8516 if (error != 0) {
8517 (void)vnode_put(vp);
8518 goto out;
8519 }
8520 #endif /* MAC */
8521
8522 /* set up the uio structure which will contain the user's return buffer */
8523 loff = fp->f_fglob->fg_offset;
8524 auio = uio_createwithbuffer(1, loff, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8525 uio_addiov(auio, uap->buffer, uap->buffersize);
8526
8527 /*
8528 * If the only item requested is file names, we can let that past with
8529 * just LIST_DIRECTORY. If they want any other attributes, that means
8530 * they need SEARCH as well.
8531 */
8532 action = KAUTH_VNODE_LIST_DIRECTORY;
8533 if ((attributelist.commonattr & ~ATTR_CMN_NAME) ||
8534 attributelist.fileattr || attributelist.dirattr)
8535 action |= KAUTH_VNODE_SEARCH;
8536
8537 if ((error = vnode_authorize(vp, NULL, action, ctx)) == 0) {
8538
8539 /* Believe it or not, uap->options only has 32 bits of valid
8540 * info, so truncate before extending again */
8541
8542 error = VNOP_READDIRATTR(vp, &attributelist, auio, count,
8543 (u_long)(uint32_t)uap->options, &newstate, &eofflag, &count, ctx);
8544 }
8545
8546 if (error) {
8547 (void) vnode_put(vp);
8548 goto out;
8549 }
8550
8551 /*
8552 * If we've got the last entry of a directory in a union mount
8553 * then reset the eofflag and pretend there's still more to come.
8554 * The next call will again set eofflag and the buffer will be empty,
8555 * so traverse to the underlying directory and do the directory
8556 * read there.
8557 */
8558 if (eofflag && vp->v_mount->mnt_flag & MNT_UNION) {
8559 if (uio_resid(auio) < (user_ssize_t) uap->buffersize) { // Got some entries
8560 eofflag = 0;
8561 } else { // Empty buffer
8562 struct vnode *tvp = vp;
8563 if (lookup_traverse_union(tvp, &vp, ctx) == 0) {
8564 vnode_ref_ext(vp, fp->f_fglob->fg_flag & O_EVTONLY, 0);
8565 fp->f_fglob->fg_data = (caddr_t) vp;
8566 fp->f_fglob->fg_offset = 0; // reset index for new dir
8567 count = savecount;
8568 vnode_rele_internal(tvp, fp->f_fglob->fg_flag & O_EVTONLY, 0, 0);
8569 vnode_put(tvp);
8570 goto unionread;
8571 }
8572 vp = tvp;
8573 }
8574 }
8575
8576 (void)vnode_put(vp);
8577
8578 if (error)
8579 goto out;
8580 fp->f_fglob->fg_offset = uio_offset(auio); /* should be multiple of dirent, not variable */
8581
8582 if ((error = copyout((caddr_t) &count, uap->count, sizeof(count))))
8583 goto out;
8584 if ((error = copyout((caddr_t) &newstate, uap->newstate, sizeof(newstate))))
8585 goto out;
8586 if ((error = copyout((caddr_t) &loff, uap->basep, sizeof(loff))))
8587 goto out;
8588
8589 *retval = eofflag; /* similar to getdirentries */
8590 error = 0;
8591 out:
8592 file_drop(fd);
8593 return (error); /* errors are returned earlier; retval is 0 or 1 here */
8594
8595 } /* end of getdirentriesattr system call */
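/*
 * Sketch of the attrlist setup that drives the authorization choice above:
 * a name-only request needs just KAUTH_VNODE_LIST_DIRECTORY, while any
 * other attribute adds KAUTH_VNODE_SEARCH.  Assumes the usual <sys/attr.h>
 * definitions; it does not issue the system call itself.
 */
#if 0	/* illustrative userland sketch */
#include <sys/attr.h>
#include <string.h>

static void
names_only_attrlist(struct attrlist *al)
{
	memset(al, 0, sizeof(*al));
	al->bitmapcount = ATTR_BIT_MAP_COUNT;
	al->commonattr = ATTR_CMN_NAME;		/* names only: LIST_DIRECTORY suffices */
	/*
	 * Adding e.g. ATTR_CMN_OBJTYPE, or any fileattr/dirattr bit, would
	 * require SEARCH on the directory as well.
	 */
}
#endif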
8596
8597 /*
8598 * Exchange data between two files
8599 */
8600
8601 /* ARGSUSED */
8602 int
8603 exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t *retval)
8604 {
8605
8606 struct nameidata fnd, snd;
8607 vfs_context_t ctx = vfs_context_current();
8608 vnode_t fvp;
8609 vnode_t svp;
8610 int error;
8611 u_int32_t nameiflags;
8612 char *fpath = NULL;
8613 char *spath = NULL;
8614 int flen=0, slen=0;
8615 int from_truncated=0, to_truncated=0;
8616 #if CONFIG_FSE
8617 fse_info f_finfo, s_finfo;
8618 #endif
8619
8620 nameiflags = 0;
8621 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8622
8623 NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1,
8624 UIO_USERSPACE, uap->path1, ctx);
8625
8626 error = namei(&fnd);
8627 if (error)
8628 goto out2;
8629
8630 nameidone(&fnd);
8631 fvp = fnd.ni_vp;
8632
8633 NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2,
8634 UIO_USERSPACE, uap->path2, ctx);
8635
8636 error = namei(&snd);
8637 if (error) {
8638 vnode_put(fvp);
8639 goto out2;
8640 }
8641 nameidone(&snd);
8642 svp = snd.ni_vp;
8643
8644 /*
8645 * if the files are the same, return EINVAL
8646 */
8647 if (svp == fvp) {
8648 error = EINVAL;
8649 goto out;
8650 }
8651
8652 /*
8653 * if the files are on different volumes, return an error
8654 */
8655 if (svp->v_mount != fvp->v_mount) {
8656 error = EXDEV;
8657 goto out;
8658 }
8659
8660 /* If they're not files, return an error */
8661 if ( (vnode_isreg(fvp) == 0) || (vnode_isreg(svp) == 0)) {
8662 error = EINVAL;
8663 goto out;
8664 }
8665
8666 #if CONFIG_MACF
8667 error = mac_vnode_check_exchangedata(ctx,
8668 fvp, svp);
8669 if (error)
8670 goto out;
8671 #endif
8672 if (((error = vnode_authorize(fvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) ||
8673 ((error = vnode_authorize(svp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0))
8674 goto out;
8675
8676 if (
8677 #if CONFIG_FSE
8678 need_fsevent(FSE_EXCHANGE, fvp) ||
8679 #endif
8680 kauth_authorize_fileop_has_listeners()) {
8681 GET_PATH(fpath);
8682 GET_PATH(spath);
8683 if (fpath == NULL || spath == NULL) {
8684 error = ENOMEM;
8685 goto out;
8686 }
8687
8688 flen = safe_getpath(fvp, NULL, fpath, MAXPATHLEN, &from_truncated);
8689 slen = safe_getpath(svp, NULL, spath, MAXPATHLEN, &to_truncated);
8690
8691 #if CONFIG_FSE
8692 get_fse_info(fvp, &f_finfo, ctx);
8693 get_fse_info(svp, &s_finfo, ctx);
8694 if (from_truncated || to_truncated) {
8695 // set it here since only the f_finfo gets reported up to user space
8696 f_finfo.mode |= FSE_TRUNCATED_PATH;
8697 }
8698 #endif
8699 }
8700 /* Ok, make the call */
8701 error = VNOP_EXCHANGE(fvp, svp, 0, ctx);
8702
8703 if (error == 0) {
8704 const char *tmpname;
8705
8706 if (fpath != NULL && spath != NULL) {
8707 /* call out to allow 3rd party notification of exchangedata.
8708 * Ignore result of kauth_authorize_fileop call.
8709 */
8710 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_EXCHANGE,
8711 (uintptr_t)fpath, (uintptr_t)spath);
8712 }
8713 name_cache_lock();
8714
8715 tmpname = fvp->v_name;
8716 fvp->v_name = svp->v_name;
8717 svp->v_name = tmpname;
8718
8719 if (fvp->v_parent != svp->v_parent) {
8720 vnode_t tmp;
8721
8722 tmp = fvp->v_parent;
8723 fvp->v_parent = svp->v_parent;
8724 svp->v_parent = tmp;
8725 }
8726 name_cache_unlock();
8727
8728 #if CONFIG_FSE
8729 if (fpath != NULL && spath != NULL) {
8730 add_fsevent(FSE_EXCHANGE, ctx,
8731 FSE_ARG_STRING, flen, fpath,
8732 FSE_ARG_FINFO, &f_finfo,
8733 FSE_ARG_STRING, slen, spath,
8734 FSE_ARG_FINFO, &s_finfo,
8735 FSE_ARG_DONE);
8736 }
8737 #endif
8738 }
8739
8740 out:
8741 if (fpath != NULL)
8742 RELEASE_PATH(fpath);
8743 if (spath != NULL)
8744 RELEASE_PATH(spath);
8745 vnode_put(svp);
8746 vnode_put(fvp);
8747 out2:
8748 return (error);
8749 }
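/*
 * From userland both paths must name regular files on the same volume, or
 * the checks above fail with EINVAL / EXDEV.  A minimal sketch assuming the
 * exchangedata() prototype from <unistd.h>; the file names are hypothetical.
 */
#if 0	/* illustrative userland sketch */
#include <unistd.h>
#include <sys/attr.h>	/* FSOPT_NOFOLLOW */
#include <stdio.h>

static void
swap_file_contents(void)
{
	if (exchangedata("/tmp/a.dat", "/tmp/b.dat", FSOPT_NOFOLLOW) == -1)
		perror("exchangedata");
}
#endif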
8750
8751 /*
8752 * Return (in MB) the amount of freespace on the given vnode's volume.
8753 */
8754 uint32_t freespace_mb(vnode_t vp);
8755
8756 uint32_t
8757 freespace_mb(vnode_t vp)
8758 {
8759 vfs_update_vfsstat(vp->v_mount, vfs_context_current(), VFS_USER_EVENT);
8760 return (((uint64_t)vp->v_mount->mnt_vfsstat.f_bavail *
8761 vp->v_mount->mnt_vfsstat.f_bsize) >> 20);
8762 }
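/*
 * Worked example of the conversion above: with f_bavail = 262144 free
 * blocks of f_bsize = 4096 bytes, the product is 1073741824 bytes (1 GiB),
 * and shifting right by 20 yields 1024 MB.
 */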
8763
8764 #if CONFIG_SEARCHFS
8765
8766 /* ARGSUSED */
8767
8768 int
8769 searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval)
8770 {
8771 vnode_t vp, tvp;
8772 int i, error=0;
8773 int fserror = 0;
8774 struct nameidata nd;
8775 struct user64_fssearchblock searchblock;
8776 struct searchstate *state;
8777 struct attrlist *returnattrs;
8778 struct timeval timelimit;
8779 void *searchparams1,*searchparams2;
8780 uio_t auio = NULL;
8781 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
8782 uint32_t nummatches;
8783 int mallocsize;
8784 uint32_t nameiflags;
8785 vfs_context_t ctx = vfs_context_current();
8786 char uio_buf[ UIO_SIZEOF(1) ];
8787
8788 /* Start by copying in fsearchblock parameter list */
8789 if (IS_64BIT_PROCESS(p)) {
8790 error = copyin(uap->searchblock, (caddr_t) &searchblock, sizeof(searchblock));
8791 timelimit.tv_sec = searchblock.timelimit.tv_sec;
8792 timelimit.tv_usec = searchblock.timelimit.tv_usec;
8793 }
8794 else {
8795 struct user32_fssearchblock tmp_searchblock;
8796
8797 error = copyin(uap->searchblock, (caddr_t) &tmp_searchblock, sizeof(tmp_searchblock));
8798 // munge into 64-bit version
8799 searchblock.returnattrs = CAST_USER_ADDR_T(tmp_searchblock.returnattrs);
8800 searchblock.returnbuffer = CAST_USER_ADDR_T(tmp_searchblock.returnbuffer);
8801 searchblock.returnbuffersize = tmp_searchblock.returnbuffersize;
8802 searchblock.maxmatches = tmp_searchblock.maxmatches;
8803 /*
8804 * These casts are safe. We will promote the tv_sec into a 64 bit long if necessary
8805 * from a 32 bit long, and tv_usec is already a signed 32 bit int.
8806 */
8807 timelimit.tv_sec = (__darwin_time_t) tmp_searchblock.timelimit.tv_sec;
8808 timelimit.tv_usec = (__darwin_useconds_t) tmp_searchblock.timelimit.tv_usec;
8809 searchblock.searchparams1 = CAST_USER_ADDR_T(tmp_searchblock.searchparams1);
8810 searchblock.sizeofsearchparams1 = tmp_searchblock.sizeofsearchparams1;
8811 searchblock.searchparams2 = CAST_USER_ADDR_T(tmp_searchblock.searchparams2);
8812 searchblock.sizeofsearchparams2 = tmp_searchblock.sizeofsearchparams2;
8813 searchblock.searchattrs = tmp_searchblock.searchattrs;
8814 }
8815 if (error)
8816 return(error);
8817
8818 /* Do a sanity check on sizeofsearchparams1 and sizeofsearchparams2.
8819 */
8820 if (searchblock.sizeofsearchparams1 > SEARCHFS_MAX_SEARCHPARMS ||
8821 searchblock.sizeofsearchparams2 > SEARCHFS_MAX_SEARCHPARMS)
8822 return(EINVAL);
8823
8824 /* Now malloc a big bunch of space to hold the search parameters, the attrlists and the search state. */
8825 /* It all has to go into local memory and it's not that big, so we might as well put it all together. */
8826 /* Searchparams1 shall be first so we might as well use that to hold the base address of the allocated*/
8827 /* block. */
8828 /* */
8829 /* NOTE: we allocate an extra 8 bytes to account for the difference in size of the searchstate */
8830 /* due to the changes in rdar://problem/12438273. That way if a 3rd party file system */
8831 /* assumes the size is still 556 bytes, it will continue to work */
8832
8833 mallocsize = searchblock.sizeofsearchparams1 + searchblock.sizeofsearchparams2 +
8834 sizeof(struct attrlist) + sizeof(struct searchstate) + (2*sizeof(uint32_t));
8835
8836 MALLOC(searchparams1, void *, mallocsize, M_TEMP, M_WAITOK);
8837
8838 /* Now set up the various pointers to the correct place in our newly allocated memory */
8839
8840 searchparams2 = (void *) (((caddr_t) searchparams1) + searchblock.sizeofsearchparams1);
8841 returnattrs = (struct attrlist *) (((caddr_t) searchparams2) + searchblock.sizeofsearchparams2);
8842 state = (struct searchstate *) (((caddr_t) returnattrs) + sizeof (struct attrlist));
8843
8844 /* Now copy in the stuff given our local variables. */
8845
8846 if ((error = copyin(searchblock.searchparams1, searchparams1, searchblock.sizeofsearchparams1)))
8847 goto freeandexit;
8848
8849 if ((error = copyin(searchblock.searchparams2, searchparams2, searchblock.sizeofsearchparams2)))
8850 goto freeandexit;
8851
8852 if ((error = copyin(searchblock.returnattrs, (caddr_t) returnattrs, sizeof(struct attrlist))))
8853 goto freeandexit;
8854
8855 if ((error = copyin(uap->state, (caddr_t) state, sizeof(struct searchstate))))
8856 goto freeandexit;
8857
8858 /*
8859 * When searching a union mount, need to set the
8860 * start flag at the first call on each layer to
8861 * reset state for the new volume.
8862 */
8863 if (uap->options & SRCHFS_START)
8864 state->ss_union_layer = 0;
8865 else
8866 uap->options |= state->ss_union_flags;
8867 state->ss_union_flags = 0;
8868
8869 /*
8870 * Because searchparams1 and searchparams2 may contain an ATTR_CMN_NAME search parameter,
8871 * which is passed in with an attrreference_t, we need to inspect the buffer manually here.
8872 * The KPI does not provide us the ability to pass in the length of the buffers searchparams1
8873 * and searchparams2. To obviate the need for all searchfs-supporting filesystems to
8874 * validate the user-supplied data offset of the attrreference_t, we'll do it here.
8875 */
8876
8877 if (searchblock.searchattrs.commonattr & ATTR_CMN_NAME) {
8878 attrreference_t* string_ref;
8879 u_int32_t* start_length;
8880 user64_size_t param_length;
8881
8882 /* validate searchparams1 */
8883 param_length = searchblock.sizeofsearchparams1;
8884 /* skip the word that specifies length of the buffer */
8885 start_length= (u_int32_t*) searchparams1;
8886 start_length= start_length+1;
8887 string_ref= (attrreference_t*) start_length;
8888
8889 /* ensure no negative offsets or too big offsets */
8890 if (string_ref->attr_dataoffset < 0 ) {
8891 error = EINVAL;
8892 goto freeandexit;
8893 }
8894 if (string_ref->attr_length > MAXPATHLEN) {
8895 error = EINVAL;
8896 goto freeandexit;
8897 }
8898
8899 /* Check for pointer overflow in the string ref */
8900 if (((char*) string_ref + string_ref->attr_dataoffset) < (char*) string_ref) {
8901 error = EINVAL;
8902 goto freeandexit;
8903 }
8904
8905 if (((char*) string_ref + string_ref->attr_dataoffset) > ((char*)searchparams1 + param_length)) {
8906 error = EINVAL;
8907 goto freeandexit;
8908 }
8909 if (((char*)string_ref + string_ref->attr_dataoffset + string_ref->attr_length) > ((char*)searchparams1 + param_length)) {
8910 error = EINVAL;
8911 goto freeandexit;
8912 }
8913 }
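/*
 * Buffer layout assumed by the checks above when ATTR_CMN_NAME is among the
 * search attributes (a sketch reconstructed from the validation logic,
 * offsets relative to the start of searchparams1):
 *
 *	[ u_int32_t buffer length ]
 *	[ attrreference_t { attr_dataoffset, attr_length } ]   <- string_ref
 *	[ ... name bytes at (char *)string_ref + attr_dataoffset ... ]
 *
 * attr_dataoffset must be non-negative, attr_length no more than MAXPATHLEN,
 * and both the offset and offset + length must land inside
 * sizeofsearchparams1.
 */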
8914
8915 /* set up the uio structure which will contain the user's return buffer */
8916 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf));
8917 uio_addiov(auio, searchblock.returnbuffer, searchblock.returnbuffersize);
8918
8919 nameiflags = 0;
8920 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
8921 NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1,
8922 UIO_USERSPACE, uap->path, ctx);
8923
8924 error = namei(&nd);
8925 if (error)
8926 goto freeandexit;
8927 vp = nd.ni_vp;
8928 nameidone(&nd);
8929
8930 /*
8931 * Switch to the root vnode for the volume
8932 */
8933 error = VFS_ROOT(vnode_mount(vp), &tvp, ctx);
8934 vnode_put(vp);
8935 if (error)
8936 goto freeandexit;
8937 vp = tvp;
8938
8939 /*
8940 * If it's a union mount, the path lookup takes
8941 * us to the top layer. But we may need to descend
8942 * to a lower layer. For non-union mounts the layer
8943 * is always zero.
8944 */
8945 for (i = 0; i < (int) state->ss_union_layer; i++) {
8946 if ((vp->v_mount->mnt_flag & MNT_UNION) == 0)
8947 break;
8948 tvp = vp;
8949 vp = vp->v_mount->mnt_vnodecovered;
8950 if (vp == NULL) {
8951 vnode_put(tvp);
8952 error = ENOENT;
8953 goto freeandexit;
8954 }
8955 error = vnode_getwithref(vp);
8956 vnode_put(tvp);
8957 if (error)
8958 goto freeandexit;
8959 }
8960
8961 #if CONFIG_MACF
8962 error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs);
8963 if (error) {
8964 vnode_put(vp);
8965 goto freeandexit;
8966 }
8967 #endif
8968
8969
8970 /*
8971 * If searchblock.maxmatches == 0, then skip the search. This has happened
8972 * before and sometimes the underlying code doesn't deal with it well.
8973 */
8974 if (searchblock.maxmatches == 0) {
8975 nummatches = 0;
8976 goto saveandexit;
8977 }
8978
8979 /*
8980 * All right, we have everything we need, so let's make that call.
8981 *
8982 * We keep special track of the return value from the file system:
8983 * EAGAIN is an acceptable error condition that shouldn't keep us
8984 * from copying out any results...
8985 */
8986
8987 fserror = VNOP_SEARCHFS(vp,
8988 searchparams1,
8989 searchparams2,
8990 &searchblock.searchattrs,
8991 (u_long)searchblock.maxmatches,
8992 &timelimit,
8993 returnattrs,
8994 &nummatches,
8995 (u_long)uap->scriptcode,
8996 (u_long)uap->options,
8997 auio,
8998 (struct searchstate *) &state->ss_fsstate,
8999 ctx);
9000
9001 /*
9002 * If it's a union mount we need to be called again
9003 * to search the mounted-on filesystem.
9004 */
9005 if ((vp->v_mount->mnt_flag & MNT_UNION) && fserror == 0) {
9006 state->ss_union_flags = SRCHFS_START;
9007 state->ss_union_layer++; // search next layer down
9008 fserror = EAGAIN;
9009 }
9010
9011 saveandexit:
9012
9013 vnode_put(vp);
9014
9015 /* Now copy out the stuff that needs copying out. That means the number of matches, the
9016 search state. Everything else was already put into the return buffer by the vop call. */
9017
9018 if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0)
9019 goto freeandexit;
9020
9021 if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0)
9022 goto freeandexit;
9023
9024 error = fserror;
9025
9026 freeandexit:
9027
9028 FREE(searchparams1,M_TEMP);
9029
9030 return(error);
9031
9032
9033 } /* end of searchfs system call */
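/*
 * Continuation protocol as seen from userland, a sketch based on the
 * SRCHFS_START / EAGAIN handling above (assuming the searchfs() wrapper
 * described in searchfs(2)): pass SRCHFS_START on the first call, treat the
 * copied-out searchstate as opaque, and while the call fails with EAGAIN
 * call it again without SRCHFS_START; matches already copied out remain
 * valid.
 */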
9034
9035 #else /* CONFIG_SEARCHFS */
9036
9037 int
9038 searchfs(__unused proc_t p, __unused struct searchfs_args *uap, __unused int32_t *retval)
9039 {
9040 return (ENOTSUP);
9041 }
9042
9043 #endif /* CONFIG_SEARCHFS */
9044
9045
9046 lck_grp_attr_t * nspace_group_attr;
9047 lck_attr_t * nspace_lock_attr;
9048 lck_grp_t * nspace_mutex_group;
9049
9050 lck_mtx_t nspace_handler_lock;
9051 lck_mtx_t nspace_handler_exclusion_lock;
9052
9053 time_t snapshot_timestamp=0;
9054 int nspace_allow_virtual_devs=0;
9055
9056 void nspace_handler_init(void);
9057
9058 typedef struct nspace_item_info {
9059 struct vnode *vp;
9060 void *arg;
9061 uint64_t op;
9062 uint32_t vid;
9063 uint32_t flags;
9064 uint32_t token;
9065 uint32_t refcount;
9066 } nspace_item_info;
9067
9068 #define MAX_NSPACE_ITEMS 128
9069 nspace_item_info nspace_items[MAX_NSPACE_ITEMS];
9070 uint32_t nspace_item_idx=0; // also used as the sleep/wakeup rendezvous address
9071 uint32_t nspace_token_id=0;
9072 uint32_t nspace_handler_timeout = 15; // seconds
9073
9074 #define NSPACE_ITEM_NEW 0x0001
9075 #define NSPACE_ITEM_PROCESSING 0x0002
9076 #define NSPACE_ITEM_DEAD 0x0004
9077 #define NSPACE_ITEM_CANCELLED 0x0008
9078 #define NSPACE_ITEM_DONE 0x0010
9079 #define NSPACE_ITEM_RESET_TIMER 0x0020
9080
9081 #define NSPACE_ITEM_NSPACE_EVENT 0x0040
9082 #define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080
9083
9084 #define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT)
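/*
 * Rough life cycle of an nspace_items[] slot, as implemented below:
 *
 *	resolve_nspace_item_ext()	flags = NSPACE_ITEM_NEW | <event type bit>
 *	wait_for_namespace_event()	NEW cleared, NSPACE_ITEM_PROCESSING set,
 *					a token is assigned and handed to the handler
 *	handler UNBLOCK / CANCEL	flags become NSPACE_ITEM_DONE, or
 *					NSPACE_ITEM_CANCELLED replaces PROCESSING
 *	waiter wakes up			refcount drops to zero and flags = 0,
 *					freeing the slot for re-use
 */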
9085
9086 //#pragma optimization_level 0
9087
9088 typedef enum {
9089 NSPACE_HANDLER_NSPACE = 0,
9090 NSPACE_HANDLER_SNAPSHOT = 1,
9091
9092 NSPACE_HANDLER_COUNT,
9093 } nspace_type_t;
9094
9095 typedef struct {
9096 uint64_t handler_tid;
9097 struct proc *handler_proc;
9098 int handler_busy;
9099 } nspace_handler_t;
9100
9101 nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT];
9102
9103 /* namespace fsctl functions */
9104 static int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type);
9105 static int nspace_item_flags_for_type(nspace_type_t nspace_type);
9106 static int nspace_open_flags_for_type(nspace_type_t nspace_type);
9107 static nspace_type_t nspace_type_for_op(uint64_t op);
9108 static int nspace_is_special_process(struct proc *proc);
9109 static int vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx);
9110 static int wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type);
9111 static int validate_namespace_args (int is64bit, int size);
9112 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data);
9113
9114
9115 static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type)
9116 {
9117 switch(nspace_type) {
9118 case NSPACE_HANDLER_NSPACE:
9119 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT;
9120 case NSPACE_HANDLER_SNAPSHOT:
9121 return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT;
9122 default:
9123 printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type);
9124 return 0;
9125 }
9126 }
9127
9128 static inline int nspace_item_flags_for_type(nspace_type_t nspace_type)
9129 {
9130 switch(nspace_type) {
9131 case NSPACE_HANDLER_NSPACE:
9132 return NSPACE_ITEM_NSPACE_EVENT;
9133 case NSPACE_HANDLER_SNAPSHOT:
9134 return NSPACE_ITEM_SNAPSHOT_EVENT;
9135 default:
9136 printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type);
9137 return 0;
9138 }
9139 }
9140
9141 static inline int nspace_open_flags_for_type(nspace_type_t nspace_type)
9142 {
9143 switch(nspace_type) {
9144 case NSPACE_HANDLER_NSPACE:
9145 return FREAD | FWRITE | O_EVTONLY;
9146 case NSPACE_HANDLER_SNAPSHOT:
9147 return FREAD | O_EVTONLY;
9148 default:
9149 printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type);
9150 return 0;
9151 }
9152 }
9153
9154 static inline nspace_type_t nspace_type_for_op(uint64_t op)
9155 {
9156 switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) {
9157 case NAMESPACE_HANDLER_NSPACE_EVENT:
9158 return NSPACE_HANDLER_NSPACE;
9159 case NAMESPACE_HANDLER_SNAPSHOT_EVENT:
9160 return NSPACE_HANDLER_SNAPSHOT;
9161 default:
9162 printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK);
9163 return NSPACE_HANDLER_NSPACE;
9164 }
9165 }
9166
9167 static inline int nspace_is_special_process(struct proc *proc)
9168 {
9169 int i;
9170 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9171 if (proc == nspace_handlers[i].handler_proc)
9172 return 1;
9173 }
9174 return 0;
9175 }
9176
9177 void
9178 nspace_handler_init(void)
9179 {
9180 nspace_lock_attr = lck_attr_alloc_init();
9181 nspace_group_attr = lck_grp_attr_alloc_init();
9182 nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr);
9183 lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr);
9184 lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr);
9185 memset(&nspace_items[0], 0, sizeof(nspace_items));
9186 }
9187
9188 void
9189 nspace_proc_exit(struct proc *p)
9190 {
9191 int i, event_mask = 0;
9192
9193 for (i = 0; i < NSPACE_HANDLER_COUNT; i++) {
9194 if (p == nspace_handlers[i].handler_proc) {
9195 event_mask |= nspace_item_flags_for_type(i);
9196 nspace_handlers[i].handler_tid = 0;
9197 nspace_handlers[i].handler_proc = NULL;
9198 }
9199 }
9200
9201 if (event_mask == 0) {
9202 return;
9203 }
9204
9205 lck_mtx_lock(&nspace_handler_lock);
9206 if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) {
9207 // if this process was the snapshot handler, zero snapshot_timestamp
9208 snapshot_timestamp = 0;
9209 }
9210
9211 //
9212 // unblock anyone that's waiting for the handler that died
9213 //
9214 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9215 if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) {
9216
9217 if ( nspace_items[i].flags & event_mask ) {
9218
9219 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9220 vnode_lock_spin(nspace_items[i].vp);
9221 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9222 vnode_unlock(nspace_items[i].vp);
9223 }
9224 nspace_items[i].vp = NULL;
9225 nspace_items[i].vid = 0;
9226 nspace_items[i].flags = NSPACE_ITEM_DONE;
9227 nspace_items[i].token = 0;
9228
9229 wakeup((caddr_t)&(nspace_items[i].vp));
9230 }
9231 }
9232 }
9233
9234 wakeup((caddr_t)&nspace_item_idx);
9235 lck_mtx_unlock(&nspace_handler_lock);
9236 }
9237
9238
9239 int
9240 resolve_nspace_item(struct vnode *vp, uint64_t op)
9241 {
9242 return resolve_nspace_item_ext(vp, op, NULL);
9243 }
9244
9245 int
9246 resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg)
9247 {
9248 int i, error, keep_waiting;
9249 struct timespec ts;
9250 nspace_type_t nspace_type = nspace_type_for_op(op);
9251
9252 // only allow namespace events on regular files, directories and symlinks.
9253 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
9254 return 0;
9255 }
9256
9257 //
9258 // if this is a snapshot event and the vnode is on a
9259 // disk image just pretend nothing happened since any
9260 // change to the disk image will cause the disk image
9261 // itself to get backed up and this avoids multi-way
9262 // deadlocks between the snapshot handler and the ever
9263 // popular diskimages-helper process. the variable
9264 // nspace_allow_virtual_devs allows this behavior to
9265 // be overridden (for use by the Mobile TimeMachine
9266 // testing infrastructure which uses disk images)
9267 //
9268 if ( (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT)
9269 && (vp->v_mount != NULL)
9270 && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
9271 && !nspace_allow_virtual_devs) {
9272
9273 return 0;
9274 }
9275
9276 // if (thread_tid(current_thread()) == namespace_handler_tid) {
9277 if (nspace_handlers[nspace_type].handler_proc == NULL) {
9278 return 0;
9279 }
9280
9281 if (nspace_is_special_process(current_proc())) {
9282 return EDEADLK;
9283 }
9284
9285 lck_mtx_lock(&nspace_handler_lock);
9286
9287 retry:
9288 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9289 if (vp == nspace_items[i].vp && op == nspace_items[i].op) {
9290 break;
9291 }
9292 }
9293
9294 if (i >= MAX_NSPACE_ITEMS) {
9295 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9296 if (nspace_items[i].flags == 0) {
9297 break;
9298 }
9299 }
9300 } else {
9301 nspace_items[i].refcount++;
9302 }
9303
9304 if (i >= MAX_NSPACE_ITEMS) {
9305 ts.tv_sec = nspace_handler_timeout;
9306 ts.tv_nsec = 0;
9307
9308 error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts);
9309 if (error == 0) {
9310 // an entry got free'd up, go see if we can get a slot
9311 goto retry;
9312 } else {
9313 lck_mtx_unlock(&nspace_handler_lock);
9314 return error;
9315 }
9316 }
9317
9318 //
9319 // if it didn't already exist, add it. if it did exist
9320 // we'll get woken up when someone does a wakeup() on
9321 // the slot in the nspace_items table.
9322 //
9323 if (vp != nspace_items[i].vp) {
9324 nspace_items[i].vp = vp;
9325 nspace_items[i].arg = (arg == NSPACE_REARM_NO_ARG) ? NULL : arg; // arg is {NULL, true, uio *} - only pass uio thru to the user
9326 nspace_items[i].op = op;
9327 nspace_items[i].vid = vnode_vid(vp);
9328 nspace_items[i].flags = NSPACE_ITEM_NEW;
9329 nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type);
9330 if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) {
9331 if (arg) {
9332 vnode_lock_spin(vp);
9333 vp->v_flag |= VNEEDSSNAPSHOT;
9334 vnode_unlock(vp);
9335 }
9336 }
9337
9338 nspace_items[i].token = 0;
9339 nspace_items[i].refcount = 1;
9340
9341 wakeup((caddr_t)&nspace_item_idx);
9342 }
9343
9344 //
9345 // Now go to sleep until the handler does a wakeup on this
9346 // slot in the nspace_items table (or we timeout).
9347 //
9348 keep_waiting = 1;
9349 while(keep_waiting) {
9350 ts.tv_sec = nspace_handler_timeout;
9351 ts.tv_nsec = 0;
9352 error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts);
9353
9354 if (nspace_items[i].flags & NSPACE_ITEM_DONE) {
9355 error = 0;
9356 } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) {
9357 error = nspace_items[i].token;
9358 } else if (error == EWOULDBLOCK || error == ETIMEDOUT) {
9359 if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) {
9360 nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER;
9361 continue;
9362 } else {
9363 error = ETIMEDOUT;
9364 }
9365 } else if (error == 0) {
9366 // hmmm, why did we get woken up?
9367 printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n",
9368 nspace_items[i].token);
9369 }
9370
9371 if (--nspace_items[i].refcount == 0) {
9372 nspace_items[i].vp = NULL; // clear this so that no one will match on it again
9373 nspace_items[i].arg = NULL;
9374 nspace_items[i].token = 0; // clear this so that the handler will not find it anymore
9375 nspace_items[i].flags = 0; // this clears it for re-use
9376 }
9377 wakeup(&nspace_token_id);
9378 keep_waiting = 0;
9379 }
9380
9381 lck_mtx_unlock(&nspace_handler_lock);
9382
9383 return error;
9384 }
9385
9386 int nspace_snapshot_event(vnode_t vp, time_t ctime, uint64_t op_type, void *arg)
9387 {
9388 int snapshot_error = 0;
9389
9390 if (vp == NULL) {
9391 return 0;
9392 }
9393
9394 /* Swap files are special; skip them */
9395 if (vnode_isswap(vp)) {
9396 return 0;
9397 }
9398
9399 if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) {
9400 // the change time is within this epoch
9401 int error;
9402
9403 error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg);
9404 if (error == EDEADLK) {
9405 snapshot_error = 0;
9406 } else if (error) {
9407 if (error == EAGAIN) {
9408 printf("nspace_snapshot_event: timed out waiting for namespace handler...\n");
9409 } else if (error == EINTR) {
9410 // printf("nspace_snapshot_event: got a signal while waiting for namespace handler...\n");
9411 snapshot_error = EINTR;
9412 }
9413 }
9414 }
9415
9416 return snapshot_error;
9417 }
9418
9419 int
9420 get_nspace_item_status(struct vnode *vp, int32_t *status)
9421 {
9422 int i;
9423
9424 lck_mtx_lock(&nspace_handler_lock);
9425 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
9426 if (nspace_items[i].vp == vp) {
9427 break;
9428 }
9429 }
9430
9431 if (i >= MAX_NSPACE_ITEMS) {
9432 lck_mtx_unlock(&nspace_handler_lock);
9433 return ENOENT;
9434 }
9435
9436 *status = nspace_items[i].flags;
9437 lck_mtx_unlock(&nspace_handler_lock);
9438 return 0;
9439 }
9440
9441
9442 #if 0
9443 static int
9444 build_volfs_path(struct vnode *vp, char *path, int *len)
9445 {
9446 struct vnode_attr va;
9447 int ret;
9448
9449 VATTR_INIT(&va);
9450 VATTR_WANTED(&va, va_fsid);
9451 VATTR_WANTED(&va, va_fileid);
9452
9453 if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) {
9454 *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1;
9455 ret = -1;
9456 } else {
9457 *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1;
9458 ret = 0;
9459 }
9460
9461 return ret;
9462 }
9463 #endif
9464
9465 //
9466 // Note: this function does NOT check permissions on all of the
9467 // parent directories leading to this vnode. It should only be
9468 // called on behalf of a root process. Otherwise a process may
9469 // get access to a file because the file itself is readable even
9470 // though its parent directories would prevent access.
9471 //
9472 static int
9473 vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx)
9474 {
9475 int error, action;
9476
9477 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9478 return error;
9479 }
9480
9481 #if CONFIG_MACF
9482 error = mac_vnode_check_open(ctx, vp, fmode);
9483 if (error)
9484 return error;
9485 #endif
9486
9487 /* compute action to be authorized */
9488 action = 0;
9489 if (fmode & FREAD) {
9490 action |= KAUTH_VNODE_READ_DATA;
9491 }
9492 if (fmode & (FWRITE | O_TRUNC)) {
9493 /*
9494 * If we are writing, appending, and not truncating,
9495 * indicate that we are appending so that if the
9496 * UF_APPEND or SF_APPEND bits are set, we do not deny
9497 * the open.
9498 */
9499 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
9500 action |= KAUTH_VNODE_APPEND_DATA;
9501 } else {
9502 action |= KAUTH_VNODE_WRITE_DATA;
9503 }
9504 }
9505
9506 if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0)
9507 return error;
9508
9509
9510 //
9511 // if the vnode is tagged VOPENEVT and the current process
9512 // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY
9513 // flag to the open mode so that this open won't count against
9514 // the vnode when carbon delete() does a vnode_isinuse() to see
9515 // if a file is currently in use. this allows spotlight
9516 // importers to not interfere with carbon apps that depend on
9517 // the no-delete-if-busy semantics of carbon delete().
9518 //
9519 if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) {
9520 fmode |= O_EVTONLY;
9521 }
9522
9523 if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) {
9524 return error;
9525 }
9526 if ( (error = vnode_ref_ext(vp, fmode, 0)) ) {
9527 VNOP_CLOSE(vp, fmode, ctx);
9528 return error;
9529 }
9530
9531 /* Call out to allow 3rd party notification of open.
9532 * Ignore result of kauth_authorize_fileop call.
9533 */
9534 #if CONFIG_MACF
9535 mac_vnode_notify_open(ctx, vp, fmode);
9536 #endif
9537 kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN,
9538 (uintptr_t)vp, 0);
9539
9540
9541 return 0;
9542 }
9543
9544 static int
9545 wait_for_namespace_event(namespace_handler_data *nhd, nspace_type_t nspace_type)
9546 {
9547 int i;
9548 int error = 0;
9549 int unblock = 0;
9550 task_t curtask;
9551
9552 lck_mtx_lock(&nspace_handler_exclusion_lock);
9553 if (nspace_handlers[nspace_type].handler_busy) {
9554 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9555 return EBUSY;
9556 }
9557
9558 nspace_handlers[nspace_type].handler_busy = 1;
9559 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9560
9561 /*
9562 * Any process that gets here will be one of the namespace handlers.
9563 * As such, it should be prevented from acquiring DMG vnodes during vnode reclamation,
9564 * since that can cause deadlocks: the namespace handler may prevent
9565 * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE
9566 * process.
9567 */
9568 curtask = current_task();
9569 bsd_set_dependency_capable (curtask);
9570
9571 lck_mtx_lock(&nspace_handler_lock);
9572 if (nspace_handlers[nspace_type].handler_proc == NULL) {
9573 nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread());
9574 nspace_handlers[nspace_type].handler_proc = current_proc();
9575 }
9576
9577 if (nspace_type == NSPACE_HANDLER_SNAPSHOT &&
9578 (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9579 error = EINVAL;
9580 }
9581
9582 while (error == 0) {
9583
9584 /* Try to find matching namespace item */
9585 for (i = 0; i < MAX_NSPACE_ITEMS; i++) {
9586 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9587 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9588 break;
9589 }
9590 }
9591 }
9592
9593 if (i >= MAX_NSPACE_ITEMS) {
9594 /* Nothing is there yet. Wait for wake up and retry */
9595 error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0);
9596 if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9597 /* Prevent infinite loop if snapshot handler exited */
9598 error = EINVAL;
9599 break;
9600 }
9601 continue;
9602 }
9603
9604 nspace_items[i].flags &= ~NSPACE_ITEM_NEW;
9605 nspace_items[i].flags |= NSPACE_ITEM_PROCESSING;
9606 nspace_items[i].token = ++nspace_token_id;
9607
9608 assert(nspace_items[i].vp);
9609 struct fileproc *fp;
9610 int32_t indx;
9611 int32_t fmode;
9612 struct proc *p = current_proc();
9613 vfs_context_t ctx = vfs_context_current();
9614 struct vnode_attr va;
9615 bool vn_get_successful = false;
9616 bool vn_open_successful = false;
9617 bool fp_alloc_successful = false;
9618
9619 /*
9620 * Use vnode pointer to acquire a file descriptor for
9621 * hand-off to userland
9622 */
9623 fmode = nspace_open_flags_for_type(nspace_type);
9624 error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid);
9625 if (error) goto cleanup;
9626 vn_get_successful = true;
9627
9628 error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx);
9629 if (error) goto cleanup;
9630 vn_open_successful = true;
9631
9632 error = falloc(p, &fp, &indx, ctx);
9633 if (error) goto cleanup;
9634 fp_alloc_successful = true;
9635
9636 fp->f_fglob->fg_flag = fmode;
9637 fp->f_fglob->fg_ops = &vnops;
9638 fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp;
9639
9640 proc_fdlock(p);
9641 procfdtbl_releasefd(p, indx, NULL);
9642 fp_drop(p, indx, fp, 1);
9643 proc_fdunlock(p);
9644
9645 /*
9646 * All variants of the namespace handler struct support these three fields:
9647 * token, flags, and the FD pointer
9648 */
9649 error = copyout(&nspace_items[i].token, nhd->token, sizeof(uint32_t));
9650 if (error) goto cleanup;
9651 error = copyout(&nspace_items[i].op, nhd->flags, sizeof(uint64_t));
9652 if (error) goto cleanup;
9653 error = copyout(&indx, nhd->fdptr, sizeof(uint32_t));
9654 if (error) goto cleanup;
9655
9656 /*
9657 * Handle optional fields:
9658 * the extended version supports an info ptr (offset, length), and the
9659 * namedata version supports a unique per-link object ID
9661 *
9662 */
9663 if (nhd->infoptr) {
9664 uio_t uio = (uio_t)nspace_items[i].arg;
9665 uint64_t u_offset, u_length;
9666
9667 if (uio) {
9668 u_offset = uio_offset(uio);
9669 u_length = uio_resid(uio);
9670 } else {
9671 u_offset = 0;
9672 u_length = 0;
9673 }
9674 error = copyout(&u_offset, nhd->infoptr, sizeof(uint64_t));
9675 if (error) goto cleanup;
9676 error = copyout(&u_length, nhd->infoptr + sizeof(uint64_t), sizeof(uint64_t));
9677 if (error) goto cleanup;
9678 }
9679
9680 if (nhd->objid) {
9681 VATTR_INIT(&va);
9682 VATTR_WANTED(&va, va_linkid);
9683 error = vnode_getattr(nspace_items[i].vp, &va, ctx);
9684 if (error) goto cleanup;
9685
9686 uint64_t linkid = 0;
9687 if (VATTR_IS_SUPPORTED (&va, va_linkid)) {
9688 linkid = (uint64_t)va.va_linkid;
9689 }
9690 error = copyout(&linkid, nhd->objid, sizeof(uint64_t));
9691 }
9692 cleanup:
9693 if (error) {
9694 if (fp_alloc_successful) fp_free(p, indx, fp);
9695 if (vn_open_successful) vn_close(nspace_items[i].vp, fmode, ctx);
9696 unblock = 1;
9697 }
9698
9699 if (vn_get_successful) vnode_put(nspace_items[i].vp);
9700
9701 break;
9702 }
9703
9704 if (unblock) {
9705 if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) {
9706 vnode_lock_spin(nspace_items[i].vp);
9707 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
9708 vnode_unlock(nspace_items[i].vp);
9709 }
9710 nspace_items[i].vp = NULL;
9711 nspace_items[i].vid = 0;
9712 nspace_items[i].flags = NSPACE_ITEM_DONE;
9713 nspace_items[i].token = 0;
9714
9715 wakeup((caddr_t)&(nspace_items[i].vp));
9716 }
9717
9718 if (nspace_type == NSPACE_HANDLER_SNAPSHOT) {
9719 // just go through every snapshot event and unblock it immediately.
9720 if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) {
9721 for(i = 0; i < MAX_NSPACE_ITEMS; i++) {
9722 if (nspace_items[i].flags & NSPACE_ITEM_NEW) {
9723 if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) {
9724 nspace_items[i].vp = NULL;
9725 nspace_items[i].vid = 0;
9726 nspace_items[i].flags = NSPACE_ITEM_DONE;
9727 nspace_items[i].token = 0;
9728
9729 wakeup((caddr_t)&(nspace_items[i].vp));
9730 }
9731 }
9732 }
9733 }
9734 }
9735
9736 lck_mtx_unlock(&nspace_handler_lock);
9737
9738 lck_mtx_lock(&nspace_handler_exclusion_lock);
9739 nspace_handlers[nspace_type].handler_busy = 0;
9740 lck_mtx_unlock(&nspace_handler_exclusion_lock);
9741
9742 return error;
9743 }
9744
9745 static inline int validate_namespace_args (int is64bit, int size) {
9746
9747 if (is64bit) {
9748 /* Must be one of these */
9749 if (size == sizeof(user64_namespace_handler_info)) {
9750 goto sizeok;
9751 }
9752 if (size == sizeof(user64_namespace_handler_info_ext)) {
9753 goto sizeok;
9754 }
9755 if (size == sizeof(user64_namespace_handler_data)) {
9756 goto sizeok;
9757 }
9758 return EINVAL;
9759 }
9760 else {
9761 /* 32 bit -- must be one of these */
9762 if (size == sizeof(user32_namespace_handler_info)) {
9763 goto sizeok;
9764 }
9765 if (size == sizeof(user32_namespace_handler_info_ext)) {
9766 goto sizeok;
9767 }
9768 if (size == sizeof(user32_namespace_handler_data)) {
9769 goto sizeok;
9770 }
9771 return EINVAL;
9772 }
9773
9774 sizeok:
9775
9776 return 0;
9777
9778 }
9779
9780 static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data)
9781 {
9782 int error = 0;
9783 namespace_handler_data nhd;
9784
9785 bzero (&nhd, sizeof(namespace_handler_data));
9786
9787 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9788 return error;
9789 }
9790
9791 error = validate_namespace_args (is64bit, size);
9792 if (error) {
9793 return error;
9794 }
9795
9796 /* Copy in the userland pointers into our kernel-only struct */
9797
9798 if (is64bit) {
9799 /* 64 bit userland structures */
9800 nhd.token = (user_addr_t)((user64_namespace_handler_info *)data)->token;
9801 nhd.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags;
9802 nhd.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr;
9803
9804 /* If the size is greater than the standard info struct, add in extra fields */
9805 if (size > (sizeof(user64_namespace_handler_info))) {
9806 if (size >= (sizeof(user64_namespace_handler_info_ext))) {
9807 nhd.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr;
9808 }
9809 if (size == (sizeof(user64_namespace_handler_data))) {
9810 nhd.objid = (user_addr_t)((user64_namespace_handler_data*)data)->objid;
9811 }
9812 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9813 }
9814 }
9815 else {
9816 /* 32 bit userland structures */
9817 nhd.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token);
9818 nhd.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags);
9819 nhd.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr);
9820
9821 if (size > (sizeof(user32_namespace_handler_info))) {
9822 if (size >= (sizeof(user32_namespace_handler_info_ext))) {
9823 nhd.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr);
9824 }
9825 if (size == (sizeof(user32_namespace_handler_data))) {
9826 nhd.objid = (user_addr_t)((user32_namespace_handler_data*)data)->objid;
9827 }
9828 /* Otherwise the fields were pre-zeroed when we did the bzero above. */
9829 }
9830 }
9831
9832 return wait_for_namespace_event(&nhd, nspace_type);
9833 }
9834
9835 /*
9836 * Make a filesystem-specific control call:
9837 */
9838 /* ARGSUSED */
9839 static int
9840 fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long options, vfs_context_t ctx)
9841 {
9842 int error=0;
9843 boolean_t is64bit;
9844 u_int size;
9845 #define STK_PARAMS 128
9846 char stkbuf[STK_PARAMS] = {0};
9847 caddr_t data, memp;
9848 vnode_t vp = *arg_vp;
9849
9850 size = IOCPARM_LEN(cmd);
9851 if (size > IOCPARM_MAX) return (EINVAL);
9852
9853 is64bit = proc_is64bit(p);
9854
9855 memp = NULL;
9856
9857
9858 /*
9859 * ensure the buffer is large enough for underlying calls
9860 */
9861 #ifndef HFSIOC_GETPATH
9862 typedef char pn_t[MAXPATHLEN];
9863 #define HFSIOC_GETPATH _IOWR('h', 13, pn_t)
9864 #endif
9865
9866 #ifndef HFS_GETPATH
9867 #define HFS_GETPATH IOCBASECMD(HFSIOC_GETPATH)
9868 #endif
9869 if (IOCBASECMD(cmd) == HFS_GETPATH) {
9870 /* Round up to MAXPATHLEN regardless of user input */
9871 size = MAXPATHLEN;
9872 }
9873
9874 if (size > sizeof (stkbuf)) {
9875 if ((memp = (caddr_t)kalloc(size)) == 0) return ENOMEM;
9876 data = memp;
9877 } else {
9878 data = &stkbuf[0];
9879 };
9880
9881 if (cmd & IOC_IN) {
9882 if (size) {
9883 error = copyin(udata, data, size);
9884 if (error) {
9885 if (memp) {
9886 kfree (memp, size);
9887 }
9888 return error;
9889 }
9890 } else {
9891 if (is64bit) {
9892 *(user_addr_t *)data = udata;
9893 }
9894 else {
9895 *(uint32_t *)data = (uint32_t)udata;
9896 }
9897 };
9898 } else if ((cmd & IOC_OUT) && size) {
9899 /*
9900 * Zero the buffer so the user always
9901 * gets back something deterministic.
9902 */
9903 bzero(data, size);
9904 } else if (cmd & IOC_VOID) {
9905 if (is64bit) {
9906 *(user_addr_t *)data = udata;
9907 }
9908 else {
9909 *(uint32_t *)data = (uint32_t)udata;
9910 }
9911 }
9912
9913 /* Check to see if it's a generic command */
9914 switch (IOCBASECMD(cmd)) {
9915
9916 case FSCTL_SYNC_VOLUME: {
9917 mount_t mp = vp->v_mount;
9918 int arg = *(uint32_t*)data;
9919
9920 /* record vid of vp so we can drop it below. */
9921 uint32_t vvid = vp->v_id;
9922
9923 /*
9924 * Then grab mount_iterref so that we can release the vnode.
9925 * Without this, a thread may call vnode_iterate_prepare then
9926 * get into a deadlock because we've never released the root vp
9927 */
9928 error = mount_iterref (mp, 0);
9929 if (error) {
9930 break;
9931 }
9932 vnode_put(vp);
9933
9934 /* issue the sync for this volume */
9935 (void)sync_callback(mp, (arg & FSCTL_SYNC_WAIT) ? &arg : NULL);
9936
9937 /*
9938 * Then release the mount_iterref once we're done syncing; it's not
9939 * needed for the VNOP_IOCTL below
9940 */
9941 mount_iterdrop(mp);
9942
9943 if (arg & FSCTL_SYNC_FULLSYNC) {
9944 /* re-obtain vnode iocount on the root vp, if possible */
9945 error = vnode_getwithvid (vp, vvid);
9946 if (error == 0) {
9947 error = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
9948 vnode_put (vp);
9949 }
9950 }
9951 /* mark the argument VP as having been released */
9952 *arg_vp = NULL;
9953 }
9954 break;
9955
9956 case FSCTL_ROUTEFS_SETROUTEID: {
9957 #if ROUTEFS
9958 char routepath[MAXPATHLEN];
9959 size_t len = 0;
9960
9961 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
9962 break;
9963 }
9964 bzero(routepath, MAXPATHLEN);
9965 error = copyinstr(udata, &routepath[0], MAXPATHLEN, &len);
9966 if (error) {
9967 break;
9968 }
9969 error = routefs_kernel_mount(routepath);
9970 if (error) {
9971 break;
9972 }
9973 #endif
9974 }
9975 break;
9976
9977 case FSCTL_SET_PACKAGE_EXTS: {
9978 user_addr_t ext_strings;
9979 uint32_t num_entries;
9980 uint32_t max_width;
9981
9982 if ((error = priv_check_cred(kauth_cred_get(), PRIV_PACKAGE_EXTENSIONS, 0)))
9983 break;
9984
9985 if ( (is64bit && size != sizeof(user64_package_ext_info))
9986 || (is64bit == 0 && size != sizeof(user32_package_ext_info))) {
9987
9988 // either you're 64-bit and passed a 64-bit struct or
9989 // you're 32-bit and passed a 32-bit struct. otherwise
9990 // it's not ok.
9991 error = EINVAL;
9992 break;
9993 }
9994
9995 if (is64bit) {
9996 ext_strings = ((user64_package_ext_info *)data)->strings;
9997 num_entries = ((user64_package_ext_info *)data)->num_entries;
9998 max_width = ((user64_package_ext_info *)data)->max_width;
9999 } else {
10000 ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings);
10001 num_entries = ((user32_package_ext_info *)data)->num_entries;
10002 max_width = ((user32_package_ext_info *)data)->max_width;
10003 }
10004 error = set_package_extensions_table(ext_strings, num_entries, max_width);
10005 }
10006 break;
10007
10008 /* namespace handlers */
10009 case FSCTL_NAMESPACE_HANDLER_GET: {
10010 error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data);
10011 }
10012 break;
10013
10014 /* Snapshot handlers */
10015 case FSCTL_OLD_SNAPSHOT_HANDLER_GET: {
10016 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10017 }
10018 break;
10019
10020 case FSCTL_SNAPSHOT_HANDLER_GET_EXT: {
10021 error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data);
10022 }
10023 break;
10024
10025 case FSCTL_NAMESPACE_HANDLER_UPDATE: {
10026 uint32_t token, val;
10027 int i;
10028
10029 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10030 break;
10031 }
10032
10033 if (!nspace_is_special_process(p)) {
10034 error = EINVAL;
10035 break;
10036 }
10037
10038 token = ((uint32_t *)data)[0];
10039 val = ((uint32_t *)data)[1];
10040
10041 lck_mtx_lock(&nspace_handler_lock);
10042
10043 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10044 if (nspace_items[i].token == token) {
10045 break; /* exit for loop, not case stmt */
10046 }
10047 }
10048
10049 if (i >= MAX_NSPACE_ITEMS) {
10050 error = ENOENT;
10051 } else {
10052 //
10053 // if this bit is set, when resolve_nspace_item() times out
10054 // it will loop and go back to sleep.
10055 //
10056 nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER;
10057 }
10058
10059 lck_mtx_unlock(&nspace_handler_lock);
10060
10061 if (error) {
10062 printf("nspace-handler-update: did not find token %u\n", token);
10063 }
10064 }
10065 break;
10066
10067 case FSCTL_NAMESPACE_HANDLER_UNBLOCK: {
10068 uint32_t token, val;
10069 int i;
10070
10071 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10072 break;
10073 }
10074
10075 if (!nspace_is_special_process(p)) {
10076 error = EINVAL;
10077 break;
10078 }
10079
10080 token = ((uint32_t *)data)[0];
10081 val = ((uint32_t *)data)[1];
10082
10083 lck_mtx_lock(&nspace_handler_lock);
10084
10085 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10086 if (nspace_items[i].token == token) {
10087 break; /* exit for loop, not case statement */
10088 }
10089 }
10090
10091 if (i >= MAX_NSPACE_ITEMS) {
10092 printf("nspace-handler-unblock: did not find token %u\n", token);
10093 error = ENOENT;
10094 } else {
10095 if (val == 0 && nspace_items[i].vp) {
10096 vnode_lock_spin(nspace_items[i].vp);
10097 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10098 vnode_unlock(nspace_items[i].vp);
10099 }
10100
10101 nspace_items[i].vp = NULL;
10102 nspace_items[i].arg = NULL;
10103 nspace_items[i].op = 0;
10104 nspace_items[i].vid = 0;
10105 nspace_items[i].flags = NSPACE_ITEM_DONE;
10106 nspace_items[i].token = 0;
10107
10108 wakeup((caddr_t)&(nspace_items[i].vp));
10109 }
10110
10111 lck_mtx_unlock(&nspace_handler_lock);
10112 }
10113 break;
10114
10115 case FSCTL_NAMESPACE_HANDLER_CANCEL: {
10116 uint32_t token, val;
10117 int i;
10118
10119 if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) {
10120 break;
10121 }
10122
10123 if (!nspace_is_special_process(p)) {
10124 error = EINVAL;
10125 break;
10126 }
10127
10128 token = ((uint32_t *)data)[0];
10129 val = ((uint32_t *)data)[1];
10130
10131 lck_mtx_lock(&nspace_handler_lock);
10132
10133 for(i=0; i < MAX_NSPACE_ITEMS; i++) {
10134 if (nspace_items[i].token == token) {
10135 break; /* exit for loop, not case stmt */
10136 }
10137 }
10138
10139 if (i >= MAX_NSPACE_ITEMS) {
10140 printf("nspace-handler-cancel: did not find token %u\n", token);
10141 error = ENOENT;
10142 } else {
10143 if (nspace_items[i].vp) {
10144 vnode_lock_spin(nspace_items[i].vp);
10145 nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT;
10146 vnode_unlock(nspace_items[i].vp);
10147 }
10148
10149 nspace_items[i].vp = NULL;
10150 nspace_items[i].arg = NULL;
10151 nspace_items[i].vid = 0;
10152 nspace_items[i].token = val;
10153 nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING;
10154 nspace_items[i].flags |= NSPACE_ITEM_CANCELLED;
10155
10156 wakeup((caddr_t)&(nspace_items[i].vp));
10157 }
10158
10159 lck_mtx_unlock(&nspace_handler_lock);
10160 }
10161 break;
10162
10163 case FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME: {
10164 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10165 break;
10166 }
10167
10168 // we explicitly do not do the namespace_handler_proc check here
10169
10170 lck_mtx_lock(&nspace_handler_lock);
10171 snapshot_timestamp = ((uint32_t *)data)[0];
10172 wakeup(&nspace_item_idx);
10173 lck_mtx_unlock(&nspace_handler_lock);
10174 printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp);
10175
10176 }
10177 break;
10178
10179 case FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS:
10180 {
10181 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10182 break;
10183 }
10184
10185 lck_mtx_lock(&nspace_handler_lock);
10186 nspace_allow_virtual_devs = ((uint32_t *)data)[0];
10187 lck_mtx_unlock(&nspace_handler_lock);
10188 printf("nspace-snapshot-handler will%s allow events on disk-images\n",
10189 nspace_allow_virtual_devs ? "" : " NOT");
10190 error = 0;
10191
10192 }
10193 break;
10194
10195 case FSCTL_SET_FSTYPENAME_OVERRIDE:
10196 {
10197 if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) {
10198 break;
10199 }
10200 if (vp->v_mount) {
10201 mount_lock(vp->v_mount);
10202 if (data[0] != 0) {
10203 strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN);
10204 vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE;
10205 if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10206 vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY;
10207 vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE;
10208 }
10209 } else {
10210 if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) {
10211 vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY;
10212 }
10213 vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE;
10214 vp->v_mount->fstypename_override[0] = '\0';
10215 }
10216 mount_unlock(vp->v_mount);
10217 }
10218 }
10219 break;
10220
10221 default: {
10222 /* Invoke the filesystem-specific code */
10223 error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx);
10224 }
10225
10226 } /* end switch stmt */
10227
10228 /*
10229 * if no errors, copy any data to user. Size was
10230 * already set and checked above.
10231 */
10232 if (error == 0 && (cmd & IOC_OUT) && size)
10233 error = copyout(data, udata, size);
10234
10235 if (memp) {
10236 kfree(memp, size);
10237 }
10238
10239 return error;
10240 }
10241
10242 /* ARGSUSED */
10243 int
10244 fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval)
10245 {
10246 int error;
10247 struct nameidata nd;
10248 u_long nameiflags;
10249 vnode_t vp = NULL;
10250 vfs_context_t ctx = vfs_context_current();
10251
10252 AUDIT_ARG(cmd, uap->cmd);
10253 AUDIT_ARG(value32, uap->options);
10254 /* Get the vnode for the file we are getting info on: */
10255 nameiflags = 0;
10256 if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW;
10257 NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1,
10258 UIO_USERSPACE, uap->path, ctx);
10259 if ((error = namei(&nd))) goto done;
10260 vp = nd.ni_vp;
10261 nameidone(&nd);
10262
10263 #if CONFIG_MACF
10264 error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd);
10265 if (error) {
10266 goto done;
10267 }
10268 #endif
10269
10270 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10271
10272 done:
10273 if (vp)
10274 vnode_put(vp);
10275 return error;
10276 }
10277 /* ARGSUSED */
10278 int
10279 ffsctl (proc_t p, struct ffsctl_args *uap, __unused int32_t *retval)
10280 {
10281 int error;
10282 vnode_t vp = NULL;
10283 vfs_context_t ctx = vfs_context_current();
10284 int fd = -1;
10285
10286 AUDIT_ARG(fd, uap->fd);
10287 AUDIT_ARG(cmd, uap->cmd);
10288 AUDIT_ARG(value32, uap->options);
10289
10290 /* Get the vnode for the file we are getting info on: */
10291 if ((error = file_vnode(uap->fd, &vp)))
10292 return error;
10293 fd = uap->fd;
10294 if ((error = vnode_getwithref(vp))) {
10295 file_drop(fd);
10296 return error;
10297 }
10298
10299 #if CONFIG_MACF
10300 if ((error = mac_mount_check_fsctl(ctx, vnode_mount(vp), uap->cmd))) {
10301 file_drop(fd);
10302 vnode_put(vp);
10303 return error;
10304 }
10305 #endif
10306
10307 error = fsctl_internal(p, &vp, uap->cmd, (user_addr_t)uap->data, uap->options, ctx);
10308
10309 file_drop(fd);
10310
10311 /* validate vp; fsctl_internal() can drop iocount and reset vp to NULL */
10312 if (vp) {
10313 vnode_put(vp);
10314 }
10315
10316 return error;
10317 }
10318 /* end of fsctl system call */
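/*
 * Illustrative userspace sketch (not compiled here): how a caller reaches the
 * switch above through the fsctl()/ffsctl() wrappers declared in
 * <sys/fsctl.h>.  The helper name and the request variable are placeholders;
 * a real caller would pass a command defined by <sys/fsctl.h> or by the
 * target filesystem's headers.
 */
#if 0
#include <sys/attr.h>		/* FSOPT_NOFOLLOW */
#include <sys/fsctl.h>		/* fsctl(), ffsctl() */
#include <stdio.h>

static int
issue_fsctl(const char *path, unsigned long request, void *data)
{
	/* FSOPT_NOFOLLOW keeps namei() from following a trailing symlink,
	 * mirroring the nameiflags handling in fsctl() above. */
	if (fsctl(path, request, data, FSOPT_NOFOLLOW) == -1) {
		perror("fsctl");
		return (-1);
	}
	return (0);
}
#endif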
10319
10320 /*
10321 * Retrieve the data of an extended attribute.
10322 */
10323 int
10324 getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
10325 {
10326 vnode_t vp;
10327 struct nameidata nd;
10328 char attrname[XATTR_MAXNAMELEN+1];
10329 vfs_context_t ctx = vfs_context_current();
10330 uio_t auio = NULL;
10331 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10332 size_t attrsize = 0;
10333 size_t namelen;
10334 u_int32_t nameiflags;
10335 int error;
10336 char uio_buf[ UIO_SIZEOF(1) ];
10337
10338 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10339 return (EINVAL);
10340
10341 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10342 NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx);
10343 if ((error = namei(&nd))) {
10344 return (error);
10345 }
10346 vp = nd.ni_vp;
10347 nameidone(&nd);
10348
10349 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10350 goto out;
10351 }
10352 if (xattr_protected(attrname)) {
10353 if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) {
10354 error = EPERM;
10355 goto out;
10356 }
10357 }
10358 /*
10359 * the specific check for 0xffffffff is a hack to preserve
10360 * binary compatibility in K64 with applications that discovered
10361 * that passing in a buf pointer and a size of -1 resulted in
10362 * just the size of the indicated extended attribute being returned.
10363 * this isn't part of the documented behavior, but because of the
10364 * original implementation's check for "uap->size > 0", this behavior
10365 * was allowed. In K32 that check turned into a signed comparison
10366 * even though uap->size is unsigned... in K64, we blow by that
10367 * check because uap->size is unsigned and doesn't get sign smeared
10368 * in the munger for a 32 bit user app. we also need to add a
10369 * check to limit the maximum size of the buffer being passed in...
10370 * unfortunately, the underlying filesystems seem to just malloc
10371 * the requested size even if the actual extended attribute is tiny.
10372 * because that malloc is for kernel wired memory, we have to put a
10373 * sane limit on it.
10374 *
10375 * U32 running on K64 will yield 0x00000000ffffffff for uap->size
10376 * U64 running on K64 will yield -1 (64 bits wide)
10377 * U32/U64 running on K32 will yield -1 (32 bits wide)
10378 */
10379 if (uap->size == 0xffffffff || uap->size == (size_t)-1)
10380 goto no_uio;
10381
10382 if (uap->value) {
10383 if (uap->size > (size_t)XATTR_MAXSIZE)
10384 uap->size = XATTR_MAXSIZE;
10385
10386 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10387 &uio_buf[0], sizeof(uio_buf));
10388 uio_addiov(auio, uap->value, uap->size);
10389 }
10390 no_uio:
10391 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, ctx);
10392 out:
10393 vnode_put(vp);
10394
10395 if (auio) {
10396 *retval = uap->size - uio_resid(auio);
10397 } else {
10398 *retval = (user_ssize_t)attrsize;
10399 }
10400
10401 return (error);
10402 }
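/*
 * Illustrative userspace sketch (not compiled here) of the two-call pattern
 * the code above supports: ask for the attribute size with a NULL buffer
 * (the no_uio path), then read the value.  Assumes the getxattr() wrapper
 * from <sys/xattr.h>; the helper name is made up.
 */
#if 0
#include <sys/types.h>
#include <sys/xattr.h>
#include <stdlib.h>

static void *
read_xattr(const char *path, const char *name, ssize_t *lenp)
{
	/* With value == NULL the kernel skips the uio and just reports
	 * attrsize.  The size can change between the two calls, so a
	 * robust caller would retry on a short result. */
	ssize_t len = getxattr(path, name, NULL, 0, 0, XATTR_NOFOLLOW);
	if (len < 0)
		return (NULL);

	void *buf = malloc((size_t)len);
	if (buf == NULL)
		return (NULL);

	len = getxattr(path, name, buf, (size_t)len, 0, XATTR_NOFOLLOW);
	if (len < 0) {
		free(buf);
		return (NULL);
	}
	*lenp = len;
	return (buf);
}
#endif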
10403
10404 /*
10405 * Retrieve the data of an extended attribute.
10406 */
10407 int
10408 fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
10409 {
10410 vnode_t vp;
10411 char attrname[XATTR_MAXNAMELEN+1];
10412 uio_t auio = NULL;
10413 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10414 size_t attrsize = 0;
10415 size_t namelen;
10416 int error;
10417 char uio_buf[ UIO_SIZEOF(1) ];
10418
10419 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10420 return (EINVAL);
10421
10422 if ( (error = file_vnode(uap->fd, &vp)) ) {
10423 return (error);
10424 }
10425 if ( (error = vnode_getwithref(vp)) ) {
10426 file_drop(uap->fd);
10427 return(error);
10428 }
10429 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10430 goto out;
10431 }
10432 if (xattr_protected(attrname)) {
10433 error = EPERM;
10434 goto out;
10435 }
10436 if (uap->value && uap->size > 0) {
10437 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ,
10438 &uio_buf[0], sizeof(uio_buf));
10439 uio_addiov(auio, uap->value, uap->size);
10440 }
10441
10442 error = vn_getxattr(vp, attrname, auio, &attrsize, uap->options, vfs_context_current());
10443 out:
10444 (void)vnode_put(vp);
10445 file_drop(uap->fd);
10446
10447 if (auio) {
10448 *retval = uap->size - uio_resid(auio);
10449 } else {
10450 *retval = (user_ssize_t)attrsize;
10451 }
10452 return (error);
10453 }
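/*
 * Illustrative sketch (userspace, not compiled here) for the fd-based
 * variant.  Note that the option check above rejects XATTR_NOFOLLOW for the
 * f* calls, so options is simply 0 here.  Assumes <sys/xattr.h>; the helper
 * name is made up.
 */
#if 0
#include <sys/types.h>
#include <sys/xattr.h>

static ssize_t
fd_xattr_size(int fd, const char *name)
{
	/* NULL/0 again returns just the attribute's size. */
	return (fgetxattr(fd, name, NULL, 0, 0, 0));
}
#endif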
10454
10455 /*
10456 * Set the data of an extended attribute.
10457 */
10458 int
10459 setxattr(proc_t p, struct setxattr_args *uap, int *retval)
10460 {
10461 vnode_t vp;
10462 struct nameidata nd;
10463 char attrname[XATTR_MAXNAMELEN+1];
10464 vfs_context_t ctx = vfs_context_current();
10465 uio_t auio = NULL;
10466 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10467 size_t namelen;
10468 u_int32_t nameiflags;
10469 int error;
10470 char uio_buf[ UIO_SIZEOF(1) ];
10471
10472 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10473 return (EINVAL);
10474
10475 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10476 if (error == EPERM) {
10477 /* if the string won't fit in attrname, copyinstr emits EPERM */
10478 return (ENAMETOOLONG);
10479 }
10480 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10481 return error;
10482 }
10483 if (xattr_protected(attrname))
10484 return(EPERM);
10485 if (uap->size != 0 && uap->value == 0) {
10486 return (EINVAL);
10487 }
10488
10489 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10490 NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx);
10491 if ((error = namei(&nd))) {
10492 return (error);
10493 }
10494 vp = nd.ni_vp;
10495 nameidone(&nd);
10496
10497 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10498 &uio_buf[0], sizeof(uio_buf));
10499 uio_addiov(auio, uap->value, uap->size);
10500
10501 error = vn_setxattr(vp, attrname, auio, uap->options, ctx);
10502 #if CONFIG_FSE
10503 if (error == 0) {
10504 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10505 FSE_ARG_VNODE, vp,
10506 FSE_ARG_DONE);
10507 }
10508 #endif
10509 vnode_put(vp);
10510 *retval = 0;
10511 return (error);
10512 }
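/*
 * Illustrative userspace sketch (not compiled here), assuming the setxattr()
 * wrapper from <sys/xattr.h>.  The attribute name is arbitrary;
 * XATTR_CREATE makes the call fail with EEXIST if the attribute already
 * exists, and protected names are refused with EPERM as above.
 */
#if 0
#include <sys/xattr.h>
#include <string.h>

static int
tag_file(const char *path)
{
	static const char value[] = "example";

	return (setxattr(path, "com.example.tag", value, strlen(value),
	    0, XATTR_CREATE));
}
#endif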
10513
10514 /*
10515 * Set the data of an extended attribute.
10516 */
10517 int
10518 fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
10519 {
10520 vnode_t vp;
10521 char attrname[XATTR_MAXNAMELEN+1];
10522 uio_t auio = NULL;
10523 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10524 size_t namelen;
10525 int error;
10526 char uio_buf[ UIO_SIZEOF(1) ];
10527 #if CONFIG_FSE
10528 vfs_context_t ctx = vfs_context_current();
10529 #endif
10530
10531 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10532 return (EINVAL);
10533
10534 if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
10535 if (error == EPERM) {
10536 /* if the string won't fit in attrname, copyinstr emits EPERM */
10537 return (ENAMETOOLONG);
10538 }
10539 /* Otherwise return the default error from copyinstr to detect ERANGE, etc */
10540 return error;
10541 }
10542 if (xattr_protected(attrname))
10543 return(EPERM);
10544 if (uap->size != 0 && uap->value == 0) {
10545 return (EINVAL);
10546 }
10547 if ( (error = file_vnode(uap->fd, &vp)) ) {
10548 return (error);
10549 }
10550 if ( (error = vnode_getwithref(vp)) ) {
10551 file_drop(uap->fd);
10552 return(error);
10553 }
10554 auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_WRITE,
10555 &uio_buf[0], sizeof(uio_buf));
10556 uio_addiov(auio, uap->value, uap->size);
10557
10558 error = vn_setxattr(vp, attrname, auio, uap->options, vfs_context_current());
10559 #if CONFIG_FSE
10560 if (error == 0) {
10561 add_fsevent(FSE_XATTR_MODIFIED, ctx,
10562 FSE_ARG_VNODE, vp,
10563 FSE_ARG_DONE);
10564 }
10565 #endif
10566 vnode_put(vp);
10567 file_drop(uap->fd);
10568 *retval = 0;
10569 return (error);
10570 }
10571
10572 /*
10573 * Remove an extended attribute.
10574 * XXX Code duplication here.
10575 */
10576 int
10577 removexattr(proc_t p, struct removexattr_args *uap, int *retval)
10578 {
10579 vnode_t vp;
10580 struct nameidata nd;
10581 char attrname[XATTR_MAXNAMELEN+1];
10582 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10583 vfs_context_t ctx = vfs_context_current();
10584 size_t namelen;
10585 u_int32_t nameiflags;
10586 int error;
10587
10588 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10589 return (EINVAL);
10590
10591 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10592 if (error != 0) {
10593 return (error);
10594 }
10595 if (xattr_protected(attrname))
10596 return(EPERM);
10597 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10598 NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx);
10599 if ((error = namei(&nd))) {
10600 return (error);
10601 }
10602 vp = nd.ni_vp;
10603 nameidone(&nd);
10604
10605 error = vn_removexattr(vp, attrname, uap->options, ctx);
10606 #if CONFIG_FSE
10607 if (error == 0) {
10608 add_fsevent(FSE_XATTR_REMOVED, ctx,
10609 FSE_ARG_VNODE, vp,
10610 FSE_ARG_DONE);
10611 }
10612 #endif
10613 vnode_put(vp);
10614 *retval = 0;
10615 return (error);
10616 }
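/*
 * Illustrative userspace sketch (not compiled here), assuming the
 * removexattr() wrapper from <sys/xattr.h>.  The attribute name is the
 * arbitrary one used in the earlier sketch; protected names
 * (xattr_protected() above) are rejected with EPERM.
 */
#if 0
#include <sys/xattr.h>

static int
untag_file(const char *path)
{
	return (removexattr(path, "com.example.tag", XATTR_NOFOLLOW));
}
#endif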
10617
10618 /*
10619 * Remove an extended attribute.
10620 * XXX Code duplication here.
10621 */
10622 int
10623 fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval)
10624 {
10625 vnode_t vp;
10626 char attrname[XATTR_MAXNAMELEN+1];
10627 size_t namelen;
10628 int error;
10629 #if CONFIG_FSE
10630 vfs_context_t ctx = vfs_context_current();
10631 #endif
10632
10633 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10634 return (EINVAL);
10635
10636 error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
10637 if (error != 0) {
10638 return (error);
10639 }
10640 if (xattr_protected(attrname))
10641 return(EPERM);
10642 if ( (error = file_vnode(uap->fd, &vp)) ) {
10643 return (error);
10644 }
10645 if ( (error = vnode_getwithref(vp)) ) {
10646 file_drop(uap->fd);
10647 return(error);
10648 }
10649
10650 error = vn_removexattr(vp, attrname, uap->options, vfs_context_current());
10651 #if CONFIG_FSE
10652 if (error == 0) {
10653 add_fsevent(FSE_XATTR_REMOVED, ctx,
10654 FSE_ARG_VNODE, vp,
10655 FSE_ARG_DONE);
10656 }
10657 #endif
10658 vnode_put(vp);
10659 file_drop(uap->fd);
10660 *retval = 0;
10661 return (error);
10662 }
10663
10664 /*
10665 * Retrieve the list of extended attribute names.
10666 * XXX Code duplication here.
10667 */
10668 int
10669 listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval)
10670 {
10671 vnode_t vp;
10672 struct nameidata nd;
10673 vfs_context_t ctx = vfs_context_current();
10674 uio_t auio = NULL;
10675 int spacetype = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10676 size_t attrsize = 0;
10677 u_int32_t nameiflags;
10678 int error;
10679 char uio_buf[ UIO_SIZEOF(1) ];
10680
10681 if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
10682 return (EINVAL);
10683
10684 nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW;
10685 NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx);
10686 if ((error = namei(&nd))) {
10687 return (error);
10688 }
10689 vp = nd.ni_vp;
10690 nameidone(&nd);
10691 if (uap->namebuf != 0 && uap->bufsize > 0) {
10692 auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ,
10693 &uio_buf[0], sizeof(uio_buf));
10694 uio_addiov(auio, uap->namebuf, uap->bufsize);
10695 }
10696
10697 error = vn_listxattr(vp, auio, &attrsize, uap->options, ctx);
10698
10699 vnode_put(vp);
10700 if (auio) {
10701 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10702 } else {
10703 *retval = (user_ssize_t)attrsize;
10704 }
10705 return (error);
10706 }
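/*
 * Illustrative userspace sketch (not compiled here): walking the
 * NUL-separated name list that vn_listxattr() fills in.  As with getxattr(),
 * a NULL buffer returns the size needed.  Assumes <sys/xattr.h>; the helper
 * name is made up.
 */
#if 0
#include <sys/xattr.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
print_xattr_names(const char *path)
{
	ssize_t len = listxattr(path, NULL, 0, XATTR_NOFOLLOW);
	if (len <= 0)
		return;

	char *names = malloc((size_t)len);
	if (names == NULL)
		return;

	len = listxattr(path, names, (size_t)len, XATTR_NOFOLLOW);
	for (char *p = names; len > 0 && p < names + len; p += strlen(p) + 1)
		printf("%s\n", p);
	free(names);
}
#endif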
10707
10708 /*
10709 * Retrieve the list of extended attribute names.
10710 * XXX Code duplication here.
10711 */
10712 int
10713 flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval)
10714 {
10715 vnode_t vp;
10716 uio_t auio = NULL;
10717 int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
10718 size_t attrsize = 0;
10719 int error;
10720 char uio_buf[ UIO_SIZEOF(1) ];
10721
10722 if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
10723 return (EINVAL);
10724
10725 if ( (error = file_vnode(uap->fd, &vp)) ) {
10726 return (error);
10727 }
10728 if ( (error = vnode_getwithref(vp)) ) {
10729 file_drop(uap->fd);
10730 return(error);
10731 }
10732 if (uap->namebuf != 0 && uap->bufsize > 0) {
10733 auio = uio_createwithbuffer(1, 0, spacetype,
10734 UIO_READ, &uio_buf[0], sizeof(uio_buf));
10735 uio_addiov(auio, uap->namebuf, uap->bufsize);
10736 }
10737
10738 error = vn_listxattr(vp, auio, &attrsize, uap->options, vfs_context_current());
10739
10740 vnode_put(vp);
10741 file_drop(uap->fd);
10742 if (auio) {
10743 *retval = (user_ssize_t)uap->bufsize - uio_resid(auio);
10744 } else {
10745 *retval = (user_ssize_t)attrsize;
10746 }
10747 return (error);
10748 }
10749
10750 static int fsgetpath_internal(
10751 vfs_context_t ctx, int volfs_id, uint64_t objid,
10752 vm_size_t bufsize, caddr_t buf, int *pathlen)
10753 {
10754 int error;
10755 struct mount *mp = NULL;
10756 vnode_t vp;
10757 int length;
10758 int bpflags;
10759 /* maximum number of times to retry build_path */
10760 unsigned int retries = 0x10;
10761
10762 if (bufsize > PAGE_SIZE) {
10763 return (EINVAL);
10764 }
10765
10766 if (buf == NULL) {
10767 return (ENOMEM);
10768 }
10769
10770 retry:
10771 if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) {
10772 error = ENOTSUP; /* unexpected failure */
10773 return ENOTSUP;
10774 }
10775
10776 unionget:
10777 if (objid == 2) {
10778 error = VFS_ROOT(mp, &vp, ctx);
10779 } else {
10780 error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx);
10781 }
10782
10783 if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) {
10784 /*
10785 * If the fileid isn't found and we're in a union
10786 * mount volume, then see if the fileid is in the
10787 * mounted-on volume.
10788 */
10789 struct mount *tmp = mp;
10790 mp = vnode_mount(tmp->mnt_vnodecovered);
10791 vfs_unbusy(tmp);
10792 if (vfs_busy(mp, LK_NOWAIT) == 0)
10793 goto unionget;
10794 } else {
10795 vfs_unbusy(mp);
10796 }
10797
10798 if (error) {
10799 return error;
10800 }
10801
10802 #if CONFIG_MACF
10803 error = mac_vnode_check_fsgetpath(ctx, vp);
10804 if (error) {
10805 vnode_put(vp);
10806 return error;
10807 }
10808 #endif
10809
10810 /* Obtain the absolute path to this vnode. */
10811 bpflags = vfs_context_suser(ctx) ? BUILDPATH_CHECKACCESS : 0;
10812 bpflags |= BUILDPATH_CHECK_MOVED;
10813 error = build_path(vp, buf, bufsize, &length, bpflags, ctx);
10814 vnode_put(vp);
10815
10816 if (error) {
10817 /* there was a race building the path, try a few more times */
10818 if (error == EAGAIN) {
10819 --retries;
10820 if (retries > 0)
10821 goto retry;
10822
10823 error = ENOENT;
10824 }
10825 goto out;
10826 }
10827
10828 AUDIT_ARG(text, buf);
10829
10830 if (kdebug_enable) {
10831 long dbg_parms[NUMPARMS];
10832 int dbg_namelen;
10833
10834 dbg_namelen = (int)sizeof(dbg_parms);
10835
10836 if (length < dbg_namelen) {
10837 memcpy((char *)dbg_parms, buf, length);
10838 memset((char *)dbg_parms + length, 0, dbg_namelen - length);
10839
10840 dbg_namelen = length;
10841 } else {
10842 memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
10843 }
10844
10845 kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
10846 }
10847
10848 *pathlen = (user_ssize_t)length; /* may be superseded by error */
10849
10850 out:
10851 return (error);
10852 }
10853
10854 /*
10855 * Obtain the full pathname of a file system object by id.
10856 *
10857 * This is a private SPI used by the File Manager.
10858 */
10859 __private_extern__
10860 int
10861 fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
10862 {
10863 vfs_context_t ctx = vfs_context_current();
10864 fsid_t fsid;
10865 char *realpath;
10866 int length;
10867 int error;
10868
10869 if ((error = copyin(uap->fsid, (caddr_t)&fsid, sizeof(fsid)))) {
10870 return (error);
10871 }
10872 AUDIT_ARG(value32, fsid.val[0]);
10873 AUDIT_ARG(value64, uap->objid);
10874 /* Restrict output buffer size for now. */
10875
10876 if (uap->bufsize > PAGE_SIZE) {
10877 return (EINVAL);
10878 }
10879 MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
10880 if (realpath == NULL) {
10881 return (ENOMEM);
10882 }
10883
10884 error = fsgetpath_internal(
10885 ctx, fsid.val[0], uap->objid,
10886 uap->bufsize, realpath, &length);
10887
10888 if (error) {
10889 goto out;
10890 }
10891
10892 error = copyout((caddr_t)realpath, uap->buf, length);
10893
10894 *retval = (user_ssize_t)length; /* may be superseded by error */
10895 out:
10896 if (realpath) {
10897 FREE(realpath, M_TEMP);
10898 }
10899 return (error);
10900 }
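/*
 * Illustrative userspace sketch (not compiled here): resolving a path from a
 * volume id + object id, the way File Manager uses this SPI.  It assumes the
 * fsgetpath() wrapper that later SDKs export (the <sys/fsgetpath.h> header
 * location and availability are assumptions).  The fsid comes from
 * statfs()'s f_fsid and the object id from stat()'s st_ino.
 */
#if 0
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/fsgetpath.h>	/* assumed location of the wrapper */
#include <stdint.h>

static ssize_t
path_from_ids(fsid_t fsid, uint64_t obj_id, char *buf, size_t buflen)
{
	/* bufsize is capped at PAGE_SIZE by the checks above. */
	return (fsgetpath(buf, buflen, &fsid, obj_id));
}
#endif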
10901
10902 /*
10903 * Common routine to handle various flavors of statfs data heading out
10904 * to user space.
10905 *
10906 * Returns: 0 Success
10907 * EFAULT
10908 */
10909 static int
10910 munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
10911 user_addr_t bufp, int *sizep, boolean_t is_64_bit,
10912 boolean_t partial_copy)
10913 {
10914 int error;
10915 int my_size, copy_size;
10916
10917 if (is_64_bit) {
10918 struct user64_statfs sfs;
10919 my_size = copy_size = sizeof(sfs);
10920 bzero(&sfs, my_size);
10921 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
10922 sfs.f_type = mp->mnt_vtable->vfc_typenum;
10923 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
10924 sfs.f_bsize = (user64_long_t)sfsp->f_bsize;
10925 sfs.f_iosize = (user64_long_t)sfsp->f_iosize;
10926 sfs.f_blocks = (user64_long_t)sfsp->f_blocks;
10927 sfs.f_bfree = (user64_long_t)sfsp->f_bfree;
10928 sfs.f_bavail = (user64_long_t)sfsp->f_bavail;
10929 sfs.f_files = (user64_long_t)sfsp->f_files;
10930 sfs.f_ffree = (user64_long_t)sfsp->f_ffree;
10931 sfs.f_fsid = sfsp->f_fsid;
10932 sfs.f_owner = sfsp->f_owner;
10933 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
10934 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
10935 } else {
10936 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
10937 }
10938 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
10939 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
10940
10941 if (partial_copy) {
10942 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
10943 }
10944 error = copyout((caddr_t)&sfs, bufp, copy_size);
10945 }
10946 else {
10947 struct user32_statfs sfs;
10948
10949 my_size = copy_size = sizeof(sfs);
10950 bzero(&sfs, my_size);
10951
10952 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
10953 sfs.f_type = mp->mnt_vtable->vfc_typenum;
10954 sfs.f_reserved1 = (short)sfsp->f_fssubtype;
10955
10956 /*
10957 * It's possible for there to be more than 2^31 blocks in the filesystem, so we
10958 * have to fudge the numbers here in that case. We inflate the blocksize in order
10959 * to reflect the filesystem size as best we can.
10960 */
10961 if ((sfsp->f_blocks > INT_MAX)
10962 /* Hack for 4061702. I think the real fix is for Carbon to
10963 * look for some volume capability and not depend on hidden
10964 * semantics agreed between a FS and Carbon.
10965 * f_blocks, f_bfree, and f_bavail set to -1 is the trigger
10966 * for Carbon to set the bNoVolumeSizes volume attribute.
10967 * Without this, webdavfs files cannot be copied onto
10968 * disk because they look huge. This change should not affect
10969 * XSAN, which should not be setting these to -1.
10970 */
10971 && (sfsp->f_blocks != 0xffffffffffffffffULL)
10972 && (sfsp->f_bfree != 0xffffffffffffffffULL)
10973 && (sfsp->f_bavail != 0xffffffffffffffffULL)) {
10974 int shift;
10975
10976 /*
10977 * Work out how far we have to shift the block count down to make it fit.
10978 * Note that it's possible to have to shift so far that the resulting
10979 * blocksize would be unreportably large. At that point, we will clip
10980 * any values that don't fit.
10981 *
10982 * For safety's sake, we also ensure that f_iosize is never reported as
10983 * being smaller than f_bsize.
10984 */
10985 for (shift = 0; shift < 32; shift++) {
10986 if ((sfsp->f_blocks >> shift) <= INT_MAX)
10987 break;
10988 if ((sfsp->f_bsize << (shift + 1)) > INT_MAX)
10989 break;
10990 }
10991 #define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
10992 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_blocks, shift);
10993 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bfree, shift);
10994 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sfsp->f_bavail, shift);
10995 #undef __SHIFT_OR_CLIP
10996 sfs.f_bsize = (user32_long_t)(sfsp->f_bsize << shift);
10997 sfs.f_iosize = lmax(sfsp->f_iosize, sfsp->f_bsize);
10998 } else {
10999 /* filesystem is small enough to be reported honestly */
11000 sfs.f_bsize = (user32_long_t)sfsp->f_bsize;
11001 sfs.f_iosize = (user32_long_t)sfsp->f_iosize;
11002 sfs.f_blocks = (user32_long_t)sfsp->f_blocks;
11003 sfs.f_bfree = (user32_long_t)sfsp->f_bfree;
11004 sfs.f_bavail = (user32_long_t)sfsp->f_bavail;
11005 }
11006 sfs.f_files = (user32_long_t)sfsp->f_files;
11007 sfs.f_ffree = (user32_long_t)sfsp->f_ffree;
11008 sfs.f_fsid = sfsp->f_fsid;
11009 sfs.f_owner = sfsp->f_owner;
11010 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
11011 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSNAMELEN);
11012 } else {
11013 strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN);
11014 }
11015 strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN);
11016 strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN);
11017
11018 if (partial_copy) {
11019 copy_size -= (sizeof(sfs.f_reserved3) + sizeof(sfs.f_reserved4));
11020 }
11021 error = copyout((caddr_t)&sfs, bufp, copy_size);
11022 }
11023
11024 if (sizep != NULL) {
11025 *sizep = my_size;
11026 }
11027 return(error);
11028 }
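/*
 * Stand-alone sketch of the shift-or-clip scaling used above (illustrative
 * only, not part of the build): inflate the reported block size by 2^shift
 * and shrink the counts by the same shift, clipping anything that still does
 * not fit in the 32-bit statfs fields.
 */
#if 0
#include <limits.h>
#include <stdint.h>

static void
scale_for_user32(uint64_t blocks, uint64_t bsize,
    uint32_t *blocks32, uint32_t *bsize32)
{
	int shift;

	for (shift = 0; shift < 32; shift++) {
		if ((blocks >> shift) <= INT_MAX)
			break;
		if ((bsize << (shift + 1)) > INT_MAX)
			break;
	}
	/* e.g. blocks = 3 * 2^31 with bsize = 4096 gives shift = 2:
	 * reported blocks = 3 * 2^29, reported bsize = 16384. */
	*blocks32 = (uint32_t)(((blocks >> shift) > INT_MAX) ?
	    INT_MAX : (blocks >> shift));
	*bsize32 = (uint32_t)(bsize << shift);
}
#endif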
11029
11030 /*
11031 * copy stat structure into user_stat structure.
11032 */
11033 void munge_user64_stat(struct stat *sbp, struct user64_stat *usbp)
11034 {
11035 bzero(usbp, sizeof(*usbp));
11036
11037 usbp->st_dev = sbp->st_dev;
11038 usbp->st_ino = sbp->st_ino;
11039 usbp->st_mode = sbp->st_mode;
11040 usbp->st_nlink = sbp->st_nlink;
11041 usbp->st_uid = sbp->st_uid;
11042 usbp->st_gid = sbp->st_gid;
11043 usbp->st_rdev = sbp->st_rdev;
11044 #ifndef _POSIX_C_SOURCE
11045 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11046 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11047 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11048 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11049 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11050 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11051 #else
11052 usbp->st_atime = sbp->st_atime;
11053 usbp->st_atimensec = sbp->st_atimensec;
11054 usbp->st_mtime = sbp->st_mtime;
11055 usbp->st_mtimensec = sbp->st_mtimensec;
11056 usbp->st_ctime = sbp->st_ctime;
11057 usbp->st_ctimensec = sbp->st_ctimensec;
11058 #endif
11059 usbp->st_size = sbp->st_size;
11060 usbp->st_blocks = sbp->st_blocks;
11061 usbp->st_blksize = sbp->st_blksize;
11062 usbp->st_flags = sbp->st_flags;
11063 usbp->st_gen = sbp->st_gen;
11064 usbp->st_lspare = sbp->st_lspare;
11065 usbp->st_qspare[0] = sbp->st_qspare[0];
11066 usbp->st_qspare[1] = sbp->st_qspare[1];
11067 }
11068
11069 void munge_user32_stat(struct stat *sbp, struct user32_stat *usbp)
11070 {
11071 bzero(usbp, sizeof(*usbp));
11072
11073 usbp->st_dev = sbp->st_dev;
11074 usbp->st_ino = sbp->st_ino;
11075 usbp->st_mode = sbp->st_mode;
11076 usbp->st_nlink = sbp->st_nlink;
11077 usbp->st_uid = sbp->st_uid;
11078 usbp->st_gid = sbp->st_gid;
11079 usbp->st_rdev = sbp->st_rdev;
11080 #ifndef _POSIX_C_SOURCE
11081 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11082 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11083 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11084 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11085 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11086 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11087 #else
11088 usbp->st_atime = sbp->st_atime;
11089 usbp->st_atimensec = sbp->st_atimensec;
11090 usbp->st_mtime = sbp->st_mtime;
11091 usbp->st_mtimensec = sbp->st_mtimensec;
11092 usbp->st_ctime = sbp->st_ctime;
11093 usbp->st_ctimensec = sbp->st_ctimensec;
11094 #endif
11095 usbp->st_size = sbp->st_size;
11096 usbp->st_blocks = sbp->st_blocks;
11097 usbp->st_blksize = sbp->st_blksize;
11098 usbp->st_flags = sbp->st_flags;
11099 usbp->st_gen = sbp->st_gen;
11100 usbp->st_lspare = sbp->st_lspare;
11101 usbp->st_qspare[0] = sbp->st_qspare[0];
11102 usbp->st_qspare[1] = sbp->st_qspare[1];
11103 }
11104
11105 /*
11106 * copy stat64 structure into user_stat64 structure.
11107 */
11108 void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp)
11109 {
11110 bzero(usbp, sizeof(*usbp));
11111
11112 usbp->st_dev = sbp->st_dev;
11113 usbp->st_ino = sbp->st_ino;
11114 usbp->st_mode = sbp->st_mode;
11115 usbp->st_nlink = sbp->st_nlink;
11116 usbp->st_uid = sbp->st_uid;
11117 usbp->st_gid = sbp->st_gid;
11118 usbp->st_rdev = sbp->st_rdev;
11119 #ifndef _POSIX_C_SOURCE
11120 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11121 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11122 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11123 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11124 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11125 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11126 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11127 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11128 #else
11129 usbp->st_atime = sbp->st_atime;
11130 usbp->st_atimensec = sbp->st_atimensec;
11131 usbp->st_mtime = sbp->st_mtime;
11132 usbp->st_mtimensec = sbp->st_mtimensec;
11133 usbp->st_ctime = sbp->st_ctime;
11134 usbp->st_ctimensec = sbp->st_ctimensec;
11135 usbp->st_birthtime = sbp->st_birthtime;
11136 usbp->st_birthtimensec = sbp->st_birthtimensec;
11137 #endif
11138 usbp->st_size = sbp->st_size;
11139 usbp->st_blocks = sbp->st_blocks;
11140 usbp->st_blksize = sbp->st_blksize;
11141 usbp->st_flags = sbp->st_flags;
11142 usbp->st_gen = sbp->st_gen;
11143 usbp->st_lspare = sbp->st_lspare;
11144 usbp->st_qspare[0] = sbp->st_qspare[0];
11145 usbp->st_qspare[1] = sbp->st_qspare[1];
11146 }
11147
11148 void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp)
11149 {
11150 bzero(usbp, sizeof(*usbp));
11151
11152 usbp->st_dev = sbp->st_dev;
11153 usbp->st_ino = sbp->st_ino;
11154 usbp->st_mode = sbp->st_mode;
11155 usbp->st_nlink = sbp->st_nlink;
11156 usbp->st_uid = sbp->st_uid;
11157 usbp->st_gid = sbp->st_gid;
11158 usbp->st_rdev = sbp->st_rdev;
11159 #ifndef _POSIX_C_SOURCE
11160 usbp->st_atimespec.tv_sec = sbp->st_atimespec.tv_sec;
11161 usbp->st_atimespec.tv_nsec = sbp->st_atimespec.tv_nsec;
11162 usbp->st_mtimespec.tv_sec = sbp->st_mtimespec.tv_sec;
11163 usbp->st_mtimespec.tv_nsec = sbp->st_mtimespec.tv_nsec;
11164 usbp->st_ctimespec.tv_sec = sbp->st_ctimespec.tv_sec;
11165 usbp->st_ctimespec.tv_nsec = sbp->st_ctimespec.tv_nsec;
11166 usbp->st_birthtimespec.tv_sec = sbp->st_birthtimespec.tv_sec;
11167 usbp->st_birthtimespec.tv_nsec = sbp->st_birthtimespec.tv_nsec;
11168 #else
11169 usbp->st_atime = sbp->st_atime;
11170 usbp->st_atimensec = sbp->st_atimensec;
11171 usbp->st_mtime = sbp->st_mtime;
11172 usbp->st_mtimensec = sbp->st_mtimensec;
11173 usbp->st_ctime = sbp->st_ctime;
11174 usbp->st_ctimensec = sbp->st_ctimensec;
11175 usbp->st_birthtime = sbp->st_birthtime;
11176 usbp->st_birthtimensec = sbp->st_birthtimensec;
11177 #endif
11178 usbp->st_size = sbp->st_size;
11179 usbp->st_blocks = sbp->st_blocks;
11180 usbp->st_blksize = sbp->st_blksize;
11181 usbp->st_flags = sbp->st_flags;
11182 usbp->st_gen = sbp->st_gen;
11183 usbp->st_lspare = sbp->st_lspare;
11184 usbp->st_qspare[0] = sbp->st_qspare[0];
11185 usbp->st_qspare[1] = sbp->st_qspare[1];
11186 }
11187
11188 /*
11189 * Purge buffer cache for simulating cold starts
11190 */
11191 static int vnode_purge_callback(struct vnode *vp, __unused void *cargs)
11192 {
11193 ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL /* off_t *resid_off */, UBC_PUSHALL | UBC_INVALIDATE);
11194
11195 return VNODE_RETURNED;
11196 }
11197
11198 static int vfs_purge_callback(mount_t mp, __unused void * arg)
11199 {
11200 vnode_iterate(mp, VNODE_WAIT | VNODE_ITERATE_ALL, vnode_purge_callback, NULL);
11201
11202 return VFS_RETURNED;
11203 }
11204
11205 int
11206 vfs_purge(__unused struct proc *p, __unused struct vfs_purge_args *uap, __unused int32_t *retval)
11207 {
11208 if (!kauth_cred_issuser(kauth_cred_get()))
11209 return EPERM;
11210
11211 vfs_iterate(0/* flags */, vfs_purge_callback, NULL);
11212
11213 return 0;
11214 }
11215
11216 /*
11217 * gets the vnode associated with the (unnamed) snapshot directory
11218 * for a Filesystem. The snapshot directory vnode is returned with
11219 * an iocount on it.
11220 */
11221 int
11222 vnode_get_snapdir(vnode_t rvp, vnode_t *sdvpp, vfs_context_t ctx)
11223 {
11224 return (VFS_VGET_SNAPDIR(vnode_mount(rvp), sdvpp, ctx));
11225 }
11226
11227 /*
11228 * Get the snapshot vnode.
11229 *
11230 * If successful, the call returns with an iocount on *rvpp and *sdvpp,
11231 * and needs nameidone() on ndp.
11232 *
11233 * If the snapshot vnode exists it is returned in ndp->ni_vp.
11234 *
11235 * If it returns with an error, *rvpp, *sdvpp are NULL and nameidone() is
11236 * not needed.
11237 */
11238 static int
11239 vnode_get_snapshot(int dirfd, vnode_t *rvpp, vnode_t *sdvpp,
11240 user_addr_t name, struct nameidata *ndp, int32_t op,
11241 #if !CONFIG_TRIGGERS
11242 __unused
11243 #endif
11244 enum path_operation pathop,
11245 vfs_context_t ctx)
11246 {
11247 int error, i;
11248 caddr_t name_buf;
11249 size_t name_len;
11250 struct vfs_attr vfa;
11251
11252 *sdvpp = NULLVP;
11253 *rvpp = NULLVP;
11254
11255 error = vnode_getfromfd(ctx, dirfd, rvpp);
11256 if (error)
11257 return (error);
11258
11259 if (!vnode_isvroot(*rvpp)) {
11260 error = EINVAL;
11261 goto out;
11262 }
11263
11264 /* Make sure the filesystem supports snapshots */
11265 VFSATTR_INIT(&vfa);
11266 VFSATTR_WANTED(&vfa, f_capabilities);
11267 if ((vfs_getattr(vnode_mount(*rvpp), &vfa, ctx) != 0) ||
11268 !VFSATTR_IS_SUPPORTED(&vfa, f_capabilities) ||
11269 !((vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] &
11270 VOL_CAP_INT_SNAPSHOT)) ||
11271 !((vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] &
11272 VOL_CAP_INT_SNAPSHOT))) {
11273 error = ENOTSUP;
11274 goto out;
11275 }
11276
11277 error = vnode_get_snapdir(*rvpp, sdvpp, ctx);
11278 if (error)
11279 goto out;
11280
11281 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11282 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11283 if (error)
11284 goto out1;
11285
11286 /*
11287 * Some sanity checks: the name can't be empty, "." or "..", or contain slashes.
11288 * (the length returned by copyinstr includes the terminating NUL)
11289 */
11290 if ((name_len == 1) || (name_len == 2 && name_buf[0] == '.') ||
11291 (name_len == 3 && name_buf[0] == '.' && name_buf[1] == '.')) {
11292 error = EINVAL;
11293 goto out1;
11294 }
11295 for (i = 0; i < (int)name_len && name_buf[i] != '/'; i++);
11296 if (i < (int)name_len) {
11297 error = EINVAL;
11298 goto out1;
11299 }
11300
11301 #if CONFIG_MACF
11302 if (op == CREATE) {
11303 error = mac_mount_check_snapshot_create(ctx, vnode_mount(*rvpp),
11304 name_buf);
11305 } else if (op == DELETE) {
11306 error = mac_mount_check_snapshot_delete(ctx, vnode_mount(*rvpp),
11307 name_buf);
11308 }
11309 if (error)
11310 goto out1;
11311 #endif
11312
11313 /* Check if the snapshot already exists ... */
11314 NDINIT(ndp, op, pathop, USEDVP | NOCACHE | AUDITVNPATH1,
11315 UIO_SYSSPACE, CAST_USER_ADDR_T(name_buf), ctx);
11316 ndp->ni_dvp = *sdvpp;
11317
11318 error = namei(ndp);
11319 out1:
11320 FREE(name_buf, M_TEMP);
11321 out:
11322 if (error) {
11323 if (*sdvpp) {
11324 vnode_put(*sdvpp);
11325 *sdvpp = NULLVP;
11326 }
11327 if (*rvpp) {
11328 vnode_put(*rvpp);
11329 *rvpp = NULLVP;
11330 }
11331 }
11332 return (error);
11333 }
11334
11335 /*
11336 * create a filesystem snapshot (for supporting filesystems)
11337 *
11338 * A much simplified version of openat(dirfd, name, O_CREAT | O_EXCL)
11339 * We get to the (unnamed) snapshot directory vnode and create the vnode
11340 * for the snapshot in it.
11341 *
11342 * Restrictions:
11343 *
11344 * a) Passed in name for snapshot cannot have slashes.
11345 * b) name can't be "." or ".."
11346 *
11347 * Since this requires superuser privileges, vnode_authorize calls are not
11348 * made.
11349 */
11350 static int
11351 snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags,
11352 vfs_context_t ctx)
11353 {
11354 vnode_t rvp, snapdvp;
11355 int error;
11356 struct nameidata namend;
11357
11358 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, CREATE,
11359 OP_LINK, ctx);
11360 if (error)
11361 return (error);
11362
11363 if (namend.ni_vp) {
11364 vnode_put(namend.ni_vp);
11365 error = EEXIST;
11366 } else {
11367 struct vnode_attr va;
11368 vnode_t vp = NULLVP;
11369
11370 VATTR_INIT(&va);
11371 VATTR_SET(&va, va_type, VREG);
11372 VATTR_SET(&va, va_mode, 0);
11373
11374 error = vn_create(snapdvp, &vp, &namend, &va,
11375 VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT, 0, NULL, ctx);
11376 if (!error && vp)
11377 vnode_put(vp);
11378 }
11379
11380 nameidone(&namend);
11381 vnode_put(snapdvp);
11382 vnode_put(rvp);
11383 return (error);
11384 }
11385
11386 /*
11387 * Delete a Filesystem snapshot
11388 *
11389 * get the vnode for the unnamed snapshot directory and the snapshot and
11390 * delete the snapshot.
11391 */
11392 static int
11393 snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags,
11394 vfs_context_t ctx)
11395 {
11396 vnode_t rvp, snapdvp;
11397 int error;
11398 struct nameidata namend;
11399
11400 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, DELETE,
11401 OP_UNLINK, ctx);
11402 if (error)
11403 goto out;
11404
11405 error = VNOP_REMOVE(snapdvp, namend.ni_vp, &namend.ni_cnd,
11406 VNODE_REMOVE_SKIP_NAMESPACE_EVENT, ctx);
11407
11408 vnode_put(namend.ni_vp);
11409 nameidone(&namend);
11410 vnode_put(snapdvp);
11411 vnode_put(rvp);
11412 out:
11413 return (error);
11414 }
11415
11416 /*
11417 * Revert a filesystem to a snapshot
11418 *
11419 * Marks the filesystem to revert to the given snapshot on next mount.
11420 */
11421 static int
11422 snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags,
11423 vfs_context_t ctx)
11424 {
11425 int error;
11426 vnode_t rvp;
11427 mount_t mp;
11428 struct fs_snapshot_revert_args revert_data;
11429 struct componentname cnp;
11430 caddr_t name_buf;
11431 size_t name_len;
11432
11433 error = vnode_getfromfd(ctx, dirfd, &rvp);
11434 if (error) {
11435 return (error);
11436 }
11437 mp = vnode_mount(rvp);
11438
11439 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11440 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11441 if (error) {
11442 FREE(name_buf, M_TEMP);
11443 vnode_put(rvp);
11444 return (error);
11445 }
11446
11447 #if CONFIG_MACF
11448 error = mac_mount_check_snapshot_revert(ctx, mp, name_buf);
11449 if (error) {
11450 FREE(name_buf, M_TEMP);
11451 vnode_put(rvp);
11452 return (error);
11453 }
11454 #endif
11455
11456 /*
11457 * Grab mount_iterref so that we can release the vnode,
11458 * since VFSIOC_REVERT_SNAPSHOT could conceivably cause a sync.
11459 */
11460 error = mount_iterref (mp, 0);
11461 vnode_put(rvp);
11462 if (error) {
11463 FREE(name_buf, M_TEMP);
11464 return (error);
11465 }
11466
11467 memset(&cnp, 0, sizeof(cnp));
11468 cnp.cn_pnbuf = (char *)name_buf;
11469 cnp.cn_nameiop = LOOKUP;
11470 cnp.cn_flags = ISLASTCN | HASBUF;
11471 cnp.cn_pnlen = MAXPATHLEN;
11472 cnp.cn_nameptr = cnp.cn_pnbuf;
11473 cnp.cn_namelen = (int)name_len;
11474 revert_data.sr_cnp = &cnp;
11475
11476 error = VFS_IOCTL(mp, VFSIOC_REVERT_SNAPSHOT, (caddr_t)&revert_data, 0, ctx);
11477 mount_iterdrop(mp);
11478 FREE(name_buf, M_TEMP);
11479
11480 if (error) {
11481 /* If there was any error, try again using VNOP_IOCTL */
11482
11483 vnode_t snapdvp;
11484 struct nameidata namend;
11485
11486 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, &namend, LOOKUP,
11487 OP_LOOKUP, ctx);
11488 if (error) {
11489 return (error);
11490 }
11491
11492
11493 #ifndef APFSIOC_REVERT_TO_SNAPSHOT
11494 #define APFSIOC_REVERT_TO_SNAPSHOT _IOW('J', 1, u_int64_t)
11495 #endif
11496
11497 #ifndef APFS_REVERT_TO_SNAPSHOT
11498 #define APFS_REVERT_TO_SNAPSHOT IOCBASECMD(APFSIOC_REVERT_TO_SNAPSHOT)
11499 #endif
11500
11501 error = VNOP_IOCTL(namend.ni_vp, APFS_REVERT_TO_SNAPSHOT, (caddr_t) NULL,
11502 0, ctx);
11503
11504 vnode_put(namend.ni_vp);
11505 nameidone(&namend);
11506 vnode_put(snapdvp);
11507 vnode_put(rvp);
11508 }
11509
11510 return (error);
11511 }
11512
11513 /*
11514 * rename a Filesystem snapshot
11515 *
11516 * get the vnode for the unnamed snapshot directory and the snapshot and
11517 * rename the snapshot. This is a very specialised (and simple) case of
11518 * rename(2) (which has to deal with a lot more complications). It differs
11519 * slightly from rename(2) in that EEXIST is returned if the new name exists.
11520 */
11521 static int
11522 snapshot_rename(int dirfd, user_addr_t old, user_addr_t new,
11523 __unused uint32_t flags, vfs_context_t ctx)
11524 {
11525 vnode_t rvp, snapdvp;
11526 int error, i;
11527 caddr_t newname_buf;
11528 size_t name_len;
11529 vnode_t fvp;
11530 struct nameidata *fromnd, *tond;
11531 /* carving out a chunk for structs that are too big to be on the stack. */
11532 struct {
11533 struct nameidata from_node;
11534 struct nameidata to_node;
11535 } * __rename_data;
11536
11537 MALLOC(__rename_data, void *, sizeof(*__rename_data), M_TEMP, M_WAITOK);
11538 fromnd = &__rename_data->from_node;
11539 tond = &__rename_data->to_node;
11540
11541 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, old, fromnd, DELETE,
11542 OP_UNLINK, ctx);
11543 if (error)
11544 goto out;
11545 fvp = fromnd->ni_vp;
11546
11547 MALLOC(newname_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11548 error = copyinstr(new, newname_buf, MAXPATHLEN, &name_len);
11549 if (error)
11550 goto out1;
11551
11552 /*
11553 * Some sanity checks: the new name can't be empty, "." or "..", or
11554 * contain slashes.
11555 * (the length returned by copyinstr includes the terminating NUL)
11556 *
11557 * The FS rename VNOP is supposed to handle this, but we catch it
11558 * here ourselves as well.
11559 */
11560 if ((name_len == 1) || (name_len == 2 && newname_buf[0] == '.') ||
11561 (name_len == 3 && newname_buf[0] == '.' && newname_buf[1] == '.')) {
11562 error = EINVAL;
11563 goto out1;
11564 }
11565 for (i = 0; i < (int)name_len && newname_buf[i] != '/'; i++);
11566 if (i < (int)name_len) {
11567 error = EINVAL;
11568 goto out1;
11569 }
11570
11571 #if CONFIG_MACF
11572 error = mac_mount_check_snapshot_create(ctx, vnode_mount(rvp),
11573 newname_buf);
11574 if (error)
11575 goto out1;
11576 #endif
11577
11578 NDINIT(tond, RENAME, OP_RENAME, USEDVP | NOCACHE | AUDITVNPATH2,
11579 UIO_SYSSPACE, CAST_USER_ADDR_T(newname_buf), ctx);
11580 tond->ni_dvp = snapdvp;
11581
11582 error = namei(tond);
11583 if (error) {
11584 goto out2;
11585 } else if (tond->ni_vp) {
11586 /*
11587 * snapshot rename behaves differently than rename(2) - if the
11588 * new name exists, EEXIST is returned.
11589 */
11590 vnode_put(tond->ni_vp);
11591 error = EEXIST;
11592 goto out2;
11593 }
11594
11595 error = VNOP_RENAME(snapdvp, fvp, &fromnd->ni_cnd, snapdvp, NULLVP,
11596 &tond->ni_cnd, ctx);
11597
11598 out2:
11599 nameidone(tond);
11600 out1:
11601 FREE(newname_buf, M_TEMP);
11602 vnode_put(fvp);
11603 vnode_put(snapdvp);
11604 vnode_put(rvp);
11605 nameidone(fromnd);
11606 out:
11607 FREE(__rename_data, M_TEMP);
11608 return (error);
11609 }
11610
11611 /*
11612 * Mount a Filesystem snapshot
11613 *
11614 * get the vnode for the unnamed snapshot directory and the snapshot and
11615 * mount the snapshot.
11616 */
11617 static int
11618 snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory,
11619 __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx)
11620 {
11621 vnode_t rvp, snapdvp, snapvp, vp, pvp;
11622 int error;
11623 struct nameidata *snapndp, *dirndp;
11624 /* carving out a chunk for structs that are too big to be on the stack. */
11625 struct {
11626 struct nameidata snapnd;
11627 struct nameidata dirnd;
11628 } * __snapshot_mount_data;
11629
11630 MALLOC(__snapshot_mount_data, void *, sizeof(*__snapshot_mount_data),
11631 M_TEMP, M_WAITOK);
11632 snapndp = &__snapshot_mount_data->snapnd;
11633 dirndp = &__snapshot_mount_data->dirnd;
11634
11635 error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, snapndp, LOOKUP,
11636 OP_LOOKUP, ctx);
11637 if (error)
11638 goto out;
11639
11640 snapvp = snapndp->ni_vp;
11641 if (!vnode_mount(rvp) || (vnode_mount(rvp) == dead_mountp)) {
11642 error = EIO;
11643 goto out1;
11644 }
11645
11646 /* Get the vnode to be covered */
11647 NDINIT(dirndp, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT,
11648 UIO_USERSPACE, directory, ctx);
11649 error = namei(dirndp);
11650 if (error)
11651 goto out1;
11652
11653 vp = dirndp->ni_vp;
11654 pvp = dirndp->ni_dvp;
11655
11656 if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_ROOTFS)) {
11657 error = EINVAL;
11658 } else {
11659 mount_t mp = vnode_mount(rvp);
11660 struct fs_snapshot_mount_args smnt_data;
11661
11662 smnt_data.sm_mp = mp;
11663 smnt_data.sm_cnp = &snapndp->ni_cnd;
11664 error = mount_common(mp->mnt_vfsstat.f_fstypename, pvp, vp,
11665 &dirndp->ni_cnd, CAST_USER_ADDR_T(&smnt_data), 0,
11666 KERNEL_MOUNT_SNAPSHOT, NULL, FALSE, ctx);
11667 }
11668
11669 vnode_put(vp);
11670 vnode_put(pvp);
11671 nameidone(dirndp);
11672 out1:
11673 vnode_put(snapvp);
11674 vnode_put(snapdvp);
11675 vnode_put(rvp);
11676 nameidone(snapndp);
11677 out:
11678 FREE(__snapshot_mount_data, M_TEMP);
11679 return (error);
11680 }
11681
11682 /*
11683 * Root from a snapshot of the filesystem
11684 *
11685 * Marks the filesystem to root from the given snapshot on next boot.
11686 */
11687 static int
11688 snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags,
11689 vfs_context_t ctx)
11690 {
11691 int error;
11692 vnode_t rvp;
11693 mount_t mp;
11694 struct fs_snapshot_root_args root_data;
11695 struct componentname cnp;
11696 caddr_t name_buf;
11697 size_t name_len;
11698
11699 error = vnode_getfromfd(ctx, dirfd, &rvp);
11700 if (error) {
11701 return (error);
11702 }
11703 mp = vnode_mount(rvp);
11704
11705 MALLOC(name_buf, caddr_t, MAXPATHLEN, M_TEMP, M_WAITOK);
11706 error = copyinstr(name, name_buf, MAXPATHLEN, &name_len);
11707 if (error) {
11708 FREE(name_buf, M_TEMP);
11709 vnode_put(rvp);
11710 return (error);
11711 }
11712
11713 // XXX MAC checks ?
11714
11715 /*
11716 * Grab mount_iterref so that we can release the vnode,
11717 * since VFSIOC_ROOT_SNAPSHOT could conceivably cause a sync.
11718 */
11719 error = mount_iterref (mp, 0);
11720 vnode_put(rvp);
11721 if (error) {
11722 FREE(name_buf, M_TEMP);
11723 return (error);
11724 }
11725
11726 memset(&cnp, 0, sizeof(cnp));
11727 cnp.cn_pnbuf = (char *)name_buf;
11728 cnp.cn_nameiop = LOOKUP;
11729 cnp.cn_flags = ISLASTCN | HASBUF;
11730 cnp.cn_pnlen = MAXPATHLEN;
11731 cnp.cn_nameptr = cnp.cn_pnbuf;
11732 cnp.cn_namelen = (int)name_len;
11733 root_data.sr_cnp = &cnp;
11734
11735 error = VFS_IOCTL(mp, VFSIOC_ROOT_SNAPSHOT, (caddr_t)&root_data, 0, ctx);
11736
11737 mount_iterdrop(mp);
11738 FREE(name_buf, M_TEMP);
11739
11740 return (error);
11741 }
11742
11743 /*
11744 * FS snapshot operations dispatcher
11745 */
11746 int
11747 fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
11748 __unused int32_t *retval)
11749 {
11750 int error;
11751 vfs_context_t ctx = vfs_context_current();
11752
11753 AUDIT_ARG(fd, uap->dirfd);
11754 AUDIT_ARG(value32, uap->op);
11755
11756 error = priv_check_cred(vfs_context_ucred(ctx), PRIV_VFS_SNAPSHOT, 0);
11757 if (error)
11758 return (error);
11759
11760 switch (uap->op) {
11761 case SNAPSHOT_OP_CREATE:
11762 error = snapshot_create(uap->dirfd, uap->name1, uap->flags, ctx);
11763 break;
11764 case SNAPSHOT_OP_DELETE:
11765 error = snapshot_delete(uap->dirfd, uap->name1, uap->flags, ctx);
11766 break;
11767 case SNAPSHOT_OP_RENAME:
11768 error = snapshot_rename(uap->dirfd, uap->name1, uap->name2,
11769 uap->flags, ctx);
11770 break;
11771 case SNAPSHOT_OP_MOUNT:
11772 error = snapshot_mount(uap->dirfd, uap->name1, uap->name2,
11773 uap->data, uap->flags, ctx);
11774 break;
11775 case SNAPSHOT_OP_REVERT:
11776 error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
11777 break;
11778 case SNAPSHOT_OP_ROOT:
11779 error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
11780 break;
11781 default:
11782 error = ENOSYS;
11783 }
11784
11785 return (error);
11786 }
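/*
 * Illustrative userspace sketch (not compiled here): driving the dispatcher
 * above through the fs_snapshot_create()/fs_snapshot_delete() wrappers that
 * later SDKs declare (the <sys/snapshot.h> header and exact signatures are
 * assumptions here).  dirfd must reference the volume root, since
 * vnode_isvroot() is checked above, and the caller needs the
 * PRIV_VFS_SNAPSHOT privilege, so this only works for suitably
 * privileged/entitled processes.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <sys/snapshot.h>	/* assumed location of the wrappers */

static int
make_and_drop_snapshot(const char *volume_root, const char *name)
{
	int dirfd = open(volume_root, O_RDONLY);
	if (dirfd < 0)
		return (-1);

	int error = fs_snapshot_create(dirfd, name, 0);
	if (error == 0)
		error = fs_snapshot_delete(dirfd, name, 0);

	close(dirfd);
	return (error);
}
#endif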